all tools updated and improved

pull/4/head
Kai Staats 2016-07-07 23:01:28 -06:00
parent 9cd58eb5c5
commit 76652e9364
5 changed files with 212 additions and 67 deletions

View File

@ -0,0 +1,65 @@
# Karoo Dataset Builder
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
# version 0.9.1.2
import sys
import numpy as np
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
'''
In machine learning, it is often the case that your engaged dataset is derived from a larger parent. In constructing
the subset, if we grab a series of datapoints (rows in a .csv) from the larger dataset in sequential order, only from
the top, middle, or bottom, we will likely bias the new dataset and incorrectly train the machine learning algorithm.
Therefore, it is imperative that we engage a random function, guided only by the number of data points for each class.
This script can be used before or after karoo_normalise.py but assumes no header has yet been applied to the .csv.
'''
### USER INTERACTION ###
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
n = range(1,101)
while True:
try:
labels = raw_input('\n\tEnter number of unique class labels, or 0 for a regression dataset (default 2): ')
if labels not in str(n) and labels not in '': raise ValueError()
# if labels == '0': labels = 1; break
labels = labels or 2; labels = int(labels); break
except ValueError: print '\n\t\033[32mEnter a number from 0 including 100. Try again ...\033[0;0m'
n = range(10,10001)
while True:
try:
samples = raw_input('\n\tEnter number of desired datapoints per class (default 100): ')
if samples not in str(n) and samples not in '': raise ValueError()
if samples == '0': samples = 10; break
samples = samples or 100; samples = int(samples); break
except ValueError: print '\n\t\033[32mEnter a number from 10 including 10000. Try again ...\033[0;0m'
### LOAD THE ORIGINAL DATASET ###
print '\n\t\033[36m\n\tLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter = ',') # load data
data_sort = np.empty(shape = [0, data.shape[1]]) # build an empty array of the proper dimensions
### SORT DATA by LABEL ###
for label in range(labels):
data_list = np.where(data[:,-1] == label) # build a list of all rows which end in the current label
data_select = np.random.choice(data_list[0], samples, replace = False) # select user defined 'samples' from list
print data_select
data_sort = np.append(data_sort, data[data_select], axis = 0)
### SAVE THE SORTED DATASET ###
file_tmp = filename.split('.')[0]
np.savetxt(file_tmp + '-SORT.csv', data_sort, delimiter = ',')
print '\n\t\033[36mThe sorted dataset has been written to the file:', file_tmp + '-SORT.csv', '\033[0;0m'

View File

@ -1,11 +1,16 @@
# Karoo Feature Set Prep
# NOTE(review): this span is a rendered diff of a longer file; the two "Prepare a balanced ..."
# lines below are the before/after versions of one header comment, not two live lines.
# Prepare a balanced feature set
# Prepare a balanced dataset
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
import sys
import numpy as np
# guard the command line: exactly one argument, the input .csv filename, is accepted
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
filename = sys.argv[1] # 'data/pixel_classifier/kat7-20150924-SUBSET.csv'
print '\n\t\033[36m You have opted to load the dataset:', filename, '\033[0;0m'
samples = 5000 # presumably the number of datapoints to retain per class -- confirm against the remainder of the file
# do NOT use readline as that is very, very slow

View File

@ -1,34 +1,39 @@
# Karoo GP Iris Plot
# Plot a function generated by Karoo GP against a scatter of the Iris data
# Karoo Iris Plot
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
# version 0.9.1.2
# See https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ for a good plotting tutorial
import sys
import numpy as np
import matplotlib.pyplot as mpl
from mpl_toolkits.mplot3d import Axes3D
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_setosa-vs-versicolor_3-col_PLOT.csv', delimiter=',', dtype = str)
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_versicolor-vs-virginica_3-col_PLOT.csv', delimiter=',', dtype = str)
data = np.loadtxt('../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv', delimiter=',', dtype = str)
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
# http://stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm
# to scale between 0 and 1: n - min(list) / (max(list) - min(list))
'''
This is a functional, even if not fully automated, script designed to help you visualise your 2D or 3D data against a
function generated by Karoo GP. The challenge comes with solving complex equations for a single variable such that
you have a plot-able function. If the algebra required is beyond your skills (or you forgot what you learned in high
school), tools such as Matlab may be of some assistance. If you desire to normalise your data in advance of using this
script, the Karoo GP normalisation script included in the karoo_gp/tools/ directory is very easy to use.
### PLOT THE DATA ###
def fx_normalize(array):
By default, this script plots a Karoo GP derived function against a scatter plot of one of the Iris datasets
included with this package: karoo_gp/files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv
norm = []
array_min = np.min(array)
array_max = np.max(array)
for col in range(1, len(array) + 1):
n = float((array[col - 1] - array_min) / (array_max - array_min))
norm = np.append(norm, n)
return norm
If you are new to plotting, https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ provides a good, visual
tutorial, as do many, many other web and video based guides.
'''
### USER INTERACTION ###
if len(sys.argv) == 1:
filename = '../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv'
print '\n\t\033[31mYou have not assigned an input file, therefore "IRIS_virginica-vs-setosa_3-col_PLOT" will be used.\033[0;0m'
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
print '\n\t\033[36mLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter=',', dtype = str)
data_a, data_b, data_c = [], [], []
tmp = data[:,0]
@ -43,33 +48,24 @@ tmp = data[:,2]
for n in range(len(tmp)):
data_c.append(float(tmp[n]))
# normalise the data
# data_a = fx_normalize(data_a)
# data_b = fx_normalize(data_b)
# data_c = fx_normalize(data_c)
fig = mpl.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o')
### PLOT THE FUNCTION ###
b = np.arange(2, 4, 0.25)
c = np.arange(2, 4, 0.25)
### PREP THE FUNCTION ###
b = np.arange(2, 4, 0.25) # plot from n to m in steps o
c = np.arange(2, 4, 0.25) # plot from n to m in steps o
b, c = np.meshgrid(b, c)
# -b*c + c**2 + c - 1 --> ?
# -a/c - b**2 + c**2 --> ?
# -a - b + c**2 --> a = -b + c**2
# -b*c + c**2 + c - 1 # Karoo GP derived function
# -a/c - b**2 + c**2 # Karoo GP derived function
# -a - b + c**2 # Karoo GP derived function becomes a = -b + c**2
a = -b + c**2
# normalise the function
# a = fx_normalize(a)
# b = fx_normalize(b)
# c = fx_normalize(c)
ax.plot_wireframe(a,b,c)
### PLOT THE FUNCTION and DATA###
fig = mpl.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o') # 3D data
ax.plot_wireframe(a,b,c) # 3D function
ax.set_xlabel('a')
ax.set_ylabel('b')
@ -77,3 +73,4 @@ ax.set_zlabel('c')
mpl.show()

View File

@ -1,12 +1,17 @@
# Karoo Multiclass Classifier Test
# Play with quantity of class labels against a range of results
# by Kai Staats, MSc UCT / AIMS
# version 0.9.1.2
'''
This is a toy script, designed to allow you to play with multiclass classification using the same underlying function
as employed by Karoo GP.
'''
from numpy import arange
while True:
try:
class_type = raw_input('\t Select (i)finite or (f)inite class bins (default i): ')
class_type = raw_input('\t Select (i)nfinite or (f)inite wing bins (default i): ')
if class_type not in ('i','f',''): raise ValueError()
class_type = class_type or 'i'; break
except ValueError: print '\033[32mSelect from the options given. Try again ...\n\033[0;0m'
@ -21,45 +26,41 @@ while True:
except ValueError: print '\033[32m Enter a number from 3 including 100. Try again ...\n\033[0;0m'
skew = (class_labels / 2) - 1
min_val = 0 - skew - 1
if class_labels & 1: max_val = 0 + skew + 3
else: max_val = 0 + skew + 2
min_val = 0 - skew - 1 # add a data point to the left
if class_labels & 1: max_val = 0 + skew + 3 # add a data point to the right if odd number of class labels
else: max_val = 0 + skew + 2 # add a data point to the right if even number of class labels
print '\n\t class_labels =', range(class_labels)
print '\t solutions = [', min_val, '...', max_val - .5,']'
print '\t skew =', skew, '\n'
# a simple binary classifier, for comparison
# if result <= 0 and label == 0: fitness = 1
# elif result > 0 and label == 1: fitness = 1
# else: fitness = 0
if class_type == 'i':
for result in arange(min_val, max_val, .5):
for solution in arange(min_val, max_val, 0.5):
for label in range(class_labels):
if label == 0 and result <= 0 - skew: # check for the first class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m<= boundary', 0 - skew, '\033[0;0m'
if label == 0 and solution <= 0 - skew: # check for the first class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m<=', 0 - skew, '\033[0;0m'
elif label == class_labels - 1 and result > label - 1 - skew: # check for the last class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m> boundary', label - skew, '\033[0;0m'
elif label == class_labels - 1 and solution > label - 1 - skew: # check for the last class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m>', label - 1 - skew, '\033[0;0m'
elif (label - 1) - skew < result <= label - skew: # check for class bins between first and last
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
elif label - 1 - skew < solution <= label - skew: # check for class bins between first and last
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
print ''
# print ''
if class_type == 'f':
for result in arange(min_val, max_val, .5):
for solution in arange(min_val, max_val, .5):
for label in range(class_labels):
if (label - 1) - skew < result <= label - skew: # check for discrete, finite class bins
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
if label - 1 - skew < solution <= label - skew: # check for discrete, finite class bins
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
print ''
# print ''

View File

@ -0,0 +1,77 @@
# Karoo Data Normalisation
# by Kai Staats, MSc UCT
# version 0.9.1.2
import sys
import numpy as np
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
'''
This script works with a raw dataset to prepare a new, normalised dataset. It does so by comparing all values in each
given column, finding the maximum and minimum values, and then modifying each value to fall between a high of 1 and
low of 0. The modified values are written to a new file, the original remaining untouched.
This script can be used before or after karoo_features_sort.py but assumes no header has yet been applied to the .csv.
'''
def normalise(array, decimals = None):
    '''
    Scale the values of a 1D array to the range [0, 1], rounded to 'decimals' decimal places.

    The formula was derived from stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm

    array: a sequence (list or 1D numpy array) of numeric values
    decimals: number of decimal places to retain; defaults to the user-selected global 'fp'
    returns: a 1D numpy float array of normalised values

    A constant column (max == min) is returned as all zeros rather than dividing by zero,
    which the original element-by-element loop did not guard against. The per-element
    np.append loop is replaced by a single vectorized expression.
    '''
    if decimals is None: decimals = fp # fall back to the precision chosen at the prompt
    array = np.asarray(array, dtype = float)
    array_min = array.min()
    array_max = array.max()
    if array_max == array_min: return np.zeros(array.shape) # constant column: avoid divide-by-zero
    return np.round((array - array_min) / (array_max - array_min), decimals)
### USER INTERACTION ###
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
n = range(1,9)
while True:
try:
fp = raw_input('\n\tEnter number of floating points desired in normalised data (default 4): ')
if fp not in str(n) and fp not in '': raise ValueError()
if fp == '0': fp = 1; break
fp = fp or 4; fp = int(fp); break
except ValueError: print '\n\t\033[32mEnter a number from 1 including 8. Try again ...\033[0;0m'
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
print '\n\t\033[36mLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter = ',') # load data
data_norm = np.zeros(shape = (data.shape[0], data.shape[1])) # build an empty dataset which matches the shape of the original
### NORMALISE THE DATA ###
for col in range(data.shape[1] - 1):
print '\tnormalising column:', col
colsum = []
for row in range(data.shape[0]):
colsum = np.append(colsum, data[row,col])
data_norm[:,col] = normalise(colsum) # add each normalised column of data
data_norm[:,data.shape[1] - 1] = data[:,data.shape[1] - 1] # add the labels again
### SAVE THE NORMALISED DATA ###
file_tmp = filename.split('.')[0]
np.savetxt(file_tmp + '-NORM.csv', data_norm, delimiter = ',')
print '\n\t\033[36mThe normlised dataset has been written to the file:', file_tmp + '-NORM.csv', '\033[0;0m'