all tools updated and improved
parent
9cd58eb5c5
commit
76652e9364
|
@ -0,0 +1,65 @@
|
|||
# Karoo Dataset Builder
|
||||
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
|
||||
# version 0.9.1.2
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
|
||||
|
||||
'''
|
||||
In machine learning, it is often the case that your engaged dataset is derived from a larger parent. In constructing
|
||||
the subset, if we grab a series of datapoints (rows in a .csv) from the larger dataset in sequential order, only from
|
||||
the top, middle, or bottom, we will likely bias the new dataset and incorrectly train the machine learning algorithm.
|
||||
Therefore, it is imperative that we engage a random function, guided only by the number of data points for each class.
|
||||
|
||||
This script can be used before or after karoo_normalise.py but assumes no header has yet been applied to the .csv.
|
||||
'''
|
||||
|
||||
### USER INTERACTION ###
|
||||
# Exactly one command line argument, the path of the input .csv, is expected.
if len(sys.argv) == 1:
    print('\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m')
    sys.exit(1) # a non-zero status lets a calling shell or script detect the failure

elif len(sys.argv) > 2:
    print('\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m')
    sys.exit(1)

else:
    filename = sys.argv[1]
|
||||
|
||||
# Ask for the number of class labels until a valid answer is given.
# Valid answers are the whole numbers 0 through 100; an empty answer selects the default of 2.
# The answer is parsed with int() and range-checked numerically. The previous test, a substring search in the
# printed form of a list, accepted fragments such as ', ' or '1, 2' and only let '0' through by accident.
# NOTE: 0 (regression) is accepted as advertised by the prompt, but range(0) later selects no rows at all.
while True:
    try:
        answer = raw_input('\n\tEnter number of unique class labels, or 0 for a regression dataset (default 2): ')
        labels = int(answer) if answer.strip() else 2 # int() raises ValueError for non-numeric text
        if not 0 <= labels <= 100: raise ValueError()
        break

    except ValueError:
        print('\n\t\033[32mEnter a number from 0 including 100. Try again ...\033[0;0m')
|
||||
|
||||
# Ask for the number of datapoints per class until a valid answer is given.
# Valid answers are the whole numbers 10 through 10000; an empty answer selects the default of 100 and, as
# before, an answer of 0 is raised to the minimum of 10.
# The answer is parsed with int() and range-checked numerically. The previous substring test against the printed
# form of a list let through values below the minimum (e.g. '5') as well as fragments such as ', '.
while True:
    try:
        answer = raw_input('\n\tEnter number of desired datapoints per class (default 100): ')
        samples = int(answer) if answer.strip() else 100 # int() raises ValueError for non-numeric text
        if samples == 0: samples = 10
        if not 10 <= samples <= 10000: raise ValueError()
        break

    except ValueError:
        print('\n\t\033[32mEnter a number from 10 including 10000. Try again ...\033[0;0m')
|
||||
|
||||
|
||||
### LOAD THE ORIGINAL DATASET ###
|
||||
print('\n\t\033[36m\n\tLoading dataset: ' + filename + ' \033[0;0m\n')

# ndmin = 2 keeps the array two-dimensional even when the file holds a single row; without it loadtxt returns a
# 1-D array and data.shape[1] below raises IndexError.
data = np.loadtxt(filename, delimiter = ',', ndmin = 2) # load data
data_sort = np.empty(shape = [0, data.shape[1]]) # build an empty array of the proper dimensions
|
||||
|
||||
|
||||
### SORT DATA by LABEL ###
|
||||
# For each class label 0 .. labels - 1, draw 'samples' distinct rows at random and append them to data_sort.
for label in range(labels):
    label_rows = np.where(data[:,-1] == label)[0] # indices of all rows which end in the current label

    # np.random.choice(..., replace = False) raises ValueError when asked for more items than are available,
    # so fall back to every row of a class that is too small and tell the user about it.
    take = min(samples, len(label_rows))
    if take < samples:
        print('\n\t\033[31mWARNING! Only ' + str(take) + ' datapoints found for class ' + str(label) + ', fewer than the ' + str(samples) + ' requested.\033[0;0m')
    if take == 0: continue # nothing to select for this label

    data_select = np.random.choice(label_rows, take, replace = False) # select user defined 'samples' from list
    print(data_select)

    data_sort = np.append(data_sort, data[data_select], axis = 0)
|
||||
|
||||
|
||||
### SAVE THE SORTED DATASET ###
|
||||
import os # needed only here, to strip the extension from the input path

# os.path.splitext removes just the final extension. The previous filename.split('.')[0] cut at the FIRST dot,
# so a path such as '../files/data.csv' collapsed to an empty string and the output landed in the wrong place.
file_tmp = os.path.splitext(filename)[0]
np.savetxt(file_tmp + '-SORT.csv', data_sort, delimiter = ',')

print('\n\t\033[36mThe sorted dataset has been written to the file: ' + file_tmp + '-SORT.csv' + ' \033[0;0m')
|
||||
|
||||
|
|
@ -1,11 +1,16 @@
|
|||
# Karoo Feature Set Prep
|
||||
# Prepare a balanced feature set
|
||||
# Prepare a balanced dataset
|
||||
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
|
||||
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
|
||||
|
||||
filename = sys.argv[1] # 'data/pixel_classifier/kat7-20150924-SUBSET.csv'
|
||||
print '\n\t\033[36m You have opted to load the dataset:', filename, '\033[0;0m'
|
||||
|
||||
samples = 5000
|
||||
|
||||
# do NOT use readline as that is very, very slow
|
||||
|
|
|
@ -1,34 +1,39 @@
|
|||
# Karoo GP Iris Plot
|
||||
# Plot a function generated by Karoo GP against a scatter of the Iris data
|
||||
# Karoo Iris Plot
|
||||
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
|
||||
# version 0.9.1.2
|
||||
|
||||
# See https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ for a good plotting tutorial
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as mpl
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_setosa-vs-versicolor_3-col_PLOT.csv', delimiter=',', dtype = str)
|
||||
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_versicolor-vs-virginica_3-col_PLOT.csv', delimiter=',', dtype = str)
|
||||
data = np.loadtxt('../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv', delimiter=',', dtype = str)
|
||||
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
|
||||
|
||||
# http://stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm
|
||||
# to scale between 0 and 1: n - min(list) / (max(list) - min(list))
|
||||
'''
|
||||
This is a functional, even if not fully automated script designed to help you visualise your 2D or 3D data against a
|
||||
function generated by Karoo GP. The challenge comes with solving complex equations for a single variable such that
|
||||
you have a plot-able function. If the algebra required is beyond your skills (or you forgot what you learned in high
|
||||
school), tools such as Matlab may be of some assistance. If you desire to normalise your data in advance of using this
|
||||
script, the Karoo GP normalisation script included in the karoo_gp/tools/ directory is very easy to use.
|
||||
|
||||
### PLOT THE DATA ###
|
||||
def fx_normalize(array):
|
||||
By default, this script plots a Karoo GP derived function against a scatter plot of one of the Iris datasets
|
||||
included with this package: karoo_gp/files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv
|
||||
|
||||
norm = []
|
||||
array_min = np.min(array)
|
||||
array_max = np.max(array)
|
||||
|
||||
for col in range(1, len(array) + 1):
|
||||
n = float((array[col - 1] - array_min) / (array_max - array_min))
|
||||
norm = np.append(norm, n)
|
||||
|
||||
return norm
|
||||
If you are new to plotting, https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ
|
||||
provides a good, visual tutorial, as do many, many other web and video based guides.
|
||||
'''
|
||||
|
||||
### USER INTERACTION ###
|
||||
if len(sys.argv) == 1:
|
||||
filename = '../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv'
|
||||
print '\n\t\033[31mYou have not assigned an input file, therefore "IRIS_virginica-vs-setosa_3-col_PLOT" will be used.\033[0;0m'
|
||||
|
||||
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
|
||||
else: filename = sys.argv[1]
|
||||
|
||||
|
||||
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
|
||||
print '\n\t\033[36mLoading dataset:', filename, '\033[0;0m\n'
|
||||
data = np.loadtxt(filename, delimiter=',', dtype = str)
|
||||
data_a, data_b, data_c = [], [], []
|
||||
|
||||
tmp = data[:,0]
|
||||
|
@ -43,33 +48,24 @@ tmp = data[:,2]
|
|||
for n in range(len(tmp)):
|
||||
data_c.append(float(tmp[n]))
|
||||
|
||||
# normalise the data
|
||||
# data_a = fx_normalize(data_a)
|
||||
# data_b = fx_normalize(data_b)
|
||||
# data_c = fx_normalize(data_c)
|
||||
|
||||
fig = mpl.figure()
|
||||
ax = fig.add_subplot(111, projection = '3d')
|
||||
|
||||
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o')
|
||||
|
||||
|
||||
### PLOT THE FUNCTION ###
|
||||
b = np.arange(2, 4, 0.25)
|
||||
c = np.arange(2, 4, 0.25)
|
||||
### PREP THE FUNCTION ###
|
||||
b = np.arange(2, 4, 0.25) # plot from n to m in steps o
|
||||
c = np.arange(2, 4, 0.25) # plot from n to m in steps o
|
||||
b, c = np.meshgrid(b, c)
|
||||
|
||||
# -b*c + c**2 + c - 1 --> ?
|
||||
# -a/c - b**2 + c**2 --> ?
|
||||
# -a - b + c**2 --> a = -b + c**2
|
||||
# -b*c + c**2 + c - 1 # Karoo GP derived function
|
||||
# -a/c - b**2 + c**2 # Karoo GP derived function
|
||||
# -a - b + c**2 # Karoo GP derived function becomes a = -b + c**2
|
||||
a = -b + c**2
|
||||
|
||||
# normalise the function
|
||||
# a = fx_normalize(a)
|
||||
# b = fx_normalize(b)
|
||||
# c = fx_normalize(c)
|
||||
|
||||
ax.plot_wireframe(a,b,c)
|
||||
### PLOT THE FUNCTION and DATA###
|
||||
fig = mpl.figure()
|
||||
|
||||
ax = fig.add_subplot(111, projection = '3d')
|
||||
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o') # 3D data
|
||||
ax.plot_wireframe(a,b,c) # 3D function
|
||||
|
||||
ax.set_xlabel('a')
|
||||
ax.set_ylabel('b')
|
||||
|
@ -77,3 +73,4 @@ ax.set_zlabel('c')
|
|||
|
||||
mpl.show()
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,17 @@
|
|||
# Karoo Multiclass Classifier Test
|
||||
# Play with quantity of class labels against a range of results
|
||||
# by Kai Staats, MSc UCT / AIMS
|
||||
# version 0.9.1.2
|
||||
|
||||
'''
|
||||
This is a toy script, designed to allow you to play with multiclass classification using the same underlying function
|
||||
as employed by Karoo GP.
|
||||
'''
|
||||
|
||||
from numpy import arange
|
||||
|
||||
while True:
|
||||
try:
|
||||
class_type = raw_input('\t Select (i)finite or (f)inite class bins (default i): ')
|
||||
class_type = raw_input('\t Select (i)nfinite or (f)inite wing bins (default i): ')
|
||||
if class_type not in ('i','f',''): raise ValueError()
|
||||
class_type = class_type or 'i'; break
|
||||
except ValueError: print '\033[32mSelect from the options given. Try again ...\n\033[0;0m'
|
||||
|
@ -21,45 +26,41 @@ while True:
|
|||
except ValueError: print '\033[32m Enter a number from 3 including 100. Try again ...\n\033[0;0m'
|
||||
|
||||
skew = (class_labels / 2) - 1
|
||||
min_val = 0 - skew - 1
|
||||
if class_labels & 1: max_val = 0 + skew + 3
|
||||
else: max_val = 0 + skew + 2
|
||||
min_val = 0 - skew - 1 # add a data point to the left
|
||||
if class_labels & 1: max_val = 0 + skew + 3 # add a data point to the right if odd number of class labels
|
||||
else: max_val = 0 + skew + 2 # add a data point to the right if even number of class labels
|
||||
|
||||
print '\n\t class_labels =', range(class_labels)
|
||||
print '\t solutions = [', min_val, '...', max_val - .5,']'
|
||||
print '\t skew =', skew, '\n'
|
||||
|
||||
# a simple binary classifier, for comparison
|
||||
# if result <= 0 and label == 0: fitness = 1
|
||||
# elif result > 0 and label == 1: fitness = 1
|
||||
# else: fitness = 0
|
||||
|
||||
if class_type == 'i':
|
||||
for result in arange(min_val, max_val, .5):
|
||||
for solution in arange(min_val, max_val, 0.5):
|
||||
for label in range(class_labels):
|
||||
|
||||
if label == 0 and result <= 0 - skew: # check for the first class
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m<= boundary', 0 - skew, '\033[0;0m'
|
||||
if label == 0 and solution <= 0 - skew: # check for the first class
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m<=', 0 - skew, '\033[0;0m'
|
||||
|
||||
elif label == class_labels - 1 and result > label - 1 - skew: # check for the last class
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m> boundary', label - skew, '\033[0;0m'
|
||||
elif label == class_labels - 1 and solution > label - 1 - skew: # check for the last class
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m>', label - 1 - skew, '\033[0;0m'
|
||||
|
||||
elif (label - 1) - skew < result <= label - skew: # check for class bins between first and last
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
|
||||
elif label - 1 - skew < solution <= label - skew: # check for class bins between first and last
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
|
||||
|
||||
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
|
||||
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
|
||||
|
||||
print ''
|
||||
# print ''
|
||||
|
||||
|
||||
if class_type == 'f':
|
||||
for result in arange(min_val, max_val, .5):
|
||||
for solution in arange(min_val, max_val, .5):
|
||||
for label in range(class_labels):
|
||||
|
||||
if (label - 1) - skew < result <= label - skew: # check for discrete, finite class bins
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
|
||||
if label - 1 - skew < solution <= label - skew: # check for discrete, finite class bins
|
||||
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
|
||||
|
||||
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
|
||||
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
|
||||
|
||||
print ''
|
||||
# print ''
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
# Karoo Data Normalisation
|
||||
# by Kai Staats, MSc UCT
|
||||
# version 0.9.1.2
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
|
||||
|
||||
'''
|
||||
This script works with a raw dataset to prepare a new, normalised dataset. It does so by comparing all values in each
|
||||
given column, finding the maximum and minimum values, and then modifying each value to fall between a high of 1 and
|
||||
low of 0. The modified values are written to a new file, the original remaining untouched.
|
||||
|
||||
This script can be used before or after karoo_features_sort.py but assumes no header has yet been applied to the .csv.
|
||||
'''
|
||||
|
||||
def normalise(array, decimals = None):

    '''
    Scale a one-dimensional collection of numbers linearly onto the range 0 to 1.

    Each value v becomes (v - min) / (max - min), rounded to 'decimals' places. The formula was derived from
    stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm

    Arguments required: array (a list or 1-D numpy array of numbers)
    Arguments optional: decimals (number of decimal places to keep); when omitted, the module level value 'fp'
    collected from the user is applied or, if that has not been set, 4.

    Returns a 1-D numpy array of floats, the same length as the input. An empty input returns an empty array, and
    an input whose values are all identical returns all zeros (there is no range to scale against).
    '''

    if decimals is None:
        decimals = globals().get('fp', 4) # stay compatible with callers which rely on the global 'fp'

    values = np.asarray(array, dtype = float)
    if values.size == 0: return np.array([])

    array_min = np.min(values)
    array_range = np.max(values) - array_min
    if array_range == 0: return np.zeros(len(values)) # avoid 0 / 0 on a constant column

    scaled = (values - array_min) / array_range # one vectorised pass in place of a per-element np.append

    return np.array([round(float(value), decimals) for value in scaled])
|
||||
|
||||
|
||||
### USER INTERACTION ###
|
||||
# Exactly one command line argument, the path of the input .csv, is expected.
if len(sys.argv) == 1:
    print('\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m')
    sys.exit(1) # a non-zero status lets a calling shell or script detect the failure

elif len(sys.argv) > 2:
    print('\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m')
    sys.exit(1)

else:
    filename = sys.argv[1]
|
||||
|
||||
# Ask for the number of decimal places until a valid answer is given.
# Valid answers are the whole numbers 1 through 8; an empty answer selects the default of 4 and an answer of 0
# is raised to the minimum of 1.
# The answer is parsed with int() and range-checked numerically. The previous substring test against the printed
# form of a list accepted fragments such as ', ' and could never reach its own branch for '0'.
while True:
    try:
        answer = raw_input('\n\tEnter number of floating points desired in normalised data (default 4): ')
        fp = int(answer) if answer.strip() else 4 # int() raises ValueError for non-numeric text
        if fp == 0: fp = 1
        if not 1 <= fp <= 8: raise ValueError()
        break

    except ValueError:
        print('\n\t\033[32mEnter a number from 1 including 8. Try again ...\033[0;0m')
|
||||
|
||||
|
||||
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
|
||||
print('\n\t\033[36mLoading dataset: ' + filename + ' \033[0;0m\n')

# ndmin = 2 keeps the array two-dimensional even when the file holds a single row; without it loadtxt returns a
# 1-D array and data.shape[1] below raises IndexError.
data = np.loadtxt(filename, delimiter = ',', ndmin = 2) # load data
data_norm = np.zeros(shape = (data.shape[0], data.shape[1])) # build an empty dataset which matches the shape of the original
|
||||
|
||||
|
||||
### NORMALISE THE DATA ###
|
||||
# Normalise every column except the last, which holds the labels and is copied across unchanged.
for col in range(data.shape[1] - 1):
    print('\tnormalising column: ' + str(col))

    # slice the column directly; rebuilding it row by row with np.append copied the array on every step
    data_norm[:,col] = normalise(data[:,col]) # add each normalised column of data

data_norm[:,-1] = data[:,-1] # add the labels again
|
||||
|
||||
|
||||
### SAVE THE NORMALISED DATA ###
|
||||
import os # needed only here, to strip the extension from the input path

# os.path.splitext removes just the final extension. The previous filename.split('.')[0] cut at the FIRST dot,
# so a path such as '../files/data.csv' collapsed to an empty string and the output landed in the wrong place.
file_tmp = os.path.splitext(filename)[0]
np.savetxt(file_tmp + '-NORM.csv', data_norm, delimiter = ',')

# the message previously read 'normlised'
print('\n\t\033[36mThe normalised dataset has been written to the file: ' + file_tmp + '-NORM.csv' + ' \033[0;0m')
|
||||
|
||||
|
Loading…
Reference in New Issue