all tools updated and improved

pull/4/head
Kai Staats 2016-07-07 23:01:28 -06:00
parent 9cd58eb5c5
commit 76652e9364
5 changed files with 212 additions and 67 deletions

View File

@ -0,0 +1,65 @@
# Karoo Dataset Builder
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
# version 0.9.1.2
import sys
import numpy as np
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
'''
In machine learning, it is often the case that your engaged dataset is derived from a larger parent. In constructing
the subset, if we grab a series of datapoints (rows in a .csv) from the larger dataset in sequential order, only from
the top, middle, or bottom, we will likely bias the new dataset and incorrectly train the machine learning algorithm.
Therefore, it is imperative that we engage a random function, guided only by the number of data points for each class.
This script can be used before or after karoo_normalise.py but assumes no header has yet been applied to the .csv.
'''
### USER INTERACTION ###
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
n = range(1,101)
while True:
try:
labels = raw_input('\n\tEnter number of unique class labels, or 0 for a regression dataset (default 2): ')
if labels not in str(n) and labels not in '': raise ValueError()
# if labels == '0': labels = 1; break
labels = labels or 2; labels = int(labels); break
except ValueError: print '\n\t\033[32mEnter a number from 0 including 100. Try again ...\033[0;0m'
n = range(10,10001)
while True:
try:
samples = raw_input('\n\tEnter number of desired datapoints per class (default 100): ')
if samples not in str(n) and samples not in '': raise ValueError()
if samples == '0': samples = 10; break
samples = samples or 100; samples = int(samples); break
except ValueError: print '\n\t\033[32mEnter a number from 10 including 10000. Try again ...\033[0;0m'
### LOAD THE ORIGINAL DATASET ###
print '\n\t\033[36m\n\tLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter = ',') # load data
data_sort = np.empty(shape = [0, data.shape[1]]) # build an empty array of the proper dimensions
### SORT DATA by LABEL ###
for label in range(labels):
data_list = np.where(data[:,-1] == label) # build a list of all rows which end in the current label
data_select = np.random.choice(data_list[0], samples, replace = False) # select user defined 'samples' from list
print data_select
data_sort = np.append(data_sort, data[data_select], axis = 0)
### SAVE THE SORTED DATASET ###
file_tmp = filename.split('.')[0]
np.savetxt(file_tmp + '-SORT.csv', data_sort, delimiter = ',')
print '\n\t\033[36mThe sorted dataset has been written to the file:', file_tmp + '-SORT.csv', '\033[0;0m'

View File

@ -1,11 +1,16 @@
# Karoo Feature Set Prep
# NOTE(review): this span is a rendered diff of a longer file; the two "Prepare a balanced ..."
# lines below are the before/after versions of one header comment, not two live lines.
# Prepare a balanced feature set
# Prepare a balanced dataset
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
import sys
import numpy as np
# guard the command line: exactly one argument, the input .csv filename, is accepted
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
filename = sys.argv[1] # 'data/pixel_classifier/kat7-20150924-SUBSET.csv'
print '\n\t\033[36m You have opted to load the dataset:', filename, '\033[0;0m'
samples = 5000 # presumably the number of datapoints to retain per class -- confirm against the remainder of the file
# do NOT use readline as that is very, very slow

View File

@ -1,34 +1,39 @@
# Karoo GP Iris Plot
# Plot a function generated by Karoo GP against a scatter of the Iris data
# Karoo Iris Plot
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
# version 0.9.1.2
# See https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ for a good plotting tutorial
import sys
import numpy as np
import matplotlib.pyplot as mpl
from mpl_toolkits.mplot3d import Axes3D
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_setosa-vs-versicolor_3-col_PLOT.csv', delimiter=',', dtype = str)
# data = np.loadtxt('../files/Iris_dataset/data_IRIS_versicolor-vs-virginica_3-col_PLOT.csv', delimiter=',', dtype = str)
data = np.loadtxt('../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv', delimiter=',', dtype = str)
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
# http://stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm
# to scale between 0 and 1: n - min(list) / (max(list) - min(list))
'''
This is a functional, even if not fully automated, script designed to help you visualise your 2D or 3D data against a
function generated by Karoo GP. The challenge comes with solving complex equations for a single variable such that
you have a plot-able function. If the algebra required is beyond your skills (or you forgot what you learned in high
school), tools such as Matlab may be of some assistance. If you desire to normalise your data in advance of using this
script, the Karoo GP normalisation script included in the karoo_gp/tools/ directory is very easy to use.
### PLOT THE DATA ###
def fx_normalize(array):
By default, this script plots a Karoo GP derived function against a scatter plot of one of the Iris datasets
included with this package: karoo_gp/files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv
norm = []
array_min = np.min(array)
array_max = np.max(array)
for col in range(1, len(array) + 1):
n = float((array[col - 1] - array_min) / (array_max - array_min))
norm = np.append(norm, n)
return norm
If you are new to plotting, https://www.youtube.com/channel/UCfzlCWGWYyIQ0aLC5w48gBQ provides a good, visual
tutorial, as do many, many other web and video based guides.
'''
### USER INTERACTION ###
if len(sys.argv) == 1:
filename = '../files/Iris_dataset/data_IRIS_virginica-vs-setosa_3-col_PLOT.csv'
print '\n\t\033[31mYou have not assigned an input file, therefore "IRIS_virginica-vs-setosa_3-col_PLOT" will be used.\033[0;0m'
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
print '\n\t\033[36mLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter=',', dtype = str)
data_a, data_b, data_c = [], [], []
tmp = data[:,0]
@ -43,33 +48,24 @@ tmp = data[:,2]
for n in range(len(tmp)):
data_c.append(float(tmp[n]))
# normalise the data
# data_a = fx_normalize(data_a)
# data_b = fx_normalize(data_b)
# data_c = fx_normalize(data_c)
fig = mpl.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o')
### PLOT THE FUNCTION ###
b = np.arange(2, 4, 0.25)
c = np.arange(2, 4, 0.25)
### PREP THE FUNCTION ###
b = np.arange(2, 4, 0.25) # plot from n to m in steps o
c = np.arange(2, 4, 0.25) # plot from n to m in steps o
b, c = np.meshgrid(b, c)
# -b*c + c**2 + c - 1 --> ?
# -a/c - b**2 + c**2 --> ?
# -a - b + c**2 --> a = -b + c**2
# -b*c + c**2 + c - 1 # Karoo GP derived function
# -a/c - b**2 + c**2 # Karoo GP derived function
# -a - b + c**2 # Karoo GP derived function becomes a = -b + c**2
a = -b + c**2
# normalise the function
# a = fx_normalize(a)
# b = fx_normalize(b)
# c = fx_normalize(c)
ax.plot_wireframe(a,b,c)
### PLOT THE FUNCTION and DATA###
fig = mpl.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(data_a, data_b, data_c, c = 'r', marker = 'o') # 3D data
ax.plot_wireframe(a,b,c) # 3D function
ax.set_xlabel('a')
ax.set_ylabel('b')
@ -77,3 +73,4 @@ ax.set_zlabel('c')
mpl.show()

View File

@ -1,12 +1,17 @@
# Karoo Multiclass Classifier Test
# Play with quantity of class labels against a range of results
# by Kai Staats, MSc UCT / AIMS
# version 0.9.1.2
'''
This is a toy script, designed to allow you to play with multiclass classification using the same underlying function
as employed by Karoo GP.
'''
from numpy import arange
while True:
try:
class_type = raw_input('\t Select (i)finite or (f)inite class bins (default i): ')
class_type = raw_input('\t Select (i)nfinite or (f)inite wing bins (default i): ')
if class_type not in ('i','f',''): raise ValueError()
class_type = class_type or 'i'; break
except ValueError: print '\033[32mSelect from the options given. Try again ...\n\033[0;0m'
@ -21,45 +26,41 @@ while True:
except ValueError: print '\033[32m Enter a number from 3 including 100. Try again ...\n\033[0;0m'
skew = (class_labels / 2) - 1
min_val = 0 - skew - 1
if class_labels & 1: max_val = 0 + skew + 3
else: max_val = 0 + skew + 2
min_val = 0 - skew - 1 # add a data point to the left
if class_labels & 1: max_val = 0 + skew + 3 # add a data point to the right if odd number of class labels
else: max_val = 0 + skew + 2 # add a data point to the right if even number of class labels
print '\n\t class_labels =', range(class_labels)
print '\t solutions = [', min_val, '...', max_val - .5,']'
print '\t skew =', skew, '\n'
# a simple binary classifier, for comparison
# if result <= 0 and label == 0: fitness = 1
# elif result > 0 and label == 1: fitness = 1
# else: fitness = 0
if class_type == 'i':
for result in arange(min_val, max_val, .5):
for solution in arange(min_val, max_val, 0.5):
for label in range(class_labels):
if label == 0 and result <= 0 - skew: # check for the first class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m<= boundary', 0 - skew, '\033[0;0m'
if label == 0 and solution <= 0 - skew: # check for the first class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m<=', 0 - skew, '\033[0;0m'
elif label == class_labels - 1 and result > label - 1 - skew: # check for the last class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', result, '\033[0;0m\033[36m> boundary', label - skew, '\033[0;0m'
elif label == class_labels - 1 and solution > label - 1 - skew: # check for the last class
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas\033[1m', solution, '\033[0;0m\033[36m>', label - 1 - skew, '\033[0;0m'
elif (label - 1) - skew < result <= label - skew: # check for class bins between first and last
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
elif label - 1 - skew < solution <= label - skew: # check for class bins between first and last
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
print ''
# print ''
if class_type == 'f':
for result in arange(min_val, max_val, .5):
for solution in arange(min_val, max_val, .5):
for label in range(class_labels):
if (label - 1) - skew < result <= label - skew: # check for discrete, finite class bins
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas boundary', (label - 1) - skew, '<\033[1m', result, '\033[0;0m\033[36m<=', 'boundary', label - skew, '\033[0;0m'
if label - 1 - skew < solution <= label - skew: # check for discrete, finite class bins
fitness = 1; print '\t\033[36m\033[1m class', label, '\033[0;0m\033[36mas', label - 1 - skew, '<\033[1m', solution, '\033[0;0m\033[36m<=', label - skew, '\033[0;0m'
else: fitness = 0; print '\t\033[36m no match for', result, 'in class', label, '\033[0;0m' # no class match
else: fitness = 0 #; print '\t\033[36m no match for', solution, 'in class', label, '\033[0;0m' # no class match
print ''
# print ''

View File

@ -0,0 +1,77 @@
# Karoo Data Normalisation
# by Kai Staats, MSc UCT
# version 0.9.1.2
import sys
import numpy as np
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
'''
This script works with a raw dataset to prepare a new, normalised dataset. It does so by comparing all values in each
given column, finding the maximum and minimum values, and then modifying each value to fall between a high of 1 and
low of 0. The modified values are written to a new file, the original remaining untouched.
This script can be used before or after karoo_features_sort.py but assumes no header has yet been applied to the .csv.
'''
def normalise(array, decimals = None):
    '''
    Scale the values of a 1D array to the range [0, 1], rounded to 'decimals' decimal places.

    The formula was derived from stn.spotfire.com/spotfire_client_help/norm/norm_normalizing_columns.htm

    array: a sequence (list or 1D numpy array) of numeric values
    decimals: number of decimal places to retain; defaults to the user-selected global 'fp'
    returns: a 1D numpy float array of normalised values

    A constant column (max == min) is returned as all zeros rather than dividing by zero,
    which the original element-by-element loop did not guard against. The per-element
    np.append loop is replaced by a single vectorized expression.
    '''
    if decimals is None: decimals = fp # fall back to the precision chosen at the prompt
    array = np.asarray(array, dtype = float)
    array_min = array.min()
    array_max = array.max()
    if array_max == array_min: return np.zeros(array.shape) # constant column: avoid divide-by-zero
    return np.round((array - array_min) / (array_max - array_min), decimals)
### USER INTERACTION ###
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
n = range(1,9)
while True:
try:
fp = raw_input('\n\tEnter number of floating points desired in normalised data (default 4): ')
if fp not in str(n) and fp not in '': raise ValueError()
if fp == '0': fp = 1; break
fp = fp or 4; fp = int(fp); break
except ValueError: print '\n\t\033[32mEnter a number from 1 including 8. Try again ...\033[0;0m'
### LOAD THE DATA and PREPARE AN EMPTY ARRAY ###
print '\n\t\033[36mLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter = ',') # load data
data_norm = np.zeros(shape = (data.shape[0], data.shape[1])) # build an empty dataset which matches the shape of the original
### NORMALISE THE DATA ###
for col in range(data.shape[1] - 1):
print '\tnormalising column:', col
colsum = []
for row in range(data.shape[0]):
colsum = np.append(colsum, data[row,col])
data_norm[:,col] = normalise(colsum) # add each normalised column of data
data_norm[:,data.shape[1] - 1] = data[:,data.shape[1] - 1] # add the labels again
### SAVE THE NORMALISED DATA ###
file_tmp = filename.split('.')[0]
np.savetxt(file_tmp + '-NORM.csv', data_norm, delimiter = ',')
print '\n\t\033[36mThe normlised dataset has been written to the file:', file_tmp + '-NORM.csv', '\033[0;0m'