read the RELEASE NOTES please

pull/4/head
Kai Staats 2016-09-14 19:41:13 -06:00
parent 5fd41e6fea
commit 59dc61d703
7 changed files with 162 additions and 55 deletions

View File

@ -1,3 +1,35 @@
2016 09/14 - version 0.9.2.0
In karoo_gp_base_class.py
- Merged 2 instances of 'algo_sym.subs(data)' into a single, new method 'fx_eval_subs'
- Removed redundant lines in the method 'fx_karoo_data_load'
- Added support for the Sympy 'lambdify' function in 'fx_karoo_data_load' (see explanation below)
- Added a draft means of catching divide-by-zero errors in the new 'lambdify' function
- Discovered the prior 'fx_eval_subs' incorrectly applied a value of 1 to the variable 'result' as a means to
replace the 'zoo' (complex infinity) result for divide-by-zero errors. However, this could inadvertently undermine
the success of Classification and Regression runs. My apologies for not catching this sooner.
"While attending the CHEAPR 2016 workshop hosted by the Center for Cosmology and Astro-Particle Physics, The Ohio State
University, Erik Hemberg of MIT suggested that I could improve the performance by combining what were to Sympy.subs
calls into one. This was successfully completed and the new method 'fx_eval_subs' was created.
Michael Zevin of Northwestern University proposed that Karoo GP *should* be able to process trees far faster than what
we were seeing. I looked into the Sympy functions I was using at the time. Indeed, '.subs' is noted as easy to use,
but terribly slow as it relies upon an internal, Python mathematical library. I therefore replaced '.subs' with
'.lambdify', which calls upon the C-based Numpy maths library. It is reported to be up to 500x faster than '.subs',
but I am seeing only a 2x performance increase. Clearly, there are yet other barriers to remove.
In the new 'fx_eval_subs' method you will find both sympy.subs (commented out) and sympy.lambdify (active). While
preliminary tests worked well, I witnessed an erratic outcome which I have yet to reproduce and investigate. Feel
free to comment the .lambdify and uncomment the .subs sections and take it for a spin.
I believe there are 2 more steps to increase performance: removing the dictionaries which contain each row, such that
Karoo works directly with the Numpy array again, and then processing the array as a vector instead. But this will
require substantial recoding.
I'll keep you informed ..." --kai
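
For those curious, a minimal, self-contained sketch (illustrative symbols, not Karoo GP code) contrasting the two
evaluation paths and the 'zoo' result mentioned above:

import numpy as np
import sympy as sp

a, b, c = sp.symbols('a b c')
expr = sp.sympify('a + b/c')                  # a flattened tree, as a Sympy expression

print(expr.subs({a: 1.0, b: 2.0, c: 4.0}))    # slow per-row substitution: 1.50000000000000
print(expr.subs({a: 1.0, b: 2.0, c: 0}))      # divide by zero yields 'zoo' (complex infinity)

f = sp.lambdify((a, b, c), expr, 'numpy')     # compile once to a Numpy-backed function
with np.errstate(divide='ignore'):            # suppress Numpy's divide-by-zero warning
    print(f(1.0, 2.0, 4.0))                   # fast path: 1.5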
2016 08/08 - version 0.9.1.9
In karoo_gp_base_class.py

View File

@ -2,7 +2,7 @@
# Define the methods and global variables used by Karoo GP
# by Kai Staats, MSc UCT / AIMS; see LICENSE.md
# Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions
# version 0.9.1.9
# version 0.9.2.0
'''
A NOTE TO THE NEWBIE, EXPERT, AND BRAVE
@ -56,9 +56,9 @@ class Base_GP(object):
'''
All Karoo GP global variables are named with the prefix 'gp.' All Karoo GP methods are named with the prefix
'gp.fx_'. The 13 variables which begin with 'gp.pop_' are used specifically to define the 13 parameters for
each GP as stored in the axis-1 (expanding horizontally on-screen) 'gp.population' Numpy array.
each tree as stored in the axis-1 (expanding horizontally) 'gp.population' Numpy array.
### Variables defined by the user in karoo_gp_main.py (in order of appearence) ###
### Global and local variables defined by the user in karoo_gp_main.py (in order of appearance) ###
'gp.kernel' fitness function
'gp.class_method' select the number of classes (will be automated in future version)
'tree_type' Full, Grow, or Ramped 50/50 (local variable)
@ -78,21 +78,26 @@ class Base_GP(object):
'gp.cores' user defined or default to 1; can be set to auto-detect number of cores instead
'gp.precision' the number of floating points for the round function in 'fx_fitness_eval'
### Variables initiated elsewhere, as used for data management ###
### Global variables used for data management ###
'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below)
'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below)
'data_train_dict' temporary dictionary which stores the data row-by-row (local variable)
'gp.data_train_dict_array' array of dictionaries which stores the TRAINING data, through all generations
'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below)
'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below)
'data_test_dict' temporary dictionary which stores the data row-by-row (local variable)
'gp.data_test_dict_array' array of dictionaries which stores the TEST data for the very end
'gp.functions' loaded from the associated [functions].csv
'gp.terminals' the top row of the associated [data].csv
'gp.functions' user defined functions (operators) from the associated files/[functions].csv
'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv
'gp.coeff' user defined coefficients (constants)
'gp.fitness_type' fitness type
### Variables initiated elsewhere, as used for evolutionary management ###
### Global variables initiated and/or used by Sympy ###
'gp.algo_raw' a Sympy string which represents a flattened tree
'gp.algo_sym' a Sympy executable version of algo_raw
'gp.algo_ops' a Sympy list of available operators
### Variables used for evolutionary management ###
'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction
'gp.population_b' the generation constructed from gp.population_a (recycled)
'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions
@ -187,7 +192,7 @@ class Base_GP(object):
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ****** ****** ****** **'
print '\033[0;0m'
print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 0.9.1.8b\033[0;0m'
print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 0.9.2.0\033[0;0m'
return
@ -201,47 +206,38 @@ class Base_GP(object):
10 rows will not be split, but rather copied in full to both TRAINING and TEST, as it is assumed you are conducting
a system validation run, as with the built-in MATCH kernel and associated dataset.
Arguments required: none
Arguments required: tree_type, tree_depth_base, filename (of the dataset)
'''
### 1) load the data file associated with the user selected fitness kernel ###
### 1) load the associated data set, operators, operands, fitness type, and coefficients ###
data_dict = {'b':'files/data_BOOL.csv', 'c':'files/data_CLASSIFY.csv', 'r':'files/data_REGRESS.csv', 'm':'files/data_MATCH.csv', 'p':'files/data_PLAY.csv'}
func_dict = {'b':'files/functions_BOOL.csv', 'c':'files/functions_CLASSIFY.csv', 'r':'files/functions_REGRESS.csv', 'm':'files/functions_MATCH.csv', 'p':'files/functions_PLAY.csv'}
fitt_dict = {'b':'max', 'c':'max', 'r':'min', 'm':'max', 'p':''}
if len(sys.argv) == 1: # load data from the default karoo_gp/files/ directory
data_x = np.loadtxt(data_dict[self.kernel], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(data_dict[self.kernel], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
self.class_labels = len(np.unique(data_y))
header = open(data_dict[self.kernel],'r')
self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the variables across the top of the .csv
self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators)
self.fitness_type = fitt_dict[self.kernel]
elif len(sys.argv) == 2: # load an external data file
data_x = np.loadtxt(sys.argv[1], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(sys.argv[1], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
self.class_labels = len(np.unique(data_y))
header = open(sys.argv[1],'r')
self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the variables across the top of the .csv
self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators)
self.fitness_type = fitt_dict[self.kernel]
elif len(sys.argv) > 2: # receive filename and additional flags from karoo_gp_server.py via argparse
data_x = np.loadtxt(filename, skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(filename, skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
self.class_labels = len(np.unique(data_y))
header = open(filename,'r')
self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the variables across the top of the .csv
self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators)
self.fitness_type = fitt_dict[self.kernel]
fitt_dict = {'b':'max', 'c':'max', 'r':'min', 'm':'max', 'p':''}
self.fitness_type = fitt_dict[self.kernel] # load fitness type
func_dict = {'b':'files/functions_BOOL.csv', 'c':'files/functions_CLASSIFY.csv', 'r':'files/functions_REGRESS.csv', 'm':'files/functions_MATCH.csv', 'p':'files/functions_PLAY.csv'}
self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators)
self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the user defined terminals (operands)
self.algo_ops = sp.symbols(self.terminals) # convert a string of terminals to sympy executables - tested 2016 08/29
self.class_labels = len(np.unique(data_y)) # load the user defined labels for classification or solutions for regression
self.coeff = np.loadtxt('files/coefficients.csv', delimiter=',', skiprows=1, dtype = str) # load the user defined coefficients (constants)
### 2) from the dataset, generate TRAINING and TEST data ###
@ -269,6 +265,8 @@ class Base_GP(object):
data_train_dict = {}
self.data_train_dict_array = np.array([])
# potential place to insert 'coeff' for a static variable 'v': 2016 08/22
for row in range(0, self.data_train_rows): # increment through each row of data
for col in range(0, self.data_train_cols): # increment through each column
data_train_dict.update( {self.terminals[col]:data_train[row,col]} ) # to be unpacked in 'fx_fitness_eval'
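
For orientation, a hedged sketch (illustrative values only) of the structure this loop builds:

# Illustrative only: after the loop, each training row becomes one dictionary
# keyed by terminal name, collected into 'gp.data_train_dict_array', e.g.:
#   data_train_dict_array[0] == {'a': 1.0, 'b': 2.0, 'c': 4.0, 's': 1.5}
# 'fx_eval_subs' then evaluates each tree's expression against one such
# dictionary at a time; the release notes discuss removing this layer.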
@ -1185,7 +1183,7 @@ class Base_GP(object):
'''
self.algo_raw = self.fx_eval_label(tree, 1) # pass the root 'node_id', then flatten the Tree to a string
self.algo_sym = sp.sympify(self.algo_raw) # string converted to a functional expression (the coolest line in the script! :)
self.algo_sym = sp.sympify(self.algo_raw) # convert the string to a functional expression (the coolest line in Karoo! :)
return
@ -1196,15 +1194,24 @@ class Base_GP(object):
Process the sympified expression against the current data row.
Arguments required: data (typically a single row from the associated [data].csv)
'''
subs = self.algo_sym.subs(data) # process the expression against the data
if str(subs) == 'zoo': result = 1 # TEST & DEBUG: print 'divide by zero', result; self.fx_karoo_pause(0)
else: result = round(float(subs), self.precision) # force 'result' to the set number of floating points
### OLD .subs method ###
#result = self.algo_sym.subs(data) # process the expression against the data
#if str(result) == 'zoo': result = 1 # TEST & DEBUG: print 'divide by zero', result; self.fx_karoo_pause(0)
#else: result = round(float(result), self.precision) # force 'result' to the set number of floating points
### NEW .lambdify method ###
f = sp.lambdify(self.algo_ops, self.algo_sym, "numpy") # define the function
with np.errstate(divide = 'ignore'): # do not raise 'divide by zero' errors
result = f(*sp.flatten(data.values())) # execute the function against the given data row, which currently remains a dictionary
# if str(subs) == 'inf' or str(subs) == '-inf': print subs; self.fx_karoo_pause(0) # TEST & DEBUG catch divide by zero
result = round(float(result), self.precision) # force 'result' to the set number of floating points
return result
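
Looking ahead to the vectorised processing proposed in the release notes, a hedged sketch (not part of this commit)
of lambdify consuming whole Numpy columns at once:

import numpy as np
import sympy as sp

a, b = sp.symbols('a b')
f = sp.lambdify((a, b), sp.sympify('a**2 + b'), 'numpy')
data = np.array([[1., 2.], [3., 4.], [5., 6.]])   # three rows of (a, b)
print(f(data[:, 0], data[:, 1]))                  # one call per tree, not per row: [  3.  13.  31.]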
def fx_eval_label(self, tree, node_id):
'''
@ -1303,7 +1310,8 @@ class Base_GP(object):
'''
Display a Tree branch on-screen.
This method displays all sequential node_ids from 'start' node through bottom, within the given branch.
This method displays all sequential node_ids from the 'start' node through the bottom of the given branch. This
method is not used by Karoo GP at this time.
Arguments required: tree, start
'''
@ -1341,10 +1349,9 @@ class Base_GP(object):
def fx_eval_generation(self):
'''
Karoo GP evaluates each subsequent generation of Trees. This process flattens each GP Tree into a standard
equation by means of a recursive algorithm and subsequent processing by the Sympy library. Sympy simultaneously
evaluates the Tree for its results, returns null for divide by zero, reorganises and then rewrites the
expression in its simplest form.
Karoo GP evaluates each generation of Trees. This process flattens each GP Tree into a standard equation by
means of a recursive algorithm and subsequent processing by Sympy. Sympy simultaneously evaluates the Tree for
its results, reorganises and then rewrites the expression in its simplest form.
Arguments required: none
'''
@ -1396,7 +1403,7 @@ class Base_GP(object):
for tree_id in range(1, len(population)):
### PART 1 - EXTRACT EXPRESSION FROM EACH TREE ###
### PART 1 - EXTRACT EXPRESSION FROM TREE ###
self.fx_eval_poly(population[tree_id]) # extract the expression
if self.display not in ('s','t'): print '\t\033[36mTree', population[tree_id][0][1], 'yields (sym):\033[1m', self.algo_sym, '\033[0;0m'
@ -1484,10 +1491,10 @@ class Base_GP(object):
# to the original variables listed across the top of each column of data.csv. Therefore, we must re-assign
# the respective values for each subsequent row in the data .csv, for each Tree's unique expression.
result = self.fx_eval_subs(self.data_train_dict_array[row]) # process the expression against the training data
result = self.fx_eval_subs(self.data_train_dict_array[row]) # process the expression against the training data - tested 2016 07
solution = round(float(self.data_train_dict_array[row]['s']), self.precision) # force 'solution' to the set number of floating points
# if str(self.algo_sym) == 'a + b/c': # TEST & DEBUG: a temp fishing net to catch a specific result
# if str(self.algo_sym) == 'a + b/c': # TEST & DEBUG: a fishing net to catch a specific result
# print 'algo_sym', self.algo_sym
# print 'result', result, 'solution', solution
# self.fx_karoo_pause(0)
@ -1575,11 +1582,11 @@ class Base_GP(object):
skew = (self.class_labels / 2) - 1 # '-1' keeps a binary classification splitting over the origin
# skew = 0 # for code testing
if solution == 0 and result <= 0 - skew: # check for first class
if solution == 0 and result <= 0 - skew: # check for first class (the left-most bin)
if self.display == 'i': print '\t\033[36m data row', row, 'yields class label:\033[1m', int(solution), 'as', result, '<=', int(0 - skew), '\033[0;0m'
fitness = 1
elif solution == self.class_labels - 1 and result > solution - 1 - skew: # check for last class
elif solution == self.class_labels - 1 and result > solution - 1 - skew: # check for last class (the right-most bin)
if self.display == 'i': print '\t\033[36m data row', row, 'yields class label:\033[1m', int(solution), 'as', result, '>', int(solution - 1 - skew), '\033[0;0m'
fitness = 1
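
A worked illustration of the skew (hypothetical values; Python 2 integer division, as in the line above):

# class_labels = 3  ->  skew = (3 / 2) - 1 = 0 in Python 2
# bins over the tree's raw output 'result':
#   first class (0):  result <= 0 - skew, i.e. result <= 0
#   last class (2):   result > (class_labels - 1) - 1 - skew, i.e. result > 1
#   the middle class(es) fall between these boundaries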
@ -2485,11 +2492,11 @@ class Base_GP(object):
label_pred = '' # sets the label_pred to a known state (see 'if label_pred ==' below)
label_true = int(self.data_test_dict_array[row]['s'])
if result <= 0 - skew: # test for the first class
if result <= 0 - skew: # test for the first class (the left-most bin)
label_pred = 0
print '\t\033[36m data row', row, 'predicts class:\033[1m', label_pred, '(', label_true, 'label) as', result, '<=', 0 - skew, '\033[0;0m'
elif result > (self.class_labels - 2) - skew: # test for last class (the right-most bin
elif result > (self.class_labels - 2) - skew: # test for last class (the right-most bin)
label_pred = self.class_labels - 1
print '\t\033[36m data row', row, 'predicts class:\033[1m', label_pred, '(', label_true, 'label) as', result, '>', (self.class_labels - 2) - skew, '\033[0;0m'

View File

@ -2,7 +2,7 @@
# Use Genetic Programming for Classification and Symbolic Regression
# by Kai Staats, MSc UCT / AIMS; see LICENSE.md
# Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions
# version 0.9.1.9
# version 0.9.2.0
'''
A word to the newbie, expert, and brave--

View File

@ -2,7 +2,7 @@
# Use Genetic Programming for Classification and Symbolic Regression
# by Kai Staats, MSc UCT / AIMS; see LICENSE.md
# Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions
# version 0.9.1.9
# version 0.9.2.0
'''
A word to the newbie, expert, and brave--

View File

@ -1,6 +1,6 @@
# Karoo Multiclass Classifier Test
# by Kai Staats, MSc UCT / AIMS
# version 0.9.1.9
# version 0.9.2.0
'''
This is a toy script, designed to allow you to play with multiclass classification using the same underlying function

View File

@ -1,6 +1,6 @@
# Karoo Data Normalisation
# by Kai Staats, MSc UCT
# version 0.9.1.9
# version 0.9.2.0
import sys
import numpy as np
@ -27,7 +27,8 @@ def normalise(array):
array_max = np.max(array)
for col in range(1, len(array) + 1):
norm = float((array[col - 1] - array_min) / (array_max - array_min))
# norm = float((array[col - 1] - array_min) / (array_max - array_min))
norm = float(array[col - 1] - array_min) / float(array_max - array_min)
norm = round(norm, fp) # force to 4 decimal points
array_norm = np.append(array_norm, norm)
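
A quick illustration of the arithmetic bug the corrected line fixes (under Python 2, which this codebase targets,
and with integer inputs):

print(float((7 - 0) / (10 - 0)))        # 0.0 -- integer division truncates before the cast
print(float(7 - 0) / float(10 - 0))     # 0.7 -- cast each operand first, then divide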

View File

@ -0,0 +1,67 @@
# Karoo Dataset Builder
# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD
# version 0.9.2.0
import sys
import numpy as np
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
'''
In machine learning, it is often the case that your working dataset is derived from a larger parent. In constructing
the subset, if we grab a series of datapoints (rows in a .csv) from the larger dataset in sequential order, only from
the top, middle, or bottom, we will likely bias the new dataset and incorrectly train the machine learning algorithm.
Therefore, it is imperative that we select datapoints at random, guided only by the number desired for each class.
This script can be used *before* karoo_normalise.py, and assumes no header has yet been applied to the .csv.
'''
### USER INTERACTION ###
if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit()
elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. Try again ...\033[0;0m'; sys.exit()
else: filename = sys.argv[1]
#n = range(1,101)
#while True:
# try:
# labels = raw_input('\n\tEnter number of unique class labels, or 0 for a regression dataset (default 2): ')
# if labels not in str(n) and labels not in '': raise ValueError()
# # if labels == '0': labels = 1; break
# labels = labels or 2; labels = int(labels); break
# except ValueError: print '\n\t\033[32mEnter a number from 0 including 100. Try again ...\033[0;0m'
n = range(10,10001)
while True:
try:
samples = raw_input('\n\tEnter number of desired datapoints per class (default 100): ')
if samples not in str(n) and samples not in '': raise ValueError()
if samples == '0': samples = 10; break
samples = samples or 100; samples = int(samples); break
except ValueError: print '\n\t\033[32mEnter a number from 10 through 10000. Try again ...\033[0;0m'
### LOAD THE ORIGINAL DATASET ###
print '\n\t\033[36m\n\tLoading dataset:', filename, '\033[0;0m\n'
data = np.loadtxt(filename, delimiter = ',') # load data
data_sort = np.empty(shape = [0, data.shape[1]]) # build an empty array of the proper dimensions
### SORT DATA by LABEL ###
labels = len(np.unique(data[:,-1]))
for label in range(labels):
data_list = np.where(data[:,-1] == label) # build a list of all rows which end in the current label
data_select = np.random.choice(data_list[0], samples, replace = False) # select user defined 'samples' from list
print data_select
data_sort = np.append(data_sort, data[data_select], axis = 0)
### SAVE THE SORTED DATASET ###
file_tmp = filename.split('.')[0]
np.savetxt(file_tmp + '-SORT.csv', data_sort, delimiter = ',')
print '\n\t\033[36mThe sorted dataset has been written to the file:', file_tmp + '-SORT.csv', '\033[0;0m'
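
Usage, assuming the script is saved as karoo_dataset_builder.py (the filename is not shown in this diff) and given a
hypothetical data_RAW.csv:

$ python karoo_dataset_builder.py data_RAW.csv

You will be prompted for the number of datapoints per class; the balanced, randomly sampled subset is then written
to data_RAW-SORT.csv.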