diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 3497c36..7e180ba 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,3 +1,35 @@ +2016 09/14 - version 0.9.2.0 + +In karoo_gp_base_class.py + - Merged 2 instances of 'algo_sym.subs(data)' into a single, new method 'fx_eval_subs' + - Removed redundant lines in the method 'fx_karoo_data_load' + - Added support for the Sympy 'lambdify' function in 'fx_karoo_data_load' (see explanation below) + - Added a draft means of catching divide-by-zero errors in the new 'lambdify' function + - Discovered the prior 'fx_eval_subs' incorrectly applied a value of 1 to the variable 'result' as a means to + replace the 'zoo' function for divide by zero errors. However, this could inadvertently undermine the success of + Classification and Regression runs. My apology for not catching this sooner. + +"While attending the CHEAPR 2016 workshop hosted by the Center for Cosmology and Astro-Particle Physics, The Ohio State +University, Erik Hemberg of MIT suggested that I could improve the performance by combining what were two Sympy.subs +calls into one. This was successfully completed and the new method 'fx_eval_subs' was created. + +Michael Zevin of Northwestern University proposed that Karoo GP *should* be able to process trees far faster than what +we were seeing. I looked into the Sympy functions I was at that time using. Indeed, '.subs' is noted as easy to use, +but terribly slow as it relies upon an internal, Python mathematical library. I therefore replaced '.subs' with +'.lambdify' which calls upon the C-based Numpy maths library. It is slated to be 500x faster than '.subs', but I am +seeing only a 2x performance increase. Clearly, there are yet other barriers to remove. + +In the new 'fx_eval_subs' method you will find both sympy.subs (active) and sympy.lambdify. While preliminary tests +worked well, I witnessed an erratic outcome which I yet need to reproduce and investigate. 
Feel free to comment the +.subs and uncomment the .lambdify sections and take it for a spin. + +I believe there are 2 more steps to increase performance: removing the dictionaries which contain each row, such that +Karoo is working directly with the Numpy array again, and then processing the array as a vector instead. But this will +require substantial recoding. + +I'll keep you informed ..." --kai + + 2016 08/08 - version 0.9.1.9 In karoo_gp_base_class.py diff --git a/karoo_gp_base_class.py b/karoo_gp_base_class.py index 2e28779..c0682e7 100644 --- a/karoo_gp_base_class.py +++ b/karoo_gp_base_class.py @@ -2,7 +2,7 @@ # Define the methods and global variables used by Karoo GP # by Kai Staats, MSc UCT / AIMS; see LICENSE.md # Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions -# version 0.9.1.9 +# version 0.9.2.0 ''' A NOTE TO THE NEWBIE, EXPERT, AND BRAVE @@ -56,9 +56,9 @@ class Base_GP(object): ''' All Karoo GP global variables are named with the prefix 'gp.' All Karoo GP methods are named with the prefix 'gp.fx_'. The 13 variables which begin with 'gp.pop_' are used specifically to define the 13 parameters for - each GP as stored in the axis-1 (expanding horizontally on-screen) 'gp.population' Numpy array. + each tree as stored in the axis-1 (expand horizontally) 'gp.population' Numpy array. 
- ### Variables defined by the user in karoo_gp_main.py (in order of appearence) ### + ### Global and local variables defined by the user in karoo_gp_main.py (in order of appearence) ### 'gp.kernel' fitness function 'gp.class_method' select the number of classes (will be automated in future version) 'tree_type' Full, Grow, or Ramped 50/50 (local variable) @@ -78,21 +78,26 @@ class Base_GP(object): 'gp.cores' user defined or default to 1; can be set to auto-detect number of cores instead 'gp.precision' the number of floating points for the round function in 'fx_fitness_eval' - ### Variables initiated elsewhere, as used for data management ### + ### Global variables used for data management ### 'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below) 'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below) - 'data_train_dict' temporary dictionary which stores the data row-by-row (local variable) 'gp.data_train_dict_array' array of dictionaries which stores the TRAINING data, through all generations 'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below) 'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below) - 'data_test_dict' temporary dictionary which stores the data row-by-row (local variable) 'gp.data_test_dict_array' array of dictionaries which stores the TEST data for the very end - 'gp.functions' loaded from the associated [functions].csv - 'gp.terminals' the top row of the associated [data].csv + 'gp.functions' user defined functions (operators) from the associated files/[functions].csv + 'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv + 'gp.coeff' user defined coefficients (constants) + 'gp.fitness_type' fitness type - ### Variables initiated elsewhere, as used for evolutionary management ### + ### Global variables initiated and/or used by Sympy ### + 'gp.algo_raw' a Sympy string which 
represents a flattened tree + 'gp.algo_sym' a Sympy executable version of algo_raw + 'gp.algo_ops' a Sympy list of available operators + + ### Variables used for evolutionary management ### 'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction 'gp.population_b' the generation constructed from gp.population_a (recyled) 'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions @@ -187,7 +192,7 @@ class Base_GP(object): print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' print '\t ** ** ** ** ** ** ****** ****** ****** **' print '\033[0;0m' - print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 0.9.1.8b\033[0;0m' + print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 0.9.2.0\033[0;0m' return @@ -201,47 +206,38 @@ class Base_GP(object): 10 rows will not be split, rather copied in full to both TRAINING and TEST as it is assumed you are conducting a system validation run, as with the built-in MATCH kernel and associated dataset. 
- Arguments required: none + Arguments required: tree_type, tree_depth_base, filename (of the dataset) ''' - ### 1) load the data file associated with the user selected fitness kernel ### + ### 1) load the associated data set, operators, operands, fitness type, and coefficients ### data_dict = {'b':'files/data_BOOL.csv', 'c':'files/data_CLASSIFY.csv', 'r':'files/data_REGRESS.csv', 'm':'files/data_MATCH.csv', 'p':'files/data_PLAY.csv'} - func_dict = {'b':'files/functions_BOOL.csv', 'c':'files/functions_CLASSIFY.csv', 'r':'files/functions_REGRESS.csv', 'm':'files/functions_MATCH.csv', 'p':'files/functions_PLAY.csv'} - fitt_dict = {'b':'max', 'c':'max', 'r':'min', 'm':'max', 'p':''} if len(sys.argv) == 1: # load data from the default karoo_gp/files/ directory data_x = np.loadtxt(data_dict[self.kernel], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(data_dict[self.kernel], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - self.class_labels = len(np.unique(data_y)) - header = open(data_dict[self.kernel],'r') - self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the variables across the top of the .csv - - self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) - self.fitness_type = fitt_dict[self.kernel] elif len(sys.argv) == 2: # load an external data file data_x = np.loadtxt(sys.argv[1], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(sys.argv[1], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - self.class_labels = len(np.unique(data_y)) - header = open(sys.argv[1],'r') - self.terminals = header.readline().split(','); self.terminals[-1] = 
self.terminals[-1].replace('\n','') # load the variables across the top of the .csv - - self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) - self.fitness_type = fitt_dict[self.kernel] elif len(sys.argv) > 2: # receive filename and additional flags from karoo_gp_server.py via argparse - data_x = np.loadtxt(filename, skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(filename, skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - self.class_labels = len(np.unique(data_y)) - header = open(filename,'r') - self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the variables across the top of the .csv - self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) - self.fitness_type = fitt_dict[self.kernel] + fitt_dict = {'b':'max', 'c':'max', 'r':'min', 'm':'max', 'p':''} + self.fitness_type = fitt_dict[self.kernel] # load fitness type + + func_dict = {'b':'files/functions_BOOL.csv', 'c':'files/functions_CLASSIFY.csv', 'r':'files/functions_REGRESS.csv', 'm':'files/functions_MATCH.csv', 'p':'files/functions_PLAY.csv'} + self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) + + self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the user defined terminals (operands) + self.algo_ops = sp.symbols(self.terminals) # convert a string of terminals to sympy executables - tested 2016 08/29 + + self.class_labels = len(np.unique(data_y)) # load the user defined labels for classification or solutions for regression + self.coeff = np.loadtxt('files/coefficients.csv', delimiter=',', skiprows=1, 
dtype = str) # load the user defined coefficients (constants) ### 2) from the dataset, generate TRAINING and TEST data ### @@ -269,6 +265,8 @@ class Base_GP(object): data_train_dict = {} self.data_train_dict_array = np.array([]) + # potential place to insert 'coeff' for a static variable 'v': 2016 08/22 + for row in range(0, self.data_train_rows): # increment through each row of data for col in range(0, self.data_train_cols): # increment through each column data_train_dict.update( {self.terminals[col]:data_train[row,col]} ) # to be unpacked in 'fx_fitness_eval' @@ -1185,7 +1183,7 @@ class Base_GP(object): ''' self.algo_raw = self.fx_eval_label(tree, 1) # pass the root 'node_id', then flatten the Tree to a string - self.algo_sym = sp.sympify(self.algo_raw) # string converted to a functional expression (the coolest line in the script! :) + self.algo_sym = sp.sympify(self.algo_raw) # converted string to a functional expression (the coolest line in Karoo! :) return @@ -1196,15 +1194,24 @@ class Base_GP(object): Process the sympified expression against the current data row. 
Arguments required: data (typically a single row from the associated [data].csv) + ''' - subs = self.algo_sym.subs(data) # process the expression against the data - if str(subs) == 'zoo': result = 1 # TEST & DEBUG: print 'divide by zero', result; self.fx_karoo_pause(0) - else: result = round(float(subs), self.precision) # force 'result' to the set number of floating points - + ### OLD .subs method ### + #result = self.algo_sym.subs(data) # process the expression against the data + #if str(result) == 'zoo': result = 1 # TEST & DEBUG: print 'divide by zero', result; self.fx_karoo_pause(0) + #else: result = round(float(result), self.precision) # force 'result' to the set number of floating points + + ### NEW .lambdify method ### + f = sp.lambdify(self.algo_ops, self.algo_sym, "numpy") # define the function + with np.errstate(divide = 'ignore'): # do not raise 'divide by zero' errors + result = f(*sp.flatten(data.values())) # execute the function against the given data row; which currently remains a dictionary + # if str(subs) == 'inf' or str(subs) == '-inf': print subs; self.fx_karoo_pause(0) # TEST & DEBUG catch divide by zero + result = round(float(result), self.precision) # force 'result' to the set number of floating points + return result - + def fx_eval_label(self, tree, node_id): ''' @@ -1303,7 +1310,8 @@ class Base_GP(object): ''' Display a Tree branch on-screen. - This method displays all sequential node_ids from 'start' node through bottom, within the given branch. + This method displays all sequential node_ids from 'start' node through bottom, within the given branch. This + is not used by Karoo GP at this time. Arguments required: tree, start ''' @@ -1341,10 +1349,9 @@ class Base_GP(object): def fx_eval_generation(self): ''' - Karoo GP evaluates each subsequent generation of Trees. This process flattens each GP Tree into a standard - equation by means of a recursive algorithm and subsequent processing by the Sympy library. 
Sympy simultaneously - evaluates the Tree for its results, returns null for divide by zero, reorganises and then rewrites the - expression in its simplest form. + Karoo GP evaluates each generation of Trees. This process flattens each GP Tree into a standard equation by + means of a recursive algorithm and subsequent processing by Sympy. Sympy simultaneously evaluates the Tree for + its results, reorganises and then rewrites the expression in its simplest form. Arguments required: none ''' @@ -1396,7 +1403,7 @@ class Base_GP(object): for tree_id in range(1, len(population)): - ### PART 1 - EXTRACT EXPRESSION FROM EACH TREE ### + ### PART 1 - EXTRACT EXPRESSION FROM TREE ### self.fx_eval_poly(population[tree_id]) # extract the expression if self.display not in ('s','t'): print '\t\033[36mTree', population[tree_id][0][1], 'yields (sym):\033[1m', self.algo_sym, '\033[0;0m' @@ -1484,10 +1491,10 @@ class Base_GP(object): # to the original variables listed across the top of each column of data.csv. Therefore, we must re-assign # the respective values for each subsequent row in the data .csv, for each Tree's unique expression. 
- result = self.fx_eval_subs(self.data_train_dict_array[row]) # process the expression against the training data + result = self.fx_eval_subs(self.data_train_dict_array[row]) # process the expression against the training data - tested 2016 07 solution = round(float(self.data_train_dict_array[row]['s']), self.precision) # force 'solution' to the set number of floating points - # if str(self.algo_sym) == 'a + b/c': # TEST & DEBUG: a temp fishing net to catch a specific result + # if str(self.algo_sym) == 'a + b/c': # TEST & DEBUG: a fishing net to catch a specific result # print 'algo_sym', self.algo_sym # print 'result', result, 'solution', solution # self.fx_karoo_pause(0) @@ -1575,11 +1582,11 @@ class Base_GP(object): skew = (self.class_labels / 2) - 1 # '-1' keeps a binary classification splitting over the origin # skew = 0 # for code testing - if solution == 0 and result <= 0 - skew: # check for first class + if solution == 0 and result <= 0 - skew: # check for first class (the left-most bin) if self.display == 'i': print '\t\033[36m data row', row, 'yields class label:\033[1m', int(solution), 'as', result, '<=', int(0 - skew), '\033[0;0m' fitness = 1 - elif solution == self.class_labels - 1 and result > solution - 1 - skew: # check for last class + elif solution == self.class_labels - 1 and result > solution - 1 - skew: # check for last class (the right-most bin) if self.display == 'i': print '\t\033[36m data row', row, 'yields class label:\033[1m', int(solution), 'as', result, '>', int(solution - 1 - skew), '\033[0;0m' fitness = 1 @@ -2485,11 +2492,11 @@ class Base_GP(object): label_pred = '' # sets the label_pred to a known state (see 'if label_pred ==' below) label_true = int(self.data_test_dict_array[row]['s']) - if result <= 0 - skew: # test for the first class + if result <= 0 - skew: # test for the first class (the left-most bin) label_pred = 0 print '\t\033[36m data row', row, 'predicts class:\033[1m', label_pred, '(', label_true, 'label) as', result, 
'<=', 0 - skew, '\033[0;0m' - elif result > (self.class_labels - 2) - skew: # test for last class (the right-most bin + elif result > (self.class_labels - 2) - skew: # test for last class (the right-most bin) label_pred = self.class_labels - 1 print '\t\033[36m data row', row, 'predicts class:\033[1m', label_pred, '(', label_true, 'label) as', result, '>', (self.class_labels - 2) - skew, '\033[0;0m' diff --git a/karoo_gp_main.py b/karoo_gp_main.py index 6dc3366..5e7b75f 100644 --- a/karoo_gp_main.py +++ b/karoo_gp_main.py @@ -2,7 +2,7 @@ # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc UCT / AIMS; see LICENSE.md # Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions -# version 0.9.1.9 +# version 0.9.2.0 ''' A word to the newbie, expert, and brave-- diff --git a/karoo_gp_server.py b/karoo_gp_server.py index f18b96f..c07a745 100644 --- a/karoo_gp_server.py +++ b/karoo_gp_server.py @@ -2,7 +2,7 @@ # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc UCT / AIMS; see LICENSE.md # Much thanks to Emmanuel Dufourq and Arun Kumar for their support, guidance, and free psychotherapy sessions -# version 0.9.1.9 +# version 0.9.2.0 ''' A word to the newbie, expert, and brave-- diff --git a/tools/karoo_multiclassifier.py b/tools/karoo_multiclassifier.py index 8eba90b..cd1d34d 100644 --- a/tools/karoo_multiclassifier.py +++ b/tools/karoo_multiclassifier.py @@ -1,6 +1,6 @@ # Karoo Multiclass Classifer Test # by Kai Staats, MSc UCT / AIMS -# version 0.9.1.9 +# version 0.9.2.0 ''' This is a toy script, designed to allow you to play with multiclass classification using the same underlying function diff --git a/tools/karoo_normalise.py b/tools/karoo_normalise.py index fe17038..856386b 100644 --- a/tools/karoo_normalise.py +++ b/tools/karoo_normalise.py @@ -1,6 +1,6 @@ # Karoo Data Normalisation # by Kai Staats, MSc UCT -# version 0.9.1.9 +# version 
0.9.2.0 import sys import numpy as np @@ -27,7 +27,8 @@ def normalise(array): array_max = np.max(array) for col in range(1, len(array) + 1): - norm = float((array[col - 1] - array_min) / (array_max - array_min)) + # norm = float((array[col - 1] - array_min) / (array_max - array_min)) + norm = float(array[col - 1] - array_min) / float(array_max - array_min) norm = round(norm, fp) # force to 4 decimal points array_norm = np.append(array_norm, norm) diff --git a/tools/karoo_sort.py b/tools/karoo_sort.py new file mode 100644 index 0000000..145f32d --- /dev/null +++ b/tools/karoo_sort.py @@ -0,0 +1,67 @@ +# Karoo Dataset Builder +# by Kai Staats, MSc UCT / AIMS and Arun Kumar, PhD +# version 0.9.2.0 + +import sys +import numpy as np + +np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees + +''' +In machine learning, it is often the case that your engaged dataset is derived from a larger parent. In constructing +the subset, if we grab a series of datapoints (rows in a .csv) from the larger dataset in sequential order, only from +the top, middle, or bottom, we will likely bias the new dataset and incorrectly train the machine learning algorithm. +Therefore, it is imperative that we engage a random function, guided only by the number of data points for each class. + +This script can be used *before* karoo_normalise.py, and assumes no header has yet been applied to the .csv. +''' + +### USER INTERACTION ### +if len(sys.argv) == 1: print '\n\t\033[31mERROR! You have not assigned an input file. Try again ...\033[0;0m'; sys.exit() +elif len(sys.argv) > 2: print '\n\t\033[31mERROR! You have assigned too many command line arguments. 
Try again ...\033[0;0m'; sys.exit() +else: filename = sys.argv[1] + +#n = range(1,101) +#while True: +# try: +# labels = raw_input('\n\tEnter number of unique class labels, or 0 for a regression dataset (default 2): ') +# if labels not in str(n) and labels not in '': raise ValueError() +# # if labels == '0': labels = 1; break +# labels = labels or 2; labels = int(labels); break +# except ValueError: print '\n\t\033[32mEnter a number from 0 including 100. Try again ...\033[0;0m' + +n = range(10,10001) +while True: + try: + samples = raw_input('\n\tEnter number of desired datapoints per class (default 100): ') + if samples not in str(n) and samples not in '': raise ValueError() + if samples == '0': samples = 10; break + samples = samples or 100; samples = int(samples); break + except ValueError: print '\n\t\033[32mEnter a number from 10 including 10000. Try again ...\033[0;0m' + + +### LOAD THE ORIGINAL DATASET ### +print '\n\t\033[36m\n\tLoading dataset:', filename, '\033[0;0m\n' +data = np.loadtxt(filename, delimiter = ',') # load data +data_sort = np.empty(shape = [0, data.shape[1]]) # build an empty array of the proper dimensions + + +### SORT DATA by LABEL ### +labels = len(np.unique(data[:,-1])) + +for label in range(labels): + data_list = np.where(data[:,-1] == label) # build a list of all rows which end in the current label + + data_select = np.random.choice(data_list[0], samples, replace = False) # select user defined 'samples' from list + print data_select + + data_sort = np.append(data_sort, data[data_select], axis = 0) + + +### SAVE THE SORTED DATASET ### +file_tmp = filename.split('.')[0] +np.savetxt(file_tmp + '-SORT.csv', data_sort, delimiter = ',') + +print '\n\t\033[36mThe sorted dataset has been written to the file:', file_tmp + '-SORT.csv', '\033[0;0m' + +