diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 0f82f0e..c7de6db 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,10 +1,22 @@ +2017 02/08 + +The parameters.txt file now captures the full run-time configuration of both karoo_gp_main.py and karoo_gp_server.py. +This file is written to the unique (datetime stamp) directory in karoo_gp/runs/ at the auto-termination of _server.py +and with the user executed 'q'uit of _main.py. + +The final list of best fit evolved Trees and the test of the highest numbered (usually the highest performing) Tree is +also recorded, with the test auto-executed based upon the original kernel designation. This functionality supports the +fully hands-off execution of Karoo GP on a remote server, from a bash or Python script, for parallel or multiple +serial evolutionary runs. + + 2017 02/06 Graphics Processing Units (GPU) are now supported with the introduction of the Python library TensorFlow. The end result is a staggering improvement in performance. With one comparison of a 10,000 data points (rows) x 9 features -(columns) dataset on a 40 core Intel Xeon motherboard versus a 2000 core Nvidia GPU card, the wall time as reduced from -50 hours to less than 4 minutes. On CPU-only computers, the performance on a single core is as much as 10x improved due -to the vectorisation of the data and application of the C-based TensorFlow maths library. +(columns) dataset on a 40 core Intel Xeon motherboard versus a 2000 core Nvidia GPU card, the wall time was reduced +from 50 hours to less than 4 minutes. On CPU-only computers, the performance on a single core is as much as 10x +improved due to the vectorisation of the data and application of the C-based TensorFlow maths library. To install TensorFlow, I recommend visiting https://www.tensorflow.org/get_started/ It is straight forward for Ubuntu, but unfortunately can be rather challenging with OSX. Have patience. Review the forums. It's worth the effort. @@ -29,10 +41,10 @@ A number of other changes have been integrated, including: for multi-core processing which is now automated by TensorFlow. - The libraries 'pprocess' and 'time' are no longer required nor imported. - - - The population_* files (.csv) are now deposited into unique directories created with the launch of each run. A .txt - file is also written to each directory which captures the run-time configuration of Karoo GP. This enables truly - scriptable runs of Karoo. + + - The population_* files (.csv) are now written into unique directories created inside of karoo_gp/runs/ with the + launch of each run. A .txt file is also written to each directory which captures the run-time configuration of + Karoo GP. This enables truly scriptable runs of Karoo. - The Server interface to Karoo GP (karoo_gp_server.py) now terminates completely, kicking back to the command line. This enables bash or chron launches of multiple sequential or parallel runs, enabling the exploration of multiple diff --git a/karoo_gp_base_class.py b/karoo_gp_base_class.py index cb8da2b..e8aff44 100644 --- a/karoo_gp_base_class.py +++ b/karoo_gp_base_class.py @@ -2,7 +2,7 @@ # Define the methods and global variables used by Karoo GP # by Kai Staats, MSc; see LICENSE.md # Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0 +# version 1.0.1 ''' A NOTE TO THE NEWBIE, EXPERT, AND BRAVE @@ -14,12 +14,14 @@ likely find more enjoyment of this particular flavour of GP with a little unders import csv import os import sys -import datetime import numpy as np import sklearn.metrics as skm import sklearn.cross_validation as skcv -import sympy as sp + +from sympy import sympify +from datetime import datetime +from collections import OrderedDict # TensorFlow-related imports os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" @@ -99,6 +101,7 @@ class Base_GP(object): 'gp.fitness_type' fitness type 'gp.datetime' date-time stamp of when the unique directory is created 'gp.path' full path to the unique directory created with each run + 'gp.dataset' local path and dataset filename ### Global variables initiated and/or used by Sympy ### 'gp.algo_raw' a Sympy string which represents a flattened tree @@ -221,7 +224,7 @@ class Base_GP(object): ### 1) load the associated data set, operators, operands, fitness type, and coefficients ### - #full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) # helps with chron jobs --Thanks Marco! + #full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) cwd = os.getcwd() # Good idea Marco :) data_dict = {'c':cwd + '/files/data_CLASSIFY.csv', 'r':cwd + '/files/data_REGRESS.csv', 'm':cwd + '/files/data_MATCH.csv', 'p':cwd + '/files/data_PLAY.csv'} @@ -230,21 +233,24 @@ class Base_GP(object): data_x = np.loadtxt(data_dict[self.kernel], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(data_dict[self.kernel], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) header = open(data_dict[self.kernel],'r') + self.dataset = data_dict[self.kernel] elif len(sys.argv) == 2: # load an external data file data_x = np.loadtxt(sys.argv[1], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(sys.argv[1], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) header = open(sys.argv[1],'r') + self.dataset = sys.argv[1] elif len(sys.argv) > 2: # receive filename and additional flags from karoo_gp_server.py via argparse data_x = np.loadtxt(filename, skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(filename, skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) header = open(filename,'r') + self.dataset = filename fitt_dict = {'c':'max', 'r':'min', 'm':'max', 'p':''} self.fitness_type = fitt_dict[self.kernel] # load fitness type - func_dict = {'c':cwd + '/files/functions_CLASSIFY.csv', 'r':cwd + '/files/functions_REGRESS.csv', 'm':cwd + '/files/functions_MATCH.csv', 'p':cwd + '/files/functions_PLAY.csv'} + func_dict = {'c':cwd + '/files/operators_CLASSIFY.csv', 'r':cwd + '/files/operators_REGRESS.csv', 'm':cwd + '/files/operators_MATCH.csv', 'p':cwd + '/files/operators_PLAY.csv'} self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the user defined terminals (operands) self.class_labels = len(np.unique(data_y)) # load the user defined labels for classification or solutions for regression @@ -283,7 +289,8 @@ class Base_GP(object): ### 4) create a unique directory and initialise all .csv files ### - self.datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + # self.datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + self.datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') self.path = os.path.join(cwd, 'runs/', self.datetime) # generate a unique directory name if not os.path.isdir(self.path): os.makedirs(self.path) # make a unique directory @@ -669,10 +676,9 @@ class Base_GP(object): elif pause == 'l': # display dictionary of Trees with the best fitness score print '\n\t The leading Trees and their associated expressions are:' - for n in range(len(self.fittest_dict)): - print '\t ', self.fittest_dict.keys()[n], ':', self.fittest_dict.values()[n] - - + for item in sorted(self.fittest_dict): print '\t ', item, ':', self.fittest_dict[item] + + elif pause == 't': # evaluate a Tree against the TEST data if self.generation_id > 1: menu = range(1, len(self.population_b)) @@ -870,18 +876,18 @@ class Base_GP(object): Arguments required: TREE_ID, tree_type, tree_depth_base ''' - self.pop_TREE_ID = TREE_ID # pos 0: a unique identifier for each tree - self.pop_tree_type = tree_type # pos 1: a global constant based upon the initial user setting + self.pop_TREE_ID = TREE_ID # pos 0: a unique identifier for each tree + self.pop_tree_type = tree_type # pos 1: a global constant based upon the initial user setting self.pop_tree_depth_base = tree_depth_base # pos 2: a global variable which conveys 'tree_depth_base' as unique to each new Tree self.pop_NODE_ID = 1 # pos 3: unique identifier for each node; this is the INDEX KEY to this array - self.pop_node_depth = 0 # pos 4: depth of each node when committed to the array - self.pop_node_type = '' # pos 5: root, function, or terminal - self.pop_node_label = '' # pos 6: operator [+, -, *, ...] or terminal [a, b, c, ...] - self.pop_node_parent = '' # pos 7: parent node - self.pop_node_arity = '' # pos 8: number of nodes attached to each non-terminal node - self.pop_node_c1 = '' # pos 9: child node 1 - self.pop_node_c2 = '' # pos 10: child node 2 - self.pop_node_c3 = '' # pos 11: child node 3 (assumed max of 3 with boolean operator 'if') + self.pop_node_depth = 0 # pos 4: depth of each node when committed to the array + self.pop_node_type = '' # pos 5: root, function, or terminal + self.pop_node_label = '' # pos 6: operator [+, -, *, ...] or terminal [a, b, c, ...] + self.pop_node_parent = '' # pos 7: parent node + self.pop_node_arity = '' # pos 8: number of nodes attached to each non-terminal node + self.pop_node_c1 = '' # pos 9: child node 1 + self.pop_node_c2 = '' # pos 10: child node 2 + self.pop_node_c3 = '' # pos 11: child node 3 (assumed max of 3 with boolean operator 'if') self.pop_fitness = '' # pos 12: fitness value following Tree evaluation self.tree = np.array([ ['TREE_ID'],['tree_type'],['tree_depth_base'],['NODE_ID'],['node_depth'],['node_type'],['node_label'],['node_parent'],['node_arity'],['node_c1'],['node_c2'],['node_c3'],['fitness'] ]) @@ -1182,7 +1188,7 @@ class Base_GP(object): ''' self.algo_raw = self.fx_eval_label(tree, 1) # pass the root 'node_id', then flatten the Tree to a string - self.algo_sym = sp.sympify(self.algo_raw) # convert string to a functional expression (the coolest line in Karoo! :) + self.algo_sym = sympify(self.algo_raw) # convert string to a functional expression (the coolest line in Karoo! :) return @@ -1587,7 +1593,6 @@ class Base_GP(object): tree[12][1] = fitness # store the fitness with each tree tree[12][2] = len(str(self.algo_raw)) # store the length of the raw algo for parsimony - # tree[12][3] might equal 'error' as recorded by 'fx_eval_subs' - NO LONGER USED # if len(tree[3]) > 4: # if the Tree array is wide enough ... return @@ -1747,8 +1752,8 @@ class Base_GP(object): print '\t\033[36m Data row {} predicts class:\033[1m {} ({} label) as {:.2f}{}\033[0;0m'.format(i, int(result['labels'][0][i]), int(result['solution'][i]), result['result'][i], result['labels'][1][i]) print '\n Fitness value: {}'.format(result['fitness']) - print '\n Classification report:', '\n', skm.classification_report(result['solution'], result['labels'][0]) - print ' Confusion matrix:', '\n', skm.confusion_matrix(result['solution'], result['labels'][0]) + print '\n Classification report:\n', skm.classification_report(result['solution'], result['labels'][0]) + print ' Confusion matrix:\n', skm.confusion_matrix(result['solution'], result['labels'][0]) return @@ -1849,6 +1854,7 @@ class Base_GP(object): if self.display == 'db': print '\n\n\033[33m *** Full Mutation: function to function *** \033[0;0m\n\n\033[36m This is the unaltered tourn_winner:\033[0;0m\n', tree for n in range(len(branch)): + # 'root' is not made available for Full mutation as this would build an entirely new Tree if tree[5][branch[n]] == 'func': @@ -1894,7 +1900,6 @@ class Base_GP(object): ''' branch_top = int(branch[0]) # replaces 2 instances, below; tested 2016 07/09 - # branch_depth = int(tree[2][1]) - int(tree[4][branch_top]) # 'tree_depth_base' - depth at 'branch_top' to set max potential size of new branch - ORIGINAL branch_depth = self.tree_depth_max - int(tree[4][branch_top]) # 'tree_depth_max' - depth at 'branch_top' to set max potential size of new branch - 2016 07/10 if branch_depth < 0: # this has never occured ... yet @@ -2029,7 +2034,7 @@ class Base_GP(object): branch = np.array([]) # the array is necessary in order to len(branch) when 'branch' has only one element branch_top = np.random.randint(2, len(tree[3])) # randomly select a non-root node branch_eval = self.fx_eval_id(tree, branch_top) # generate tuple of 'branch_top' and subseqent nodes - branch_symp = sp.sympify(branch_eval) # convert string into something useful + branch_symp = sympify(branch_eval) # convert string into something useful branch = np.append(branch, branch_symp) # append list to array branch = np.sort(branch) # sort nodes in branch for Crossover. @@ -2476,7 +2481,6 @@ class Base_GP(object): ind = '' print '\n\033[1m\033[36m Tree ID', int(tree[0][1]), '\033[0;0m' - #for depth in range(0, int(tree[2][1]) + self.tree_depth_max + 1): # increment through all Tree depths - tested 2016 07/09 for depth in range(0, self.tree_depth_max + 1): # increment through all possible Tree depths - tested 2016 07/09 print '\n', ind,'\033[36m Tree Depth:', depth, 'of', tree[2][1], '\033[0;0m' @@ -2511,8 +2515,8 @@ class Base_GP(object): ''' branch = np.array([]) # the array is necessary in order to len(branch) when 'branch' has only one element - branch_eval = self.fx_eval_id(tree, start) # generate tuple of given 'branch' - branch_symp = sp.sympify(branch_eval) # convert string from tuple to list + branch_eval = self.fx_eval_id(tree, start) # generate tuple of given 'branch' + branch_symp = sympify(branch_eval) # convert string from tuple to list branch = np.append(branch, branch_symp) # append list to array ind = '' @@ -2585,7 +2589,6 @@ class Base_GP(object): target = csv.writer(csv_file, delimiter=',') if self.generation_id != 1: target.writerows(['']) # empty row before each generation target.writerows([['Karoo GP by Kai Staats', 'Generation:', str(self.generation_id)]]) - # NEED TO ADD: time to file header for tree in range(1, len(population)): target.writerows(['']) # empty row before each Tree @@ -2593,7 +2596,7 @@ class Base_GP(object): target.writerows([population[tree][row]]) - def fx_archive_params_write(self, app): + def fx_archive_params_write(self, app): # tested 2017 02/08 ''' Save run-time configuration parameters to disk. @@ -2604,12 +2607,10 @@ class Base_GP(object): file = open(self.path + '/parameters.txt', 'w') file.write('Karoo GP ' + app) - file.write('\n launched: ' + self.datetime) - file.write('\n dataset: [n/a]') - file.write('\n best tree: [n/a]') - file.write('\n best tree fitness: [n/a]') + file.write('\n launched: ' + str(self.datetime)) + file.write('\n dataset: ' + str(self.dataset)) file.write('\n') - file.write('\n kernel: ' + self.kernel) + file.write('\n kernel: ' + str(self.kernel)) file.write('\n precision: ' + str(self.precision)) file.write('\n') # file.write('tree type: ' + tree_type) @@ -2617,15 +2618,48 @@ class Base_GP(object): file.write('\n tree depth max: ' + str(self.tree_depth_max)) file.write('\n tree depth min: ' + str(self.tree_depth_min)) file.write('\n') - file.write('\n\t genetic operator Reproduction: ' + str(self.evolve_repro)) - file.write('\n\t genetic operator Point Mutation: ' + str(self.evolve_point)) - file.write('\n\t genetic operator Branch Mutation: ' + str(self.evolve_branch)) - file.write('\n\t genetic operator Crossover: ' + str(self.evolve_cross)) + file.write('\n genetic operator Reproduction: ' + str(self.evolve_repro)) + file.write('\n genetic operator Point Mutation: ' + str(self.evolve_point)) + file.write('\n genetic operator Branch Mutation: ' + str(self.evolve_branch)) + file.write('\n genetic operator Crossover: ' + str(self.evolve_cross)) file.write('\n') file.write('\n tournament size: ' + str(self.tourn_size)) file.write('\n population: ' + str(self.tree_pop_max)) file.write('\n number of generations: ' + str(self.generation_id)) + # parse the 'l'ist for the highest numbered Tree + if len(self.fittest_dict) > 0: + + file.write('\n\n The leading Trees and their associated expressions are:') + for item in sorted(self.fittest_dict): + file.write('\n\t ' + str(item) + ' : ' + str(self.fittest_dict[item])) + + # test the highest numbered Tree and write to the .txt log + self.fx_eval_poly(self.population_b[int(item)]) # generate the raw and sympified equation for the given Tree using SymPy + expr = str(self.algo_sym) # get simplified expression and process it by TF - tested 2017 02/02 + result = self.fx_fitness_eval(expr, self.data_test, get_labels=True) + + file.write('\n\n Tree ' + str(item) + ' yields (sym): ' + str(self.algo_sym)) + + if self.kernel == 'c': + file.write('\n\n Fitness value: {}'.format(result['fitness'])) + file.write('\n\n Classification report:\n {}'.format(skm.classification_report(result['solution'], result['labels'][0]))) + file.write('\n Confusion matrix:\n {}'.format(skm.confusion_matrix(result['solution'], result['labels'][0]))) + + elif self.kernel == 'r': + MSE, fitness = skm.mean_squared_error(result['result'], result['solution']), result['fitness'] + file.write('\n\n Fitness value: {}'.format(fitness)) + file.write('\n Mean Squared Error: {}'.format(MSE)) + + elif self.kernel == 'm': + file.write('\n\n Fitness value: {}'.format(result['fitness'])) + + # elif self.kernel == '[other]': + # file.write( ... ) + + else: file.write('\n\n There were no evolved solutions generated in this run... your species has gone extinct!') + + file.write('\n\n') file.close() return diff --git a/karoo_gp_main.py b/karoo_gp_main.py index b0ec01e..5c69c4c 100644 --- a/karoo_gp_main.py +++ b/karoo_gp_main.py @@ -2,7 +2,7 @@ # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc; see LICENSE.md # Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0 +# version 1.0.1 ''' A word to the newbie, expert, and brave-- @@ -152,8 +152,8 @@ else: # if any other kernel is selected # define the ratio between types of mutation, where all sum to 1.0; can be adjusted in 'i'nteractive mode gp.evolve_repro = int(0.1 * gp.tree_pop_max) # quantity of a population generated through Reproduction -gp.evolve_point = int(0.1 * gp.tree_pop_max) # quantity of a population generated through Point Mutation -gp.evolve_branch = int(0.1 * gp.tree_pop_max) # quantity of a population generated through Branch Mutation +gp.evolve_point = int(0.0 * gp.tree_pop_max) # quantity of a population generated through Point Mutation +gp.evolve_branch = int(0.2 * gp.tree_pop_max) # quantity of a population generated through Branch Mutation gp.evolve_cross = int(0.7 * gp.tree_pop_max) # quantity of a population generated through Crossover gp.tourn_size = 10 # qty of individuals entered into each tournament (standard 10); can be adjusted in 'i'nteractive mode diff --git a/karoo_gp_server.py b/karoo_gp_server.py index 63e262e..3a6e8ed 100644 --- a/karoo_gp_server.py +++ b/karoo_gp_server.py @@ -2,7 +2,7 @@ # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc; see LICENSE.md # Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0 +# version 1.0.1 ''' A word to the newbie, expert, and brave--