diff --git a/Karoo_GP_User_Guide.pdf b/Karoo_GP_User_Guide.pdf index 6ab59b5..1c1fbc3 100644 Binary files a/Karoo_GP_User_Guide.pdf and b/Karoo_GP_User_Guide.pdf differ diff --git a/LICENSE.md b/LICENSE.md index 2906dea..82670d7 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2015-2016 Kai Staats (www.kaistaats.com) +Copyright (c) 2015-2018 Kai Staats (www.kaistaats.com) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index ed80d54..cd601ec 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,9 +1,30 @@ -2017 08/10 -Relatively light updates this time. +2018 02/27 +Updated the Python library versions and improved some explanation of Operators and Operands in the User Guide. -Iurii fixed a minor bug in which the MATCHING function (used only for demonstrations) would not find a match despite -the fact that the output was correct, due to the way in which TensorFlow was handling floating points and precision. -Iurii used numpy.allclose.html as a reference to resolve the situation. + +2017 10/26 +An upgrade from Tensorflow 1.1 to 1.3 caused the Classify kernel test to break. Fixed by Iurii by replacing [] with () +in the 'fx_fitness_eval' method. + + +2017 10/17 +To be consistent with the anticipated Machine Learning vocabulary, replaced the term 'label' with 'pred_labels' +(predicted labels) in the TF graph methods. + + +2017 08/10 +Iurii fixed a minor bug in which the MATCHING function (used only for demonstrations) would not find a match despite +the output being (apparently) correct. 
This was discovered to be due to the way in which TensorFlow was handling +floating points and precision, as follows: + + [ -0.29999995 0.69999981 13.69999981 16.70000076 19.70000076 + 22.70000076 25.70000076 28.70000076 31.70000076 34.70000076] + + [ -0.30000001 0.69999999 13.69999981 16.70000076 19.70000076 + 22.70000076 25.70000076 28.70000076 31.70000076 34.70000076] + +As you can see, the values are close, but not equal and so a "match" was not resolved. Iurii used numpy.allclose.html +as a reference to resolve the situation. I also modified the autosave to the runs/ directory such that if you are using an external dataset (quite likely), the new directory (for each run) will be saved as [filename]-[date_time_stamp]/ The idea (thank you Marco) is to help keep diff --git a/files/data_MATCH.csv b/files/data_MATCH.csv index 20ada99..068fd02 100644 --- a/files/data_MATCH.csv +++ b/files/data_MATCH.csv @@ -1,11 +1,11 @@ a,b,c,s -0,1,2,3 -1,2,3,6 -2,3,4,9 -3,4,5,12 -4,5,6,15 -5,6,7,18 -6,7,8,21 -7,8,9,24 -8,9,10,27 -9,10,11,30 +0,1,2,3.0 +1,2,3,6.0 +2,3,4,9.0 +3,4,5,12.0 +4,5,6,15.0 +5,6,7,18.0 +6,7,8,21.0 +7,8,9,24.0 +8,9,10,27.0 +9,10,11,30.0 diff --git a/karoo_gp_base_class.py b/karoo_gp_base_class.py index d165c9d..f5cc993 100644 --- a/karoo_gp_base_class.py +++ b/karoo_gp_base_class.py @@ -2,7 +2,7 @@ # Define the methods and global variables used by Karoo GP # by Kai Staats, MSc; see LICENSE.md # Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0.5 +# version 1.0.8 ''' A NOTE TO THE NEWBIE, EXPERT, AND BRAVE @@ -24,40 +24,45 @@ from sympy import sympify from datetime import datetime from collections import OrderedDict -# TensorFlow-related imports +# np.random.seed(1000) # for reproducibility + + +### TensorFlow Imports and Definitions ### os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" + import tensorflow as tf import ast import operator as op + operators = {ast.Add: tf.add, # 
e.g., a + b - ast.Sub: tf.subtract, # e.g., a - b - ast.Mult: tf.multiply, # e.g., a * b - ast.Div: tf.divide, # e.g., a / b - ast.Pow: tf.pow, # e.g., a ** 2 - ast.USub: tf.negative, # e.g., -a - ast.And: tf.logical_and, # e.g., a and b - ast.Or: tf.logical_or, # e.g., a or b - ast.Not: tf.logical_not, # e.g., not a - ast.Eq: tf.equal, # e.g., a == b - ast.NotEq: tf.not_equal, # e.g., a != b - ast.Lt: tf.less, # e.g., a < b - ast.LtE: tf.less_equal, # e.g., a <= b - ast.Gt: tf.greater, # e.g., a > b - ast.GtE: tf.greater_equal, # e.g., a >= 1 - 'abs': tf.abs, # e.g., abs(a) - 'sign': tf.sign, # e.g., sign(a) - 'square': tf.square, # e.g., square(a) - 'sqrt': tf.sqrt, # e.g., sqrt(a) - 'pow': tf.pow, # e.g., pow(a, b) - 'log': tf.log, # e.g., log(a) - 'log1p': tf.log1p, # e.g., log1p(a) - 'cos': tf.cos, # e.g., cos(a) - 'sin': tf.sin, # e.g., sin(a) - 'tan': tf.tan, # e.g., tan(a) - 'acos': tf.acos, # e.g., acos(a) - 'asin': tf.asin, # e.g., asin(a) - 'atan': tf.atan, # e.g., atan(a) - } + ast.Sub: tf.subtract, # e.g., a - b + ast.Mult: tf.multiply, # e.g., a * b + ast.Div: tf.divide, # e.g., a / b + ast.Pow: tf.pow, # e.g., a ** 2 + ast.USub: tf.negative, # e.g., -a + ast.And: tf.logical_and, # e.g., a and b + ast.Or: tf.logical_or, # e.g., a or b + ast.Not: tf.logical_not, # e.g., not a + ast.Eq: tf.equal, # e.g., a == b + ast.NotEq: tf.not_equal, # e.g., a != b + ast.Lt: tf.less, # e.g., a < b + ast.LtE: tf.less_equal, # e.g., a <= b + ast.Gt: tf.greater, # e.g., a > b + ast.GtE: tf.greater_equal, # e.g., a >= 1 + 'abs': tf.abs, # e.g., abs(a) + 'sign': tf.sign, # e.g., sign(a) + 'square': tf.square, # e.g., square(a) + 'sqrt': tf.sqrt, # e.g., sqrt(a) + 'pow': tf.pow, # e.g., pow(a, b) + 'log': tf.log, # e.g., log(a) + 'log1p': tf.log1p, # e.g., log1p(a) + 'cos': tf.cos, # e.g., cos(a) + 'sin': tf.sin, # e.g., sin(a) + 'tan': tf.tan, # e.g., tan(a) + 'acos': tf.acos, # e.g., acos(a) + 'asin': tf.asin, # e.g., asin(a) + 'atan': tf.atan, # e.g., atan(a) + } 
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees @@ -65,10 +70,9 @@ np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters class Base_GP(object): ''' - This Base_BP class contains all methods for Karoo GP. - - Method names are differentiated from global variable names (defined below) by the prefix 'fx_' followed by an object - and action, as in 'fx_display_tree()', with a few expections, such as 'fx_fitness_gene_pool'. + This Base_GP class contains all methods for Karoo GP. Method names are differentiated from global variable names + (defined below) by the prefix 'fx_' followed by an object and action, as in 'fx_display_tree()', with a few + exceptions, such as 'fx_fitness_gene_pool'. The categories (denoted by +++ banners +++) are as follows: 'karoo_gp' A single method which conducts an entire run. Employed only by karoo_gp_server.py @@ -80,85 +84,72 @@ class Base_GP(object): 'fx_display_' Methods to Display a Tree 'fx_archive_' Methods to Archive - There are no sub-classes at the time of this edit - 2015 09/21 + ### Global variables used for data management ### + 'gp.data_train' store train data for processing in TF + 'gp.data_test' store test data for processing in TF + 'gp.tf_device' set TF computation backend device (CPU or GPU) + 'gp.tf_device_log' employed for TensorFlow debugging + + 'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below) + 'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below) + 'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below) + 'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below) + + 'gp.functions' user defined functions (operators) from the associated files/[functions].csv + 'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv + 'gp.coeff' user defined 
coefficients (NOT YET IN USE) + 'gp.fitness_type' fitness type + 'gp.datetime' date-time stamp of when the unique directory is created + 'gp.path' full path to the unique directory created with each run + 'gp.dataset' local path and dataset filename + + ### Global variables initiated and/or used by Sympy ### + 'gp.algo_raw' a Sympy string which represents a flattened tree + 'gp.algo_sym' a Sympy executable version of algo_raw + 'gp.fittest_dict' a dictionary of the most fit trees, compiled during fitness function execution + + ### Global variables used for evolutionary management ### + 'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction + 'gp.population_b' the generation constructed from gp.population_a (recycled) + 'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions + 'gp.generation_id' simple n + 1 increment + 'gp.fitness_type' set in 'fx_karoo_data_load' as either a minimising or maximising function + 'gp.tree' axis-1, 13 element Numpy array that defines each Tree, stored in 'gp.population' + 'gp.pop_*' 13 variables that define each Tree (see 'fx_gen_tree_initialise') + + ### Error checks ### + You can quickly locate all error checks by searching for 'ERROR!' in this and all classes. + ''' - - #++++++++++++++++++++++++++++++++++++++++++ - # Define Global Variables | - #++++++++++++++++++++++++++++++++++++++++++ - + def __init__(self): - - ''' - All Karoo GP global variables are named with the prefix 'gp.' The 13 variables which begin with 'gp.pop_' are - specifically employed to define the 13 parameters for each tree as stored in the axis-1 (expand horizontally) - 'gp.population' Numpy array. 
+ + ### Global variables instantiated in karoo_gp_main.py and karoo_gp_server.py ### + self.kernel = '' # fitness function + self.tree_depth_max = 0 # maximum Tree depth for the entire run; limits bloat + self.tree_depth_min = 0 # minimum number of nodes + self.tree_pop_max = 0 # maximum number of Trees per generation + self.generation_max = 0 # maximum number of generations + self.tourn_size = 0 # number of Trees selected for each tournament - ### Global and local variables defined by the user in karoo_gp_main.py (in order of appearence) ### - 'gp.kernel' fitness function - 'gp.class_method' select the number of classes (will be automated in future version) - 'tree_type' Full, Grow, or Ramped 50/50 (local variable) - 'gp.tree_depth_min' minimum number of nodes - 'tree_depth_base' maximum Tree depth for the initial population, where nodes = 2^(depth + 1) - 1 - 'gp.tree_depth_max' maximum Tree depth for the entire run; introduces potential bloat - 'gp.tree_pop_max' maximum number of Trees per generation - 'gp.generation_max' maximum number of generations - 'gp.display' level of on-screen feedback + self.evolve_repro = 0 # quantity of a population generated through Reproduction + self.evolve_point = 0 # quantity of a population generated through Point Mutation + self.evolve_branch = 0 # quantity of a population generated through Branch Mutation + self.evolve_cross = 0 # quantity of a population generated through Crossover - 'gp.evolve_repro' quantity of a population generated through Reproduction - 'gp.evolve_point' quantity of a population generated through Point Mutation - 'gp.evolve_branch' quantity of a population generated through Branch Mutation - 'gp.evolve_cross' quantity of a population generated through Crossover + self.display = '' # display mode is set to (s)ilent # level of on-screen feedback + self.precision = 0 # the number of floating points for the round function in 'fx_fitness_eval' - 'gp.tourn_size' the number of Trees chosen for each tournament - 
'gp.precision' the number of floating points for all applications of the round function + # self.karoo_gp(tree_type, tree_depth_base, filename) # used by karoo_gp_server.py to launch an entire run - ### Global variables used for data management ### - 'gp.data_train' store train data for processing in TF - 'gp.data_test' store test data for processing in TF - 'gp.tf_device' set TF computation backend device (CPU or GPU) - 'gp.tf_device_log' employed for TensorFlow debugging - 'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below) - 'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below) - 'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below) - 'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below) + ### Global variables instantiated in the classes ### + self.algo_raw = [] # the raw expression generated by Sympy per Tree -- CONSIDER MAKING THIS VARIABLE LOCAL + self.algo_sym = [] # the expression generated by Sympy per Tree -- CONSIDER MAKING THIS VARIABLE LOCAL + self.fittest_dict = {} # all Trees which share the best fitness score - 'gp.functions' user defined functions (operators) from the associated files/[functions].csv - 'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv - 'gp.coeff' user defined coefficients (NOT YET IN USE) - 'gp.fitness_type' fitness type - 'gp.datetime' date-time stamp of when the unique directory is created - 'gp.path' full path to the unique directory created with each run - 'gp.dataset' local path and dataset filename - - ### Global variables initiated and/or used by Sympy ### - 'gp.algo_raw' a Sympy string which represents a flattened tree - 'gp.algo_sym' a Sympy executable version of algo_raw - 'gp.fittest_dict' a dictionary of the most fit trees, compiled during fitness function execution - - ### Variables used for evolutionary management ### - 
'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction - 'gp.population_b' the generation constructed from gp.population_a (recyled) - 'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions - 'gp.generation_id' simple n + 1 increment - 'gp.fitness_type' set in 'fx_karoo_data_load' as either a minimising or maximising function - 'gp.tree' axis-1, 13 element Numpy array that defines each Tree, stored in 'gp.population' - 'gp.pop_*' 13 elements which define each Tree (see 'fx_gen_tree_initialise' below) - - ### Fishing nets ### - You can insert a "fishing net" to search for a specific expression when you fear the evolutionary process or - something in the code may not be working. Search for "fishing net" and follow the directions. - - ### Error checks ### - You can quickly find all places in which error checks have been inserted by searching for "ERROR!" - ''' - - self.algo_raw = [] # temp store the raw expression -- CONSIDER MAKING THIS VARIABLE LOCAL - self.algo_sym = [] # temp store the sympified expression-- CONSIDER MAKING THIS VARIABLE LOCAL - self.fittest_dict = {} # temp store all Trees which share the best fitness score - self.gene_pool = [] # temp store all Tree IDs for use by Tournament - self.class_labels = 0 # temp set a variable which will be assigned the number of class labels (data_y) + self.gene_pool = [] # store all Tree IDs for use by Tournament + self.class_labels = 0 # the number of true class labels (data_y) return @@ -181,7 +172,7 @@ class Base_GP(object): start = time.time() # start the clock for the timer # construct first generation of Trees - self.fx_karoo_data_load(tree_type, tree_depth_base, filename) + self.fx_karoo_data_load(filename) self.generation_id = 1 # set initial generation ID self.population_a = ['Karoo GP by Kai Staats, Generation ' + str(self.generation_id)] # list to store all Tree arrays, one generation at a time 
self.fx_karoo_construct(tree_type, tree_depth_base) # construct the first population of Trees @@ -228,21 +219,21 @@ class Base_GP(object): os.system('clear') print '\n\033[36m\033[1m' - print '\t ** ** ****** ***** ****** ****** ****** ******' - print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' - print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' - print '\t **** ******** ****** ** ** ** ** ** *** ******' + print '\t ** ** ****** ***** ****** ****** ****** ******' + print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' + print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' + print '\t **** ******** ****** ** ** ** ** ** *** *******' print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' print '\t ** ** ** ** ** ** ** ** ** ** ** ** **' print '\t ** ** ** ** ** ** ****** ****** ****** **' print '\033[0;0m' - print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 1.0\033[0;0m' - + print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 1.0.7\033[0;0m' + return - def fx_karoo_data_load(self, tree_type, tree_depth_base, filename): + def fx_karoo_data_load(self, filename): ''' The data and function .csv files are loaded according to the fitness function kernel selected by the user. An @@ -251,33 +242,33 @@ class Base_GP(object): 10 rows will not be split, rather copied in full to both TRAINING and TEST as it is assumed you are conducting a system validation run, as with the built-in MATCH kernel and associated dataset. 
- Arguments required: tree_type, tree_depth_base, filename (of the dataset) + Arguments required: filename (of the dataset) ''' ### 1) load the associated data set, operators, operands, fitness type, and coefficients ### - full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) # Good idea Marco :) - # cwd = os.getcwd() + # full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) # Good idea Marco :) + cwd = os.getcwd() data_dict = {'c':cwd + '/files/data_CLASSIFY.csv', 'r':cwd + '/files/data_REGRESS.csv', 'm':cwd + '/files/data_MATCH.csv', 'p':cwd + '/files/data_PLAY.csv'} if len(sys.argv) == 1: # load data from the default karoo_gp/files/ directory data_x = np.loadtxt(data_dict[self.kernel], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(data_dict[self.kernel], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - header = open(data_dict[self.kernel],'r') - self.dataset = data_dict[self.kernel] + header = open(data_dict[self.kernel],'r') # read only the top row of parameters + self.dataset = data_dict[self.kernel] # copy the name only elif len(sys.argv) == 2: # load an external data file data_x = np.loadtxt(sys.argv[1], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(sys.argv[1], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - header = open(sys.argv[1],'r') - self.dataset = sys.argv[1] + header = open(sys.argv[1],'r') # read only the top row of parameters + self.dataset = sys.argv[1] # copy the name only - elif len(sys.argv) > 2: # receive filename and additional flags from karoo_gp_server.py via argparse + elif len(sys.argv) > 2: # receive filename and additional arguments from karoo_gp_server.py via argparse data_x = np.loadtxt(filename, skiprows = 1, 
delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column data_y = np.loadtxt(filename, skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels) - header = open(filename,'r') - self.dataset = filename + header = open(filename,'r') # read only the top row of parameters + self.dataset = filename # copy the name only fitt_dict = {'c':'max', 'r':'min', 'm':'max', 'p':''} self.fitness_type = fitt_dict[self.kernel] # load fitness type @@ -285,7 +276,7 @@ class Base_GP(object): func_dict = {'c':cwd + '/files/operators_CLASSIFY.csv', 'r':cwd + '/files/operators_REGRESS.csv', 'm':cwd + '/files/operators_MATCH.csv', 'p':cwd + '/files/operators_PLAY.csv'} self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators) self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the user defined terminals (operands) - self.class_labels = len(np.unique(data_y)) # load the user defined labels for classification or solutions for regression + self.class_labels = len(np.unique(data_y)) # load the user defined true labels for classification or solutions for regression #self.coeff = np.loadtxt(cwd + '/files/coefficients.csv', delimiter=',', skiprows=1, dtype = str) # load the user defined coefficients - NOT USED YET @@ -299,10 +290,10 @@ class Base_GP(object): x_train, x_test, y_train, y_test = skcv.train_test_split(data_x, data_y, test_size = 0.2) # 80/20 TRAIN/TEST split data_x, data_y = [], [] # clear from memory - data_train = np.c_[x_train, y_train] # recombine each row of data with its associated label (right column) + data_train = np.c_[x_train, y_train] # recombine each row of data with its associated class label (right column) x_train, y_train = [], [] # clear from memory - data_test = np.c_[x_test, y_test] # recombine each row of data with its associated label (right 
column) + data_test = np.c_[x_test, y_test] # recombine each row of data with its associated class label (right column) x_test, y_test = [], [] # clear from memory self.data_train_cols = len(data_train[0,:]) # qty count @@ -315,33 +306,30 @@ class Base_GP(object): self.data_train = data_train # Store train data for processing in TF self.data_test = data_test # Store test data for processing in TF - self.tf_device = "/gpu:0" # Set TF computation backend device (CPU or GPU) + self.tf_device = "/gpu:0" # Set TF computation backend device (CPU or GPU); gpu:n = 1st, 2nd, or ... GPU device self.tf_device_log = False # TF device usage logging (for debugging) ### 4) create a unique directory and initialise all .csv files ### - - # self.datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') self.datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') - self.path = os.path.join(cwd, 'runs/', filename.split('.')[0] + '_' + self.datetime) # generate a unique directory name - # self.path = os.path.join(cwd, 'runs/', self.datetime) # generate a unique directory name + self.path = os.path.join(cwd, 'runs/', filename.split('.')[0] + '_' + self.datetime + '/') # generate a unique directory name if not os.path.isdir(self.path): os.makedirs(self.path) # make a unique directory self.filename = {} # a dictionary to hold .csv filenames - self.filename.update( {'a':self.path + '/population_a.csv'} ) + self.filename.update( {'a':self.path + 'population_a.csv'} ) target = open(self.filename['a'], 'w') # initialise the .csv file for population 'a' (foundation) target.close() - self.filename.update( {'b':self.path + '/population_b.csv'} ) + self.filename.update( {'b':self.path + 'population_b.csv'} ) target = open(self.filename['b'], 'w') # initialise the .csv file for population 'b' (evolving) target.close() - self.filename.update( {'f':self.path + '/population_f.csv'} ) + self.filename.update( {'f':self.path + 'population_f.csv'} ) target = open(self.filename['f'], 'w') # 
initialise the .csv file for the final population (test) target.close() - self.filename.update( {'s':self.path + '/population_s.csv'} ) + self.filename.update( {'s':self.path + 'population_s.csv'} ) # do NOT initialise this .csv file, as it is retained for loading a previous run (recover) return @@ -619,9 +607,10 @@ class Base_GP(object): while True: try: print '\n\t The current tournament size is:', self.tourn_size - query = int(raw_input('\t Adjust the tournament size (suggest 10): ')) - if query not in menu: raise ValueError() - self.tourn_size = query; break + query = raw_input('\t Adjust the tournament size (suggest 10): ') + if query not in str(menu) or query == '0' or query == '1': raise ValueError() # not ideal 20170918 + elif query == '': break + self.tourn_size = int(query); break except ValueError: print '\n\t\033[32m Enter a number from 2 including', str(self.tree_pop_max) + ".", 'Try again ...\033[0;0m' @@ -630,9 +619,10 @@ class Base_GP(object): while True: try: print '\n\t The current minimum number of nodes is:', self.tree_depth_min - query = int(raw_input('\t Adjust the minimum number of nodes for all Trees (min 3): ')) - if query not in menu: raise ValueError() - self.tree_depth_min = query; break + query = raw_input('\t Adjust the minimum number of nodes for all Trees (min 3): ') + if query not in str(menu) or query == '0' or query == '1' or query == '2': raise ValueError() # not ideal 20170918 + elif query == '': break + self.tree_depth_min = int(query); break except ValueError: print '\n\t\033[32m Enter a number from 3 including 1000. Try again ...\033[0;0m' @@ -643,8 +633,8 @@ class Base_GP(object): # while True: # try: # print '\n\t The current \033[3madjusted\033[0;0m maximum Tree depth is:', self.tree_depth_max - # query = int(raw_input('\n\t Adjust the global maximum Tree depth to (1 ... 10): ')) - # if query not in menu: raise ValueError() + # query = raw_input('\n\t Adjust the global maximum Tree depth to (1 ... 
10): ') + # if query not in str(menu): raise ValueError() # if query < self.tree_depth_max: # print '\n\t\033[32m This value is less than the current value.\033[0;0m' # conf = raw_input('\n\t Are you ok with this? (y/n) ') @@ -666,7 +656,7 @@ class Base_GP(object): query = raw_input('\t Enter quantity of Trees to be generated by Reproduction: ') if query not in str(menu): raise ValueError() elif query == '': break - tmp_repro = int(float(query)); break + tmp_repro = int(query); break # replaced int(float(query)) 20170918 except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m' while True: @@ -674,7 +664,7 @@ class Base_GP(object): query = raw_input('\t Enter quantity of Trees to be generated by Point Mutation: ') if query not in str(menu): raise ValueError() elif query == '': break - tmp_point = int(float(query)); break + tmp_point = int(query); break # replaced int(float(query)) 20170918 except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m' while True: @@ -682,7 +672,7 @@ class Base_GP(object): query = raw_input('\t Enter quantity of Trees to be generated by Branch Mutation: ') if query not in str(menu): raise ValueError() elif query == '': break - tmp_branch = int(float(query)); break + tmp_branch = int(query); break # replaced int(float(query)) 20170918 except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m' while True: @@ -690,7 +680,7 @@ class Base_GP(object): query = raw_input('\t Enter quantity of Trees to be generated by Crossover: ') if query not in str(menu): raise ValueError() elif query == '': break - tmp_cross = int(float(query)); break + tmp_cross = int(query); break # replaced int(float(query)) 20170918 except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. 
Try again ...\033[0;0m' if tmp_repro + tmp_point + tmp_branch + tmp_cross != self.tree_pop_max: print '\n\t The sum of the above does not equal', self.tree_pop_max, 'Try again ...' @@ -720,7 +710,7 @@ class Base_GP(object): # get simplified expression and process it by TF - tested 2017 02/02 expr = str(self.algo_sym) # might change this to algo_raw for more correct expression evaluation - result = self.fx_fitness_eval(expr, self.data_test, get_labels=True) + result = self.fx_fitness_eval(expr, self.data_test, get_pred_labels = True) print '\n\t\033[36mTree', query, 'yields (raw):', self.algo_raw, '\033[0;0m' print '\t\033[36mTree', query, 'yields (sym):\033[1m', self.algo_sym, '\033[0;0m\n' @@ -814,14 +804,14 @@ class Base_GP(object): elif pause == 'q': if eol == 0: # if the GP run is not at the final generation - query = raw_input('\n\t \033[32mThe current population_b will be lost!\033[0;0m\n\n\t Are you certain you want to quit? (y/n)') + query = raw_input('\n\t \033[32mThe current population_b will be lost!\033[0;0m\n\n\t Are you certain you want to quit? (y/n) ') if query == 'y': self.fx_archive_params_write('Desktop') # save run-time parameters to disk sys.exit() # quit the script without saving population_b else: break else: # if the GP run is complete - query = raw_input('\n\t Are you certain you want to quit? (y/n)') + query = raw_input('\n\t Are you certain you want to quit? 
(y/n) ') if query == 'y': print '\n\t \033[32mYour Trees and runtime parameters are archived in karoo_gp/runs/\033[0;0m' self.fx_archive_params_write('Desktop') # save run-time parameters to disk @@ -1244,7 +1234,7 @@ class Base_GP(object): else: if tree[8, node_id] == '1': # arity of 1 for the explicit pattern 'not [term]' - return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id] # original code + return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id] elif tree[8, node_id] == '2': # arity of 2 for the pattern '[func] [term] [func]' return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id] + self.fx_eval_label(tree, tree[10, node_id]) @@ -1383,32 +1373,32 @@ class Base_GP(object): return - def fx_fitness_eval(self, expr, data, get_labels = False): + def fx_fitness_eval(self, expr, data, get_pred_labels = False): ''' Computes tree expression using TensorFlow (TF) returning results and fitness scores. - This method orchestrates most of the TF routines by parsing input string expression and converting it into TF - operation graph which then is processed in an isolated TF session to compute the results and corresponding fitness - values. + This method orchestrates most of the TF routines by parsing input string 'expression' and converting it into a TF + operation graph which is then processed in an isolated TF session to compute the results and corresponding fitness + values. 'self.tf_device' - controls which device will be used for computations (CPU or GPU). 'self.tf_device_log' - controls device placement logging (debug only). Args: 'expr' - a string containing math expression to be computed on the data. Variable names should match corresponding - terminal names in 'self.terminals'. Only algebraic operations are currently supported (+, -, *, /, **). + terminal names in 'self.terminals'. - 'data' - an 'n by m' matrix of the data points containing n observations and m features each. 
Variable order should - match corresponding order of terminals in 'self.terminals'. - - 'get_labels' - a boolean flag which controls whether classification labels should be extracted from the results. - This is applied only to the CLASSIFY kernel and defaults to 'False'. + 'data' - an 'n by m' matrix of the data points containing n observations and m features per observation. + Variable order should match corresponding order of terminals in 'self.terminals'. + 'get_pred_labels' - a boolean flag which controls whether the predicted labels should be extracted from the + evolved results. This applies only to the CLASSIFY kernel and defaults to 'False'. + Returns: A dict mapping keys to the following outputs: 'result' - an array of the results of applying given expression to the data - 'labels' - an array of the labels extracted from the results; defined only for CLASSIFY kernel, None otherwise + 'pred_labels' - an array of the predicted labels extracted from the results; defined only for CLASSIFY kernel, else None 'solution' - an array of the solution values extracted from the data (variable 's' in the dataset) 'pairwise_fitness' - an array of the element-wise results of applying corresponding fitness kernel function 'fitness' - aggregated scalar fitness score @@ -1424,20 +1414,19 @@ class Base_GP(object): with tf.Session(config=config) as sess: with sess.graph.device(self.tf_device): - # 1 - Load data into TF + # 1 - Load data into TF vectors tensors = {} for i in range(len(self.terminals)): var = self.terminals[i] - tensors[var] = tf.constant(data[:, i], dtype=tf.float32) + tensors[var] = tf.constant(data[:, i], dtype=tf.float32) # converts data into vectors # 2- Transform string expression into TF operation graph result = self.fx_fitness_expr_parse(expr, tensors) - - labels = tf.no_op() # a placeholder, applies only to CLASSIFY kernel + pred_labels = tf.no_op() # a placeholder, applies only to CLASSIFY kernel solution = tensors['s'] # solution value is assumed to 
be stored in 's' terminal # 3- Add fitness computation into TF graph - if self.kernel == 'c': # CLASSIFY kernels + if self.kernel == 'c': # CLASSIFY kernel ''' Creates element-wise fitness computation TensorFlow (TF) sub-graph for CLASSIFY kernel. @@ -1445,22 +1434,24 @@ class Base_GP(object): This method uses the 'sympified' (SymPy) expression ('algo_sym') created in 'fx_eval_poly' and the data set loaded at run-time to evaluate the fitness of the selected kernel. - This multiclass classifer compares each row of a given Tree to the known solution, comparing estimated values - (labels) generated by Karoo GP against the correct labels. This method is able to work with any number of - class labels, from 2 to n. The left-most bin includes -inf. The right-most bin includes +inf. Those inbetween - are by default confined to the spacing of 1.0 each, as defined by: + This multiclass classifier compares each row of a given Tree to the known solution, comparing predicted labels + generated by Karoo GP against the true class labels. This method is able to work with any number of class + labels, from 2 to n. The left-most bin includes -inf. The right-most bin includes +inf. Those in between are + by default confined to the spacing of 1.0 each, as defined by: (solution - 1) < result <= solution The skew adjusts the boundaries of the bins such that they fall on both the negative and positive sides of the origin. At the time of this writing, an odd number of class labels will generate an extra bin on the positive side of origin as it has not yet been determined the effect of enabling the middle bin to include both a - negative and positive space. + negative and positive result. 
- Arguments required: result, solution + Arguments required: result, solution ''' - if get_labels: labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype=[tf.int32, tf.string], swap_memory=True) + # was breaking with upgrade from Tensorflow 1.1 to 1.3; fixed by Iurii by replacing [] with () as of 20171026 + # if get_pred_labels: pred_labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype = [tf.int32, tf.string], swap_memory = True) + if get_pred_labels: pred_labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype = (tf.int32, tf.string), swap_memory = True) skew = (self.class_labels / 2) - 1 @@ -1478,12 +1469,26 @@ class Base_GP(object): pairwise_fitness = tf.cast(tf.logical_or(tf.logical_or(rule13, rule23), rule33), tf.int32) + elif self.kernel == 'r': # REGRESSION kernel + + ''' + A very, very basic REGRESSION kernel which is not designed to perform well in the real world. It requires + that you raise the minimum node count to keep it from converging on the value of '1'. Consider writing or + integrating a more sophisticated kernel. + ''' + pairwise_fitness = tf.abs(solution - result) + elif self.kernel == 'm': # MATCH kernel + + ''' + This is used for demonstration purposes only. 
+ ''' + # pairwise_fitness = tf.cast(tf.equal(solution, result), tf.int32) # breaks due to floating points - RTOL, ATOL = 1e-05, 1e-08 + RTOL, ATOL = 1e-05, 1e-08 # fixes above issue by checking if a float value lies within a range of values pairwise_fitness = tf.cast(tf.less_equal(tf.abs(solution - result), ATOL + RTOL * tf.abs(result)), tf.int32) # elif self.kernel == '[other]': # [OTHER] kernel @@ -1494,9 +1499,9 @@ class Base_GP(object): fitness = tf.reduce_sum(pairwise_fitness) # Process TF graph and collect the results - result, labels, solution, fitness, pairwise_fitness = sess.run([result, labels, solution, fitness, pairwise_fitness]) + result, pred_labels, solution, fitness, pairwise_fitness = sess.run([result, pred_labels, solution, fitness, pairwise_fitness]) - return {'result': result, 'labels': labels, 'solution': solution, 'fitness': float(fitness), 'pairwise_fitness': pairwise_fitness} + return {'result': result, 'pred_labels': pred_labels, 'solution': solution, 'fitness': float(fitness), 'pairwise_fitness': pairwise_fitness} def fx_fitness_expr_parse(self, expr, tensors): @@ -1579,10 +1584,11 @@ class Base_GP(object): def fx_fitness_labels_map(self, result): ''' - Creates label extraction TensorFlow (TF) sub-graph for CLASSIFY kernel defined as a sequence of boolean conditions. - Outputs an array of tuples containing label extracted from the result and corresponding boolean condition triggered. + For the CLASSIFY kernel, creates a TensorFlow (TF) sub-graph defined as a sequence of boolean conditions based upon + the quantity of true class labels provided in the data .csv. Outputs an array of tuples containing the predicted + labels based upon the result and corresponding boolean condition triggered. 
- The original (pre-TensorFlow) code is as follows: + For comparison, the original (pre-TensorFlow) code follows: skew = (self.class_labels / 2) - 1 # '-1' keeps a binary classification splitting over the origin if solution == 0 and result <= 0 - skew; fitness = 1: # check for first class (the left-most bin) @@ -1600,9 +1606,9 @@ class Base_GP(object): cond = (class_label - 1 - skew < result) & (result <= class_label - skew) label_rules[class_label] = tf.cond(cond, lambda: (tf.constant(class_label), tf.constant(' <= {}'.format(class_label - skew))), lambda: label_rules[class_label + 1]) - zero_rule = tf.cond(result <= 0 - skew, lambda: (tf.constant(0), tf.constant(' <= {}'.format(0 - skew))), lambda: label_rules[1]) + pred_label = tf.cond(result <= 0 - skew, lambda: (tf.constant(0), tf.constant(' <= {}'.format(0 - skew))), lambda: label_rules[1]) - return zero_rule + return pred_label def fx_fitness_store(self, tree, fitness): @@ -1769,18 +1775,18 @@ class Base_GP(object): harmonic mean of Precision and Recall (F1) = 2(P x R) / (P + R) From scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html - y_pred = result, the estimated target values (labels) generated by Karoo GP - y_true = solution, the correct target values (labels) associated with the data + y_pred = result, the predicted labels generated by Karoo GP + y_true = solution, the true labels associated with the data Arguments required: result ''' for i in range(len(result['result'])): - print '\t\033[36m Data row {} predicts class:\033[1m {} ({} True)\033[0;0m\033[36m as {:.2f}{}\033[0;0m'.format(i, int(result['labels'][0][i]), int(result['solution'][i]), result['result'][i], result['labels'][1][i]) + print '\t\033[36m Data row {} predicts class:\033[1m {} ({} True)\033[0;0m\033[36m as {:.2f}{}\033[0;0m'.format(i, int(result['pred_labels'][0][i]), int(result['solution'][i]), result['result'][i], result['pred_labels'][1][i]) print '\n Fitness score: {}'.format(result['fitness']) - 
print '\n Precision-Recall report:\n', skm.classification_report(result['solution'], result['labels'][0]) - print ' Confusion matrix:\n', skm.confusion_matrix(result['solution'], result['labels'][0]) + print '\n Precision-Recall report:\n', skm.classification_report(result['solution'], result['pred_labels'][0]) + print ' Confusion matrix:\n', skm.confusion_matrix(result['solution'], result['pred_labels'][0]) return @@ -2633,7 +2639,7 @@ class Base_GP(object): Arguments required: none ''' - file = open(self.path + '/log_config.txt', 'w') + file = open(self.path + 'log_config.txt', 'w') file.write('Karoo GP ' + app) file.write('\n launched: ' + str(self.datetime)) file.write('\n dataset: ' + str(self.dataset)) @@ -2658,7 +2664,7 @@ class Base_GP(object): file.close() - file = open(self.path + '/log_test.txt', 'w') + file = open(self.path + 'log_test.txt', 'w') file.write('Karoo GP ' + app) file.write('\n launched: ' + str(self.datetime)) file.write('\n dataset: ' + str(self.dataset)) @@ -2668,12 +2674,7 @@ class Base_GP(object): fitness_best = 0 fittest_tree = 0 - - # original method, using pre-built fittest_dict - # file.write('\n The leading Trees and their associated expressions are:') - # for n in sorted(self.fittest_dict): - # file.write('\n\t ' + str(n) + ' : ' + str(self.fittest_dict[n])) - + # revised method, re-evaluating all Trees from stored fitness score for tree_id in range(1, len(self.population_b)): @@ -2698,19 +2699,19 @@ class Base_GP(object): # print 'fitness_best:', fitness_best, 'fittest_tree:', fittest_tree - + # test the most fit Tree and write to the .txt log self.fx_eval_poly(self.population_b[int(fittest_tree)]) # generate the raw and sympified equation for the given Tree using SymPy expr = str(self.algo_sym) # get simplified expression and process it by TF - tested 2017 02/02 - result = self.fx_fitness_eval(expr, self.data_test, get_labels=True) + result = self.fx_fitness_eval(expr, self.data_test, get_pred_labels = True) file.write('\n\n 
Tree ' + str(fittest_tree) + ' is the most fit, with expression:') file.write('\n\n ' + str(self.algo_sym)) if self.kernel == 'c': file.write('\n\n Classification fitness score: {}'.format(result['fitness'])) - file.write('\n\n Precision-Recall report:\n {}'.format(skm.classification_report(result['solution'], result['labels'][0]))) - file.write('\n Confusion matrix:\n {}'.format(skm.confusion_matrix(result['solution'], result['labels'][0]))) + file.write('\n\n Precision-Recall report:\n {}'.format(skm.classification_report(result['solution'], result['pred_labels'][0]))) + file.write('\n Confusion matrix:\n {}'.format(skm.confusion_matrix(result['solution'], result['pred_labels'][0]))) elif self.kernel == 'r': MSE, fitness = skm.mean_squared_error(result['result'], result['solution']), result['fitness'] diff --git a/karoo_gp_main.py b/karoo_gp_main.py index 55ab1ac..b2a8c2a 100644 --- a/karoo_gp_main.py +++ b/karoo_gp_main.py @@ -1,8 +1,7 @@ # Karoo GP Main (desktop) # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc; see LICENSE.md -# Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0.5 +# version 1.0.8 ''' A word to the newbie, expert, and brave-- @@ -31,10 +30,12 @@ If you include the path to an external dataset, it will auto-load at launch: $ python karoo_gp_main.py /[path]/[to_your]/[filename].csv ''' -import sys # sys.path.append('modules/') to add the directory 'modules' to the current path -import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP() +import sys; sys.path.append('modules/') # add directory 'modules' to the current path import time +import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP() + + #++++++++++++++++++++++++++++++++++++++++++ # User Defined Configuration | #++++++++++++++++++++++++++++++++++++++++++ @@ -87,12 +88,12 @@ while True: try: tree_depth_base = raw_input('\t Enter depth of the 
\033[3minitial\033[0;0m population of Trees (default 3): ') if tree_depth_base not in str(menu) or tree_depth_base == '0': raise ValueError() - tree_depth_base = tree_depth_base or 3; tree_depth_base = int(tree_depth_base); break + elif tree_depth_base == '': tree_depth_base = 3; break + tree_depth_base = int(tree_depth_base); break except ValueError: print '\t\033[32m Enter a number from 1 including 10. Try again ...\n\033[0;0m' except KeyboardInterrupt: sys.exit() - if gp.kernel == 'p': # if the Play kernel is selected gp.tree_depth_max = tree_depth_base gp.tree_pop_max = 1 @@ -106,11 +107,13 @@ else: # if any other kernel is selected menu = range(tree_depth_base,11) while True: try: - gp.tree_depth_max = raw_input('\t Enter maximum Tree depth (default matches \033[3minitial\033[0;0m): ') - if gp.tree_depth_max not in str(menu) or gp.tree_depth_max == '0': raise ValueError() - gp.tree_depth_max = gp.tree_depth_max or tree_depth_base; gp.tree_depth_max = int(gp.tree_depth_max); break - # gp.tree_depth_max = int(gp.tree_depth_max) - tree_depth_base; break - except ValueError: print '\t\033[32m Enter a number >= the maximum Tree depth. Try again ...\n\033[0;0m' + gp.tree_depth_max = raw_input('\t Enter maximum Tree depth (default %i): ' %tree_depth_base) + if gp.tree_depth_max not in str(menu): raise ValueError() + elif gp.tree_depth_max == '': gp.tree_depth_max = tree_depth_base + gp.tree_depth_max = int(gp.tree_depth_max) + if gp.tree_depth_max < tree_depth_base: raise ValueError() # an ugly exception to the norm 20170918 + else: break + except ValueError: print '\t\033[32m Enter a number >= the initial Tree depth. 
Try again ...\n\033[0;0m' except KeyboardInterrupt: sys.exit() menu = range(3,101) @@ -118,7 +121,8 @@ else: # if any other kernel is selected try: gp.tree_depth_min = raw_input('\t Enter minimum number of nodes for any given Tree (default 3): ') if gp.tree_depth_min not in str(menu) or gp.tree_depth_min == '0': raise ValueError() - gp.tree_depth_min = gp.tree_depth_min or 3; gp.tree_depth_min = int(gp.tree_depth_min); break + elif gp.tree_depth_min == '': gp.tree_depth_min = 3 + gp.tree_depth_min = int(gp.tree_depth_min); break except ValueError: print '\t\033[32m Enter a number from 3 to 2^(depth + 1) - 1 including 100. Try again ...\n\033[0;0m' except KeyboardInterrupt: sys.exit() @@ -127,7 +131,8 @@ else: # if any other kernel is selected try: gp.tree_pop_max = raw_input('\t Enter number of Trees in each population (default 100): ') if gp.tree_pop_max not in str(menu) or gp.tree_pop_max == '0': raise ValueError() - gp.tree_pop_max = gp.tree_pop_max or 100; gp.tree_pop_max = int(gp.tree_pop_max); break + elif gp.tree_pop_max == '': gp.tree_pop_max = 100 + gp.tree_pop_max = int(gp.tree_pop_max); break except ValueError: print '\t\033[32m Enter a number from 10 including 1000. Try again ...\n\033[0;0m' except KeyboardInterrupt: sys.exit() @@ -136,7 +141,8 @@ else: # if any other kernel is selected try: gp.generation_max = raw_input('\t Enter max number of generations (default 10): ') if gp.generation_max not in str(menu) or gp.generation_max == '0': raise ValueError() - gp.generation_max = gp.generation_max or 10; gp.generation_max = int(gp.generation_max); break + elif gp.generation_max == '': gp.generation_max = 10 + gp.generation_max = int(gp.generation_max); break except ValueError: print '\t\033[32m Enter a number from 1 including 100. 
Try again ...\n\033[0;0m' except KeyboardInterrupt: sys.exit() @@ -172,9 +178,11 @@ If the user has selected 'Play' mode, this is the only generation to be construc ''' start = time.time() # start the clock for the timer - + filename = '' # temp place holder -gp.fx_karoo_data_load(tree_type, tree_depth_base, filename) + +gp.fx_karoo_data_load(filename) + gp.generation_id = 1 # set initial generation ID gp.population_a = ['Karoo GP by Kai Staats, Generation ' + str(gp.generation_id)] # an empty list which will store all Tree arrays, one generation at a time diff --git a/karoo_gp_server.py b/karoo_gp_server.py index 1f9b90f..3425999 100644 --- a/karoo_gp_server.py +++ b/karoo_gp_server.py @@ -1,8 +1,7 @@ # Karoo GP Server # Use Genetic Programming for Classification and Symbolic Regression # by Kai Staats, MSc; see LICENSE.md -# Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov -# version 1.0.5 +# version 1.0.8 ''' A word to the newbie, expert, and brave-- @@ -49,9 +48,10 @@ Note that if you include any of the above flags, then you must also include a fl An example is given, as follows: $ python karoo_gp_server.py -ker c -typ r -bas 4 -fil /[path]/[to_your]/[filename].csv + ''' -import sys # sys.path.append('modules/') to add the directory 'modules' to the current path +import sys; sys.path.append('modules/') # to add the directory 'modules' to the current path import argparse import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP()