v1.0.8 bug fixes and updates

pull/13/head
Kai Staats 2018-02-27 16:50:27 -07:00
parent 0e96bc1fd1
commit db958d7472
7 changed files with 264 additions and 234 deletions

Binary file not shown.

View File

@ -2,7 +2,7 @@
The MIT License (MIT)
Copyright (c) 2015-2016 Kai Staats (www.kaistaats.com)
Copyright (c) 2015-2018 Kai Staats (www.kaistaats.com)
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the

View File

@ -1,9 +1,30 @@
2017 08/10
Relatively light updates this time.
2018 02/27
Updated the Python library versions and improved some explanation of Operators and Operands in the User Guide.
Iurii fixed a minor bug in which the MATCHING function (used only for demonstrations) would not find a match despite
the fact that the output was correct, due to the way in which TensorFlow was handling floating points and precision.
Iurii used numpy.allclose.html as a reference to resolve the situation.
2017 10/26
An upgrade from Tensorflow 1.1 to 1.3 caused the Classify kernel test to break. Fixed by Iurii by replacing [] with ()
in the 'fx_fitness_eval' method.
2017 10/17
To be consistent with the anticipated Machine Learning vocabulary, replaced the term 'label' with 'pred_labels'
(predicted labels) in the TF graph methods.
2017 08/10
Iurii fixed a minor bug in which the MATCHING function (used only for demonstrations) would not find a match despite
the output being (apparently) correct. This was discovered to be due to the way in which TensorFlow was handling
floating points and precision, as follows:
[ -0.29999995 0.69999981 13.69999981 16.70000076 19.70000076
22.70000076 25.70000076 28.70000076 31.70000076 34.70000076]
[ -0.30000001 0.69999999 13.69999981 16.70000076 19.70000076
22.70000076 25.70000076 28.70000076 31.70000076 34.70000076]
As you can see, the values are close, but not equal and so a "match" was not resolved. Iurii used numpy.allclose.html
as a reference to resolve the situation.
I also modified the autosave to the runs/ directory such that if you are using an external dataset (quite likely), the
new directory (for each run) will be saved as [filename]-[date_time_stamp]/ The idea (thank you Marco) is to help keep

View File

@ -1,11 +1,11 @@
a,b,c,s
0,1,2,3
1,2,3,6
2,3,4,9
3,4,5,12
4,5,6,15
5,6,7,18
6,7,8,21
7,8,9,24
8,9,10,27
9,10,11,30
0,1,2,3.0
1,2,3,6.0
2,3,4,9.0
3,4,5,12.0
4,5,6,15.0
5,6,7,18.0
6,7,8,21.0
7,8,9,24.0
8,9,10,27.0
9,10,11,30.0

1 a b c s
2 0 1 2 3 3.0
3 1 2 3 6 6.0
4 2 3 4 9 9.0
5 3 4 5 12 12.0
6 4 5 6 15 15.0
7 5 6 7 18 18.0
8 6 7 8 21 21.0
9 7 8 9 24 24.0
10 8 9 10 27 27.0
11 9 10 11 30 30.0

View File

@ -2,7 +2,7 @@
# Define the methods and global variables used by Karoo GP
# by Kai Staats, MSc; see LICENSE.md
# Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov
# version 1.0.5
# version 1.0.8
'''
A NOTE TO THE NEWBIE, EXPERT, AND BRAVE
@ -24,40 +24,45 @@ from sympy import sympify
from datetime import datetime
from collections import OrderedDict
# TensorFlow-related imports
# np.random.seed(1000) # for reproducibility
### TensorFlow Imports and Definitions ###
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf
import ast
import operator as op
operators = {ast.Add: tf.add, # e.g., a + b
ast.Sub: tf.subtract, # e.g., a - b
ast.Mult: tf.multiply, # e.g., a * b
ast.Div: tf.divide, # e.g., a / b
ast.Pow: tf.pow, # e.g., a ** 2
ast.USub: tf.negative, # e.g., -a
ast.And: tf.logical_and, # e.g., a and b
ast.Or: tf.logical_or, # e.g., a or b
ast.Not: tf.logical_not, # e.g., not a
ast.Eq: tf.equal, # e.g., a == b
ast.NotEq: tf.not_equal, # e.g., a != b
ast.Lt: tf.less, # e.g., a < b
ast.LtE: tf.less_equal, # e.g., a <= b
ast.Gt: tf.greater, # e.g., a > b
ast.GtE: tf.greater_equal, # e.g., a >= 1
'abs': tf.abs, # e.g., abs(a)
'sign': tf.sign, # e.g., sign(a)
'square': tf.square, # e.g., square(a)
'sqrt': tf.sqrt, # e.g., sqrt(a)
'pow': tf.pow, # e.g., pow(a, b)
'log': tf.log, # e.g., log(a)
'log1p': tf.log1p, # e.g., log1p(a)
'cos': tf.cos, # e.g., cos(a)
'sin': tf.sin, # e.g., sin(a)
'tan': tf.tan, # e.g., tan(a)
'acos': tf.acos, # e.g., acos(a)
'asin': tf.asin, # e.g., asin(a)
'atan': tf.atan, # e.g., atan(a)
}
ast.Sub: tf.subtract, # e.g., a - b
ast.Mult: tf.multiply, # e.g., a * b
ast.Div: tf.divide, # e.g., a / b
ast.Pow: tf.pow, # e.g., a ** 2
ast.USub: tf.negative, # e.g., -a
ast.And: tf.logical_and, # e.g., a and b
ast.Or: tf.logical_or, # e.g., a or b
ast.Not: tf.logical_not, # e.g., not a
ast.Eq: tf.equal, # e.g., a == b
ast.NotEq: tf.not_equal, # e.g., a != b
ast.Lt: tf.less, # e.g., a < b
ast.LtE: tf.less_equal, # e.g., a <= b
ast.Gt: tf.greater, # e.g., a > b
ast.GtE: tf.greater_equal, # e.g., a >= 1
'abs': tf.abs, # e.g., abs(a)
'sign': tf.sign, # e.g., sign(a)
'square': tf.square, # e.g., square(a)
'sqrt': tf.sqrt, # e.g., sqrt(a)
'pow': tf.pow, # e.g., pow(a, b)
'log': tf.log, # e.g., log(a)
'log1p': tf.log1p, # e.g., log1p(a)
'cos': tf.cos, # e.g., cos(a)
'sin': tf.sin, # e.g., sin(a)
'tan': tf.tan, # e.g., tan(a)
'acos': tf.acos, # e.g., acos(a)
'asin': tf.asin, # e.g., asin(a)
'atan': tf.atan, # e.g., atan(a)
}
np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters before line-wrapping in order to view Trees
@ -65,10 +70,9 @@ np.set_printoptions(linewidth = 320) # set the terminal to print 320 characters
class Base_GP(object):
'''
This Base_GP class contains all methods for Karoo GP.
Method names are differentiated from global variable names (defined below) by the prefix 'fx_' followed by an object
and action, as in 'fx_display_tree()', with a few exceptions, such as 'fx_fitness_gene_pool'.
This Base_GP class contains all methods for Karoo GP. Method names are differentiated from global variable names
(defined below) by the prefix 'fx_' followed by an object and action, as in 'fx_display_tree()', with a few
exceptions, such as 'fx_fitness_gene_pool'.
The categories (denoted by +++ banners +++) are as follows:
'karoo_gp' A single method which conducts an entire run. Employed only by karoo_gp_server.py
@ -80,85 +84,72 @@ class Base_GP(object):
'fx_display_' Methods to Display a Tree
'fx_archive_' Methods to Archive
There are no sub-classes at the time of this edit - 2015 09/21
### Global variables used for data management ###
'gp.data_train' store train data for processing in TF
'gp.data_test' store test data for processing in TF
'gp.tf_device' set TF computation backend device (CPU or GPU)
'gp.tf_device_log' employed for TensorFlow debugging
'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below)
'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below)
'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below)
'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below)
'gp.functions' user defined functions (operators) from the associated files/[functions].csv
'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv
'gp.coeff' user defined coefficients (NOT YET IN USE)
'gp.fitness_type' fitness type
'gp.datetime' date-time stamp of when the unique directory is created
'gp.path' full path to the unique directory created with each run
'gp.dataset' local path and dataset filename
### Global variables initiated and/or used by Sympy ###
'gp.algo_raw' a Sympy string which represents a flattened tree
'gp.algo_sym' a Sympy executable version of algo_raw
'gp.fittest_dict' a dictionary of the most fit trees, compiled during fitness function execution
### Global variables used for evolutionary management ###
'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction
'gp.population_b' the generation constructed from gp.population_a (recycled)
'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions
'gp.generation_id' simple n + 1 increment
'gp.fitness_type' set in 'fx_karoo_data_load' as either a minimising or maximising function
'gp.tree' axis-1, 13 element Numpy array that defines each Tree, stored in 'gp.population'
'gp.pop_*' 13 variables that define each Tree (see 'fx_gen_tree_initialise')
### Error checks ###
You can quickly locate all error checks by searching for 'ERROR!' in this and all classes.
'''
#++++++++++++++++++++++++++++++++++++++++++
# Define Global Variables |
#++++++++++++++++++++++++++++++++++++++++++
def __init__(self):
'''
All Karoo GP global variables are named with the prefix 'gp.' The 13 variables which begin with 'gp.pop_' are
specifically employed to define the 13 parameters for each tree as stored in the axis-1 (expand horizontally)
'gp.population' Numpy array.
### Global variables instantiated in karoo_gp_main.py and karoo_gp_server.py ###
self.kernel = '' # fitness function
self.tree_depth_max = 0 # maximum Tree depth for the entire run; limits bloat
self.tree_depth_min = 0 # minimum number of nodes
self.tree_pop_max = 0 # maximum number of Trees per generation
self.generation_max = 0 # maximum number of generations
self.tourn_size = 0 # number of Trees selected for each tournament
### Global and local variables defined by the user in karoo_gp_main.py (in order of appearance) ###
'gp.kernel' fitness function
'gp.class_method' select the number of classes (will be automated in future version)
'tree_type' Full, Grow, or Ramped 50/50 (local variable)
'gp.tree_depth_min' minimum number of nodes
'tree_depth_base' maximum Tree depth for the initial population, where nodes = 2^(depth + 1) - 1
'gp.tree_depth_max' maximum Tree depth for the entire run; introduces potential bloat
'gp.tree_pop_max' maximum number of Trees per generation
'gp.generation_max' maximum number of generations
'gp.display' level of on-screen feedback
self.evolve_repro = 0 # quantity of a population generated through Reproduction
self.evolve_point = 0 # quantity of a population generated through Point Mutation
self.evolve_branch = 0 # quantity of a population generated through Branch Mutation
self.evolve_cross = 0 # quantity of a population generated through Crossover
'gp.evolve_repro' quantity of a population generated through Reproduction
'gp.evolve_point' quantity of a population generated through Point Mutation
'gp.evolve_branch' quantity of a population generated through Branch Mutation
'gp.evolve_cross' quantity of a population generated through Crossover
self.display = '' # display mode is set to (s)ilent # level of on-screen feedback
self.precision = 0 # the number of floating points for the round function in 'fx_fitness_eval'
'gp.tourn_size' the number of Trees chosen for each tournament
'gp.precision' the number of floating points for all applications of the round function
# self.karoo_gp(tree_type, tree_depth_base, filename) # used by karoo_gp_server.py to launch an entire run
### Global variables used for data management ###
'gp.data_train' store train data for processing in TF
'gp.data_test' store test data for processing in TF
'gp.tf_device' set TF computation backend device (CPU or GPU)
'gp.tf_device_log' employed for TensorFlow debugging
'gp.data_train_cols' number of cols in the TRAINING data (see 'fx_karoo_data_load', below)
'gp.data_train_rows' number of rows in the TRAINING data (see 'fx_karoo_data_load', below)
'gp.data_test_cols' number of cols in the TEST data (see 'fx_karoo_data_load', below)
'gp.data_test_rows' number of rows in the TEST data (see 'fx_karoo_data_load', below)
### Global variables instantiated in the classes ###
self.algo_raw = [] # the raw expression generated by Sympy per Tree -- CONSIDER MAKING THIS VARIABLE LOCAL
self.algo_sym = [] # the expression generated by Sympy per Tree -- CONSIDER MAKING THIS VARIABLE LOCAL
self.fittest_dict = {} # all Trees which share the best fitness score
'gp.functions' user defined functions (operators) from the associated files/[functions].csv
'gp.terminals' user defined variables (operands) from the top row of the associated [data].csv
'gp.coeff' user defined coefficients (NOT YET IN USE)
'gp.fitness_type' fitness type
'gp.datetime' date-time stamp of when the unique directory is created
'gp.path' full path to the unique directory created with each run
'gp.dataset' local path and dataset filename
### Global variables initiated and/or used by Sympy ###
'gp.algo_raw' a Sympy string which represents a flattened tree
'gp.algo_sym' a Sympy executable version of algo_raw
'gp.fittest_dict' a dictionary of the most fit trees, compiled during fitness function execution
### Variables used for evolutionary management ###
'gp.population_a' the root generation from which Trees are chosen for mutation and reproduction
'gp.population_b' the generation constructed from gp.population_a (recycled)
'gp.gene_pool' once-per-generation assessment of trees that meet min and max boundary conditions
'gp.generation_id' simple n + 1 increment
'gp.fitness_type' set in 'fx_karoo_data_load' as either a minimising or maximising function
'gp.tree' axis-1, 13 element Numpy array that defines each Tree, stored in 'gp.population'
'gp.pop_*' 13 elements which define each Tree (see 'fx_gen_tree_initialise' below)
### Fishing nets ###
You can insert a "fishing net" to search for a specific expression when you fear the evolutionary process or
something in the code may not be working. Search for "fishing net" and follow the directions.
### Error checks ###
You can quickly find all places in which error checks have been inserted by searching for "ERROR!"
'''
self.algo_raw = [] # temp store the raw expression -- CONSIDER MAKING THIS VARIABLE LOCAL
self.algo_sym = [] # temp store the sympified expression-- CONSIDER MAKING THIS VARIABLE LOCAL
self.fittest_dict = {} # temp store all Trees which share the best fitness score
self.gene_pool = [] # temp store all Tree IDs for use by Tournament
self.class_labels = 0 # temp set a variable which will be assigned the number of class labels (data_y)
self.gene_pool = [] # store all Tree IDs for use by Tournament
self.class_labels = 0 # the number of true class labels (data_y)
return
@ -181,7 +172,7 @@ class Base_GP(object):
start = time.time() # start the clock for the timer
# construct first generation of Trees
self.fx_karoo_data_load(tree_type, tree_depth_base, filename)
self.fx_karoo_data_load(filename)
self.generation_id = 1 # set initial generation ID
self.population_a = ['Karoo GP by Kai Staats, Generation ' + str(self.generation_id)] # list to store all Tree arrays, one generation at a time
self.fx_karoo_construct(tree_type, tree_depth_base) # construct the first population of Trees
@ -228,21 +219,21 @@ class Base_GP(object):
os.system('clear')
print '\n\033[36m\033[1m'
print '\t ** ** ****** ***** ****** ****** ****** ******'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t **** ******** ****** ** ** ** ** ** *** ******'
print '\t ** ** ****** ***** ****** ****** ****** ******'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t **** ******** ****** ** ** ** ** ** *** *******'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ** ** ** ** ** ** **'
print '\t ** ** ** ** ** ** ****** ****** ****** **'
print '\033[0;0m'
print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 1.0\033[0;0m'
print '\t\033[36m Genetic Programming in Python - by Kai Staats, version 1.0.7\033[0;0m'
return
def fx_karoo_data_load(self, tree_type, tree_depth_base, filename):
def fx_karoo_data_load(self, filename):
'''
The data and function .csv files are loaded according to the fitness function kernel selected by the user. An
@ -251,33 +242,33 @@ class Base_GP(object):
10 rows will not be split, rather copied in full to both TRAINING and TEST as it is assumed you are conducting
a system validation run, as with the built-in MATCH kernel and associated dataset.
Arguments required: tree_type, tree_depth_base, filename (of the dataset)
Arguments required: filename (of the dataset)
'''
### 1) load the associated data set, operators, operands, fitness type, and coefficients ###
full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) # Good idea Marco :)
# cwd = os.getcwd()
# full_path = os.path.realpath(__file__); cwd = os.path.dirname(full_path) # Good idea Marco :)
cwd = os.getcwd()
data_dict = {'c':cwd + '/files/data_CLASSIFY.csv', 'r':cwd + '/files/data_REGRESS.csv', 'm':cwd + '/files/data_MATCH.csv', 'p':cwd + '/files/data_PLAY.csv'}
if len(sys.argv) == 1: # load data from the default karoo_gp/files/ directory
data_x = np.loadtxt(data_dict[self.kernel], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(data_dict[self.kernel], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
header = open(data_dict[self.kernel],'r')
self.dataset = data_dict[self.kernel]
header = open(data_dict[self.kernel],'r') # read only the top row of parameters
self.dataset = data_dict[self.kernel] # copy the name only
elif len(sys.argv) == 2: # load an external data file
data_x = np.loadtxt(sys.argv[1], skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(sys.argv[1], skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
header = open(sys.argv[1],'r')
self.dataset = sys.argv[1]
header = open(sys.argv[1],'r') # read only the top row of parameters
self.dataset = sys.argv[1] # copy the name only
elif len(sys.argv) > 2: # receive filename and additional flags from karoo_gp_server.py via argparse
elif len(sys.argv) > 2: # receive filename and additional arguments from karoo_gp_server.py via argparse
data_x = np.loadtxt(filename, skiprows = 1, delimiter = ',', dtype = float); data_x = data_x[:,0:-1] # load all but the right-most column
data_y = np.loadtxt(filename, skiprows = 1, usecols = (-1,), delimiter = ',', dtype = float) # load only right-most column (class labels)
header = open(filename,'r')
self.dataset = filename
header = open(filename,'r') # read only the top row of parameters
self.dataset = filename # copy the name only
fitt_dict = {'c':'max', 'r':'min', 'm':'max', 'p':''}
self.fitness_type = fitt_dict[self.kernel] # load fitness type
@ -285,7 +276,7 @@ class Base_GP(object):
func_dict = {'c':cwd + '/files/operators_CLASSIFY.csv', 'r':cwd + '/files/operators_REGRESS.csv', 'm':cwd + '/files/operators_MATCH.csv', 'p':cwd + '/files/operators_PLAY.csv'}
self.functions = np.loadtxt(func_dict[self.kernel], delimiter=',', skiprows=1, dtype = str) # load the user defined functions (operators)
self.terminals = header.readline().split(','); self.terminals[-1] = self.terminals[-1].replace('\n','') # load the user defined terminals (operands)
self.class_labels = len(np.unique(data_y)) # load the user defined labels for classification or solutions for regression
self.class_labels = len(np.unique(data_y)) # load the user defined true labels for classification or solutions for regression
#self.coeff = np.loadtxt(cwd + '/files/coefficients.csv', delimiter=',', skiprows=1, dtype = str) # load the user defined coefficients - NOT USED YET
@ -299,10 +290,10 @@ class Base_GP(object):
x_train, x_test, y_train, y_test = skcv.train_test_split(data_x, data_y, test_size = 0.2) # 80/20 TRAIN/TEST split
data_x, data_y = [], [] # clear from memory
data_train = np.c_[x_train, y_train] # recombine each row of data with its associated label (right column)
data_train = np.c_[x_train, y_train] # recombine each row of data with its associated class label (right column)
x_train, y_train = [], [] # clear from memory
data_test = np.c_[x_test, y_test] # recombine each row of data with its associated label (right column)
data_test = np.c_[x_test, y_test] # recombine each row of data with its associated class label (right column)
x_test, y_test = [], [] # clear from memory
self.data_train_cols = len(data_train[0,:]) # qty count
@ -315,33 +306,30 @@ class Base_GP(object):
self.data_train = data_train # Store train data for processing in TF
self.data_test = data_test # Store test data for processing in TF
self.tf_device = "/gpu:0" # Set TF computation backend device (CPU or GPU)
self.tf_device = "/gpu:0" # Set TF computation backend device (CPU or GPU); gpu:n = 1st, 2nd, or ... GPU device
self.tf_device_log = False # TF device usage logging (for debugging)
### 4) create a unique directory and initialise all .csv files ###
# self.datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
self.datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
self.path = os.path.join(cwd, 'runs/', filename.split('.')[0] + '_' + self.datetime) # generate a unique directory name
# self.path = os.path.join(cwd, 'runs/', self.datetime) # generate a unique directory name
self.path = os.path.join(cwd, 'runs/', filename.split('.')[0] + '_' + self.datetime + '/') # generate a unique directory name
if not os.path.isdir(self.path): os.makedirs(self.path) # make a unique directory
self.filename = {} # a dictionary to hold .csv filenames
self.filename.update( {'a':self.path + '/population_a.csv'} )
self.filename.update( {'a':self.path + 'population_a.csv'} )
target = open(self.filename['a'], 'w') # initialise the .csv file for population 'a' (foundation)
target.close()
self.filename.update( {'b':self.path + '/population_b.csv'} )
self.filename.update( {'b':self.path + 'population_b.csv'} )
target = open(self.filename['b'], 'w') # initialise the .csv file for population 'b' (evolving)
target.close()
self.filename.update( {'f':self.path + '/population_f.csv'} )
self.filename.update( {'f':self.path + 'population_f.csv'} )
target = open(self.filename['f'], 'w') # initialise the .csv file for the final population (test)
target.close()
self.filename.update( {'s':self.path + '/population_s.csv'} )
self.filename.update( {'s':self.path + 'population_s.csv'} )
# do NOT initialise this .csv file, as it is retained for loading a previous run (recover)
return
@ -619,9 +607,10 @@ class Base_GP(object):
while True:
try:
print '\n\t The current tournament size is:', self.tourn_size
query = int(raw_input('\t Adjust the tournament size (suggest 10): '))
if query not in menu: raise ValueError()
self.tourn_size = query; break
query = raw_input('\t Adjust the tournament size (suggest 10): ')
if query not in str(menu) or query == '0' or query == '1': raise ValueError() # not ideal 20170918
elif query == '': break
self.tourn_size = int(query); break
except ValueError: print '\n\t\033[32m Enter a number from 2 including', str(self.tree_pop_max) + ".", 'Try again ...\033[0;0m'
@ -630,9 +619,10 @@ class Base_GP(object):
while True:
try:
print '\n\t The current minimum number of nodes is:', self.tree_depth_min
query = int(raw_input('\t Adjust the minimum number of nodes for all Trees (min 3): '))
if query not in menu: raise ValueError()
self.tree_depth_min = query; break
query = raw_input('\t Adjust the minimum number of nodes for all Trees (min 3): ')
if query not in str(menu) or query == '0' or query == '1' or query == '2': raise ValueError() # not ideal 20170918
elif query == '': break
self.tree_depth_min = int(query); break
except ValueError: print '\n\t\033[32m Enter a number from 3 including 1000. Try again ...\033[0;0m'
@ -643,8 +633,8 @@ class Base_GP(object):
# while True:
# try:
# print '\n\t The current \033[3madjusted\033[0;0m maximum Tree depth is:', self.tree_depth_max
# query = int(raw_input('\n\t Adjust the global maximum Tree depth to (1 ... 10): '))
# if query not in menu: raise ValueError()
# query = raw_input('\n\t Adjust the global maximum Tree depth to (1 ... 10): ')
# if query not in str(menu): raise ValueError()
# if query < self.tree_depth_max:
# print '\n\t\033[32m This value is less than the current value.\033[0;0m'
# conf = raw_input('\n\t Are you ok with this? (y/n) ')
@ -666,7 +656,7 @@ class Base_GP(object):
query = raw_input('\t Enter quantity of Trees to be generated by Reproduction: ')
if query not in str(menu): raise ValueError()
elif query == '': break
tmp_repro = int(float(query)); break
tmp_repro = int(query); break # replaced int(float(query)) 20170918
except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m'
while True:
@ -674,7 +664,7 @@ class Base_GP(object):
query = raw_input('\t Enter quantity of Trees to be generated by Point Mutation: ')
if query not in str(menu): raise ValueError()
elif query == '': break
tmp_point = int(float(query)); break
tmp_point = int(query); break # replaced int(float(query)) 20170918
except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m'
while True:
@ -682,7 +672,7 @@ class Base_GP(object):
query = raw_input('\t Enter quantity of Trees to be generated by Branch Mutation: ')
if query not in str(menu): raise ValueError()
elif query == '': break
tmp_branch = int(float(query)); break
tmp_branch = int(query); break # replaced int(float(query)) 20170918
except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m'
while True:
@ -690,7 +680,7 @@ class Base_GP(object):
query = raw_input('\t Enter quantity of Trees to be generated by Crossover: ')
if query not in str(menu): raise ValueError()
elif query == '': break
tmp_cross = int(float(query)); break
tmp_cross = int(query); break # replaced int(float(query)) 20170918
except ValueError: print '\n\t\033[32m Enter a number from 0 including 1000. Try again ...\033[0;0m'
if tmp_repro + tmp_point + tmp_branch + tmp_cross != self.tree_pop_max: print '\n\t The sum of the above does not equal', self.tree_pop_max, 'Try again ...'
@ -720,7 +710,7 @@ class Base_GP(object):
# get simplified expression and process it by TF - tested 2017 02/02
expr = str(self.algo_sym) # might change this to algo_raw for more correct expression evaluation
result = self.fx_fitness_eval(expr, self.data_test, get_labels=True)
result = self.fx_fitness_eval(expr, self.data_test, get_pred_labels = True)
print '\n\t\033[36mTree', query, 'yields (raw):', self.algo_raw, '\033[0;0m'
print '\t\033[36mTree', query, 'yields (sym):\033[1m', self.algo_sym, '\033[0;0m\n'
@ -814,14 +804,14 @@ class Base_GP(object):
elif pause == 'q':
if eol == 0: # if the GP run is not at the final generation
query = raw_input('\n\t \033[32mThe current population_b will be lost!\033[0;0m\n\n\t Are you certain you want to quit? (y/n)')
query = raw_input('\n\t \033[32mThe current population_b will be lost!\033[0;0m\n\n\t Are you certain you want to quit? (y/n) ')
if query == 'y':
self.fx_archive_params_write('Desktop') # save run-time parameters to disk
sys.exit() # quit the script without saving population_b
else: break
else: # if the GP run is complete
query = raw_input('\n\t Are you certain you want to quit? (y/n)')
query = raw_input('\n\t Are you certain you want to quit? (y/n) ')
if query == 'y':
print '\n\t \033[32mYour Trees and runtime parameters are archived in karoo_gp/runs/\033[0;0m'
self.fx_archive_params_write('Desktop') # save run-time parameters to disk
@ -1244,7 +1234,7 @@ class Base_GP(object):
else:
if tree[8, node_id] == '1': # arity of 1 for the explicit pattern 'not [term]'
return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id] # original code
return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id]
elif tree[8, node_id] == '2': # arity of 2 for the pattern '[func] [term] [func]'
return self.fx_eval_label(tree, tree[9, node_id]) + tree[6, node_id] + self.fx_eval_label(tree, tree[10, node_id])
@ -1383,32 +1373,32 @@ class Base_GP(object):
return
def fx_fitness_eval(self, expr, data, get_labels = False):
def fx_fitness_eval(self, expr, data, get_pred_labels = False):
'''
Computes tree expression using TensorFlow (TF) returning results and fitness scores.
This method orchestrates most of the TF routines by parsing input string expression and converting it into TF
operation graph which then is processed in an isolated TF session to compute the results and corresponding fitness
values.
This method orchestrates most of the TF routines by parsing input string 'expression' and converting it into a TF
operation graph which is then processed in an isolated TF session to compute the results and corresponding fitness
values.
'self.tf_device' - controls which device will be used for computations (CPU or GPU).
'self.tf_device_log' - controls device placement logging (debug only).
Args:
'expr' - a string containing math expression to be computed on the data. Variable names should match corresponding
terminal names in 'self.terminals'. Only algebraic operations are currently supported (+, -, *, /, **).
terminal names in 'self.terminals'.
'data' - an 'n by m' matrix of the data points containing n observations and m features each. Variable order should
match corresponding order of terminals in 'self.terminals'.
'get_labels' - a boolean flag which controls whether classification labels should be extracted from the results.
This is applied only to the CLASSIFY kernel and defaults to 'False'.
'data' - an 'n by m' matrix of the data points containing n observations and m features per observation.
Variable order should match corresponding order of terminals in 'self.terminals'.
'get_pred_labels' - a boolean flag which controls whether the predicted labels should be extracted from the
evolved results. This applies only to the CLASSIFY kernel and defaults to 'False'.
Returns:
A dict mapping keys to the following outputs:
'result' - an array of the results of applying given expression to the data
'labels' - an array of the labels extracted from the results; defined only for CLASSIFY kernel, None otherwise
'pred_labels' - an array of the predicted labels extracted from the results; defined only for CLASSIFY kernel, else None
'solution' - an array of the solution values extracted from the data (variable 's' in the dataset)
'pairwise_fitness' - an array of the element-wise results of applying corresponding fitness kernel function
'fitness' - aggregated scalar fitness score
@ -1424,20 +1414,19 @@ class Base_GP(object):
with tf.Session(config=config) as sess:
with sess.graph.device(self.tf_device):
# 1 - Load data into TF
# 1 - Load data into TF vectors
tensors = {}
for i in range(len(self.terminals)):
var = self.terminals[i]
tensors[var] = tf.constant(data[:, i], dtype=tf.float32)
tensors[var] = tf.constant(data[:, i], dtype=tf.float32) # converts data into vectors
# 2- Transform string expression into TF operation graph
result = self.fx_fitness_expr_parse(expr, tensors)
labels = tf.no_op() # a placeholder, applies only to CLASSIFY kernel
pred_labels = tf.no_op() # a placeholder, applies only to CLASSIFY kernel
solution = tensors['s'] # solution value is assumed to be stored in 's' terminal
# 3- Add fitness computation into TF graph
if self.kernel == 'c': # CLASSIFY kernels
if self.kernel == 'c': # CLASSIFY kernel
'''
Creates element-wise fitness computation TensorFlow (TF) sub-graph for CLASSIFY kernel.
@ -1445,22 +1434,24 @@ class Base_GP(object):
This method uses the 'sympified' (SymPy) expression ('algo_sym') created in 'fx_eval_poly' and the data set
loaded at run-time to evaluate the fitness of the selected kernel.
This multiclass classifier compares each row of a given Tree to the known solution, comparing estimated values
(labels) generated by Karoo GP against the correct labels. This method is able to work with any number of
class labels, from 2 to n. The left-most bin includes -inf. The right-most bin includes +inf. Those in between
are by default confined to the spacing of 1.0 each, as defined by:
This multiclass classifier compares each row of a given Tree to the known solution, comparing predicted labels
generated by Karoo GP against the true class labels. This method is able to work with any number of class
labels, from 2 to n. The left-most bin includes -inf. The right-most bin includes +inf. Those in between are
by default confined to the spacing of 1.0 each, as defined by:
(solution - 1) < result <= solution
The skew adjusts the boundaries of the bins such that they fall on both the negative and positive sides of the
origin. At the time of this writing, an odd number of class labels will generate an extra bin on the positive
side of origin as it has not yet been determined the effect of enabling the middle bin to include both a
negative and positive space.
negative and positive result.
Arguments required: result, solution
Arguments required: result, solution
'''
if get_labels: labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype=[tf.int32, tf.string], swap_memory=True)
# was breaking with upgrade from Tensorflow 1.1 to 1.3; fixed by Iurii by replacing [] with () as of 20171026
# if get_pred_labels: pred_labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype = [tf.int32, tf.string], swap_memory = True)
if get_pred_labels: pred_labels = tf.map_fn(self.fx_fitness_labels_map, result, dtype = (tf.int32, tf.string), swap_memory = True)
skew = (self.class_labels / 2) - 1
@ -1478,12 +1469,26 @@ class Base_GP(object):
pairwise_fitness = tf.cast(tf.logical_or(tf.logical_or(rule13, rule23), rule33), tf.int32)
elif self.kernel == 'r': # REGRESSION kernel
'''
A very, very basic REGRESSION kernel which is not designed to perform well in the real world. It requires
that you raise the minimum node count to keep it from converging on the value of '1'. Consider writing or
integrating a more sophisticated kernel.
'''
pairwise_fitness = tf.abs(solution - result)
elif self.kernel == 'm': # MATCH kernel
'''
This is used for demonstration purposes only.
'''
# pairwise_fitness = tf.cast(tf.equal(solution, result), tf.int32) # breaks due to floating points
RTOL, ATOL = 1e-05, 1e-08
RTOL, ATOL = 1e-05, 1e-08 # fixes above issue by checking if a float value lies within a range of values
pairwise_fitness = tf.cast(tf.less_equal(tf.abs(solution - result), ATOL + RTOL * tf.abs(result)), tf.int32)
# elif self.kernel == '[other]': # [OTHER] kernel
@ -1494,9 +1499,9 @@ class Base_GP(object):
fitness = tf.reduce_sum(pairwise_fitness)
# Process TF graph and collect the results
result, labels, solution, fitness, pairwise_fitness = sess.run([result, labels, solution, fitness, pairwise_fitness])
result, pred_labels, solution, fitness, pairwise_fitness = sess.run([result, pred_labels, solution, fitness, pairwise_fitness])
return {'result': result, 'labels': labels, 'solution': solution, 'fitness': float(fitness), 'pairwise_fitness': pairwise_fitness}
return {'result': result, 'pred_labels': pred_labels, 'solution': solution, 'fitness': float(fitness), 'pairwise_fitness': pairwise_fitness}
def fx_fitness_expr_parse(self, expr, tensors):
@ -1579,10 +1584,11 @@ class Base_GP(object):
def fx_fitness_labels_map(self, result):
'''
Creates label extraction TensorFlow (TF) sub-graph for CLASSIFY kernel defined as a sequence of boolean conditions.
Outputs an array of tuples containing label extracted from the result and corresponding boolean condition triggered.
For the CLASSIFY kernel, creates a TensorFlow (TF) sub-graph defined as a sequence of boolean conditions based upon
the quantity of true class labels provided in the data .csv. Outputs an array of tuples containing the predicted
labels based upon the result and corresponding boolean condition triggered.
The original (pre-TensorFlow) code is as follows:
For comparison, the original (pre-TensorFlow) code follows:
skew = (self.class_labels / 2) - 1 # '-1' keeps a binary classification splitting over the origin
if solution == 0 and result <= 0 - skew; fitness = 1: # check for first class (the left-most bin)
@ -1600,9 +1606,9 @@ class Base_GP(object):
cond = (class_label - 1 - skew < result) & (result <= class_label - skew)
label_rules[class_label] = tf.cond(cond, lambda: (tf.constant(class_label), tf.constant(' <= {}'.format(class_label - skew))), lambda: label_rules[class_label + 1])
zero_rule = tf.cond(result <= 0 - skew, lambda: (tf.constant(0), tf.constant(' <= {}'.format(0 - skew))), lambda: label_rules[1])
pred_label = tf.cond(result <= 0 - skew, lambda: (tf.constant(0), tf.constant(' <= {}'.format(0 - skew))), lambda: label_rules[1])
return zero_rule
return pred_label
def fx_fitness_store(self, tree, fitness):
@ -1769,18 +1775,18 @@ class Base_GP(object):
harmonic mean of Precision and Recall (F1) = 2(P x R) / (P + R)
From scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
y_pred = result, the estimated target values (labels) generated by Karoo GP
y_true = solution, the correct target values (labels) associated with the data
y_pred = result, the predicted labels generated by Karoo GP
y_true = solution, the true labels associated with the data
Arguments required: result
'''
for i in range(len(result['result'])):
print '\t\033[36m Data row {} predicts class:\033[1m {} ({} True)\033[0;0m\033[36m as {:.2f}{}\033[0;0m'.format(i, int(result['labels'][0][i]), int(result['solution'][i]), result['result'][i], result['labels'][1][i])
print '\t\033[36m Data row {} predicts class:\033[1m {} ({} True)\033[0;0m\033[36m as {:.2f}{}\033[0;0m'.format(i, int(result['pred_labels'][0][i]), int(result['solution'][i]), result['result'][i], result['pred_labels'][1][i])
print '\n Fitness score: {}'.format(result['fitness'])
print '\n Precision-Recall report:\n', skm.classification_report(result['solution'], result['labels'][0])
print ' Confusion matrix:\n', skm.confusion_matrix(result['solution'], result['labels'][0])
print '\n Precision-Recall report:\n', skm.classification_report(result['solution'], result['pred_labels'][0])
print ' Confusion matrix:\n', skm.confusion_matrix(result['solution'], result['pred_labels'][0])
return
@ -2633,7 +2639,7 @@ class Base_GP(object):
Arguments required: none
'''
file = open(self.path + '/log_config.txt', 'w')
file = open(self.path + 'log_config.txt', 'w')
file.write('Karoo GP ' + app)
file.write('\n launched: ' + str(self.datetime))
file.write('\n dataset: ' + str(self.dataset))
@ -2658,7 +2664,7 @@ class Base_GP(object):
file.close()
file = open(self.path + '/log_test.txt', 'w')
file = open(self.path + 'log_test.txt', 'w')
file.write('Karoo GP ' + app)
file.write('\n launched: ' + str(self.datetime))
file.write('\n dataset: ' + str(self.dataset))
@ -2668,12 +2674,7 @@ class Base_GP(object):
fitness_best = 0
fittest_tree = 0
# original method, using pre-built fittest_dict
# file.write('\n The leading Trees and their associated expressions are:')
# for n in sorted(self.fittest_dict):
# file.write('\n\t ' + str(n) + ' : ' + str(self.fittest_dict[n]))
# revised method, re-evaluating all Trees from stored fitness score
for tree_id in range(1, len(self.population_b)):
@ -2698,19 +2699,19 @@ class Base_GP(object):
# print 'fitness_best:', fitness_best, 'fittest_tree:', fittest_tree
# test the most fit Tree and write to the .txt log
self.fx_eval_poly(self.population_b[int(fittest_tree)]) # generate the raw and sympified equation for the given Tree using SymPy
expr = str(self.algo_sym) # get simplified expression and process it by TF - tested 2017 02/02
result = self.fx_fitness_eval(expr, self.data_test, get_labels=True)
result = self.fx_fitness_eval(expr, self.data_test, get_pred_labels = True)
file.write('\n\n Tree ' + str(fittest_tree) + ' is the most fit, with expression:')
file.write('\n\n ' + str(self.algo_sym))
if self.kernel == 'c':
file.write('\n\n Classification fitness score: {}'.format(result['fitness']))
file.write('\n\n Precision-Recall report:\n {}'.format(skm.classification_report(result['solution'], result['labels'][0])))
file.write('\n Confusion matrix:\n {}'.format(skm.confusion_matrix(result['solution'], result['labels'][0])))
file.write('\n\n Precision-Recall report:\n {}'.format(skm.classification_report(result['solution'], result['pred_labels'][0])))
file.write('\n Confusion matrix:\n {}'.format(skm.confusion_matrix(result['solution'], result['pred_labels'][0])))
elif self.kernel == 'r':
MSE, fitness = skm.mean_squared_error(result['result'], result['solution']), result['fitness']

View File

@ -1,8 +1,7 @@
# Karoo GP Main (desktop)
# Use Genetic Programming for Classification and Symbolic Regression
# by Kai Staats, MSc; see LICENSE.md
# Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov
# version 1.0.5
# version 1.0.8
'''
A word to the newbie, expert, and brave--
@ -31,10 +30,12 @@ If you include the path to an external dataset, it will auto-load at launch:
$ python karoo_gp_main.py /[path]/[to_your]/[filename].csv
'''
import sys # sys.path.append('modules/') to add the directory 'modules' to the current path
import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP()
import sys; sys.path.append('modules/') # add directory 'modules' to the current path
import time
import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP()
#++++++++++++++++++++++++++++++++++++++++++
# User Defined Configuration |
#++++++++++++++++++++++++++++++++++++++++++
@ -87,12 +88,12 @@ while True:
try:
tree_depth_base = raw_input('\t Enter depth of the \033[3minitial\033[0;0m population of Trees (default 3): ')
if tree_depth_base not in str(menu) or tree_depth_base == '0': raise ValueError()
tree_depth_base = tree_depth_base or 3; tree_depth_base = int(tree_depth_base); break
elif tree_depth_base == '': tree_depth_base = 3; break
tree_depth_base = int(tree_depth_base); break
except ValueError: print '\t\033[32m Enter a number from 1 including 10. Try again ...\n\033[0;0m'
except KeyboardInterrupt: sys.exit()
if gp.kernel == 'p': # if the Play kernel is selected
gp.tree_depth_max = tree_depth_base
gp.tree_pop_max = 1
@ -106,11 +107,13 @@ else: # if any other kernel is selected
menu = range(tree_depth_base,11)
while True:
try:
gp.tree_depth_max = raw_input('\t Enter maximum Tree depth (default matches \033[3minitial\033[0;0m): ')
if gp.tree_depth_max not in str(menu) or gp.tree_depth_max == '0': raise ValueError()
gp.tree_depth_max = gp.tree_depth_max or tree_depth_base; gp.tree_depth_max = int(gp.tree_depth_max); break
# gp.tree_depth_max = int(gp.tree_depth_max) - tree_depth_base; break
except ValueError: print '\t\033[32m Enter a number >= the maximum Tree depth. Try again ...\n\033[0;0m'
gp.tree_depth_max = raw_input('\t Enter maximum Tree depth (default %i): ' %tree_depth_base)
if gp.tree_depth_max not in str(menu): raise ValueError()
elif gp.tree_depth_max == '': gp.tree_depth_max = tree_depth_base
gp.tree_depth_max = int(gp.tree_depth_max)
if gp.tree_depth_max < tree_depth_base: raise ValueError() # an ugly exception to the norm 20170918
else: break
except ValueError: print '\t\033[32m Enter a number >= the initial Tree depth. Try again ...\n\033[0;0m'
except KeyboardInterrupt: sys.exit()
menu = range(3,101)
@ -118,7 +121,8 @@ else: # if any other kernel is selected
try:
gp.tree_depth_min = raw_input('\t Enter minimum number of nodes for any given Tree (default 3): ')
if gp.tree_depth_min not in str(menu) or gp.tree_depth_min == '0': raise ValueError()
gp.tree_depth_min = gp.tree_depth_min or 3; gp.tree_depth_min = int(gp.tree_depth_min); break
elif gp.tree_depth_min == '': gp.tree_depth_min = 3
gp.tree_depth_min = int(gp.tree_depth_min); break
except ValueError: print '\t\033[32m Enter a number from 3 to 2^(depth + 1) - 1 including 100. Try again ...\n\033[0;0m'
except KeyboardInterrupt: sys.exit()
@ -127,7 +131,8 @@ else: # if any other kernel is selected
try:
gp.tree_pop_max = raw_input('\t Enter number of Trees in each population (default 100): ')
if gp.tree_pop_max not in str(menu) or gp.tree_pop_max == '0': raise ValueError()
gp.tree_pop_max = gp.tree_pop_max or 100; gp.tree_pop_max = int(gp.tree_pop_max); break
elif gp.tree_pop_max == '': gp.tree_pop_max = 100
gp.tree_pop_max = int(gp.tree_pop_max); break
except ValueError: print '\t\033[32m Enter a number from 10 including 1000. Try again ...\n\033[0;0m'
except KeyboardInterrupt: sys.exit()
@ -136,7 +141,8 @@ else: # if any other kernel is selected
try:
gp.generation_max = raw_input('\t Enter max number of generations (default 10): ')
if gp.generation_max not in str(menu) or gp.generation_max == '0': raise ValueError()
gp.generation_max = gp.generation_max or 10; gp.generation_max = int(gp.generation_max); break
elif gp.generation_max == '': gp.generation_max = 10
gp.generation_max = int(gp.generation_max); break
except ValueError: print '\t\033[32m Enter a number from 1 including 100. Try again ...\n\033[0;0m'
except KeyboardInterrupt: sys.exit()
@ -172,9 +178,11 @@ If the user has selected 'Play' mode, this is the only generation to be construc
'''
start = time.time() # start the clock for the timer
filename = '' # temp place holder
gp.fx_karoo_data_load(tree_type, tree_depth_base, filename)
gp.fx_karoo_data_load(filename)
gp.generation_id = 1 # set initial generation ID
gp.population_a = ['Karoo GP by Kai Staats, Generation ' + str(gp.generation_id)] # an empty list which will store all Tree arrays, one generation at a time

View File

@ -1,8 +1,7 @@
# Karoo GP Server
# Use Genetic Programming for Classification and Symbolic Regression
# by Kai Staats, MSc; see LICENSE.md
# Thanks to Emmanuel Dufourq and Arun Kumar for support during 2014-15 devel; TensorFlow support provided by Iurii Milovanov
# version 1.0.5
# version 1.0.8
'''
A word to the newbie, expert, and brave--
@ -49,9 +48,10 @@ Note that if you include any of the above flags, then you must also include a fl
An example is given, as follows:
$ python karoo_gp_server.py -ker c -typ r -bas 4 -fil /[path]/[to_your]/[filename].csv
'''
import sys # sys.path.append('modules/') to add the directory 'modules' to the current path
import sys; sys.path.append('modules/') # to add the directory 'modules' to the current path
import argparse
import karoo_gp_base_class; gp = karoo_gp_base_class.Base_GP()