diff --git a/forest.py b/forest.py
new file mode 100644
index 0000000..5cab1b0
--- /dev/null
+++ b/forest.py
@@ -0,0 +1,60 @@
+import random
+import operator
+from tree_bootstrapped import Tree
+
+
+class Forest(object):
+    def __init__(self, fields, dataset, size, tree_out=False, out=True):
+        self.fields = fields
+        self.dataset = dataset
+        self.size = size
+
+        self.trees = []
+        for i in range(size):
+            n = len(dataset)
+            bootstrap = [random.randrange(n) for j in range(n)]
+            tree = Tree(self.fields, self.dataset, bootstrap, (tree_out and out))
+            self.trees.append(tree)
+
+            if out:
+                print("\nPlanted tree {}".format(i))
+
+    def error_oob(self):
+        oob = []
+        for tree in self.trees:
+            oob.extend(tree.oob)
+
+        oob = set(oob)
+
+        successes = 0
+
+        for i in oob:
+            entry = self.dataset[i]
+            # Reset the vote tally for each out-of-bag entry.
+            votes = {}
+
+            for tree in self.trees:
+                if i not in tree.indices:
+                    predict = tree.classify(entry).predictions
+                    for key, value in predict.items():
+                        if key not in votes:
+                            votes[key] = predict[key]
+                        else:
+                            votes[key] += predict[key]
+            majority = max(votes.items(), key=operator.itemgetter(1))[0]
+            if majority in entry.label:
+                successes += 1
+
+        return 1-(float(successes)/float(len(oob)))
+
+    def predict(self, entry):
+        votes = {}
+        for tree in self.trees:
+            predict = tree.classify(entry).predictions
+            for key, value in predict.items():
+                if key not in votes:
+                    votes[key] = predict[key]
+                else:
+                    votes[key] += predict[key]
+        majority = max(votes.items(), key=operator.itemgetter(1))[0]
+        return majority
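
Note: a minimal sketch of the bagging scheme Forest relies on. Each tree trains on indices drawn with replacement, and the indices that are never drawn form that tree's out-of-bag set (the names below are illustrative, not part of the patch):

    import random

    def bootstrap_sample(n):
        # Draw n indices with replacement; the complement is the out-of-bag set.
        indices = [random.randrange(n) for _ in range(n)]
        oob = [i for i in range(n) if i not in indices]
        return indices, oob

    # Roughly a third of the entries are out-of-bag for any single tree,
    # which is what lets error_oob() score each entry only with trees
    # that never saw it during training.
    indices, oob = bootstrap_sample(10)
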
diff --git a/forest_tester.py b/forest_tester.py
new file mode 100644
index 0000000..df2e530
--- /dev/null
+++ b/forest_tester.py
@@ -0,0 +1,104 @@
+import os
+import random
+from timeit import default_timer as timer
+from star_reader import read_stars
+# from tree_bootstrapped import Tree
+from forest import Forest
+
+
+OUTPUT_FOLDER = "output/forest"
+
+
+def log(s, open_file):
+    print(s)
+    open_file.write(str(s) + '\n')
+
+
+if __name__ == '__main__':
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.mkdir(OUTPUT_FOLDER)
+
+    if not os.path.exists(OUTPUT_FOLDER + "/testing.txt"):
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'w',
+                      encoding="utf-8")
+    else:
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8")
+
+    dataset, fields = read_stars()
+
+    random.shuffle(dataset)
+
+    cutoff = 0.4
+    forest_size = 10
+
+    split = int(len(dataset) * cutoff)
+    training, testing = dataset[:split], dataset[split:]
+
+    log("\n----------\n", output)
+
+    """
+    log("\n-- TREE TRAINING --\n", output)
+
+    log("Training Tree...", output)
+    t_start = timer()
+
+    log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
+    log("Training set: {} entries.".format(len(training)), output)
+    log("Testing set: {} entries.".format(len(testing)), output)
+
+    tree = Tree(fields, training, [i for i in range(len(training))])
+
+    t_end = timer()
+    log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
+
+    log("\n-- TREE TEST --\n", output)
+
+    total_success = 0
+
+    for entry in testing:
+        success, predict = tree.predict(entry)
+        # print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
+        total_success += success
+
+    tested = len(testing)
+    s_rate = float(total_success)*100/float(tested)
+
+    log("\nTested {} entries.".format(tested), output)
+
+    log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
+    """
+
+    log("\n-- FOREST TRAINING --\n", output)
+
+    log("Training Forest...", output)
+    t_start = timer()
+
+    log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
+    log("Training set: {} entries.".format(len(training)), output)
+    log("Testing set: {} entries.".format(len(testing)), output)
+
+    forest = Forest(fields, training, forest_size)
+
+    log("\n-- FOREST TEST --\n", output)
+
+    total_success = 0
+
+    for entry in testing:
+        label = entry.label
+        majority = forest.predict(entry)
+        if majority in label:
+            # print("Actual: {}\tPredicted: {}".format(label, predict))
+            total_success += 1
+
+    tested = len(testing)
+    s_rate = float(total_success)*100/float(tested)
+
+    log("\nTested {} entries.".format(tested), output)
+
+    log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
+
+    error = forest.error_oob()
+
+    log("\nAverage error Out-of-Bag: {:.2f}%".format(error*100), output)
+
+    output.close()
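
Note: the forest test loop counts a prediction as a hit when the majority vote appears in the entry's (possibly multi-valued) label. A toy version of that accuracy computation, with made-up labels and votes:

    # Hypothetical true label sets and forest majority votes.
    labels = [{'G'}, {'K', 'M'}, {'A'}, {'F', 'G'}]
    majorities = ['G', 'K', 'B', 'G']

    hits = sum(1 for label, vote in zip(labels, majorities) if vote in label)
    accuracy = 100.0 * hits / len(labels)  # 75.0 for this toy data
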
diff --git a/question.py b/question.py
index de40f25..afb30ab 100644
--- a/question.py
+++ b/question.py
@@ -23,4 +23,4 @@ class Question(object):
         condition = self.numeric and ">" or "="
         field = self.fields[self.pos]
 
-        return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value)  # noqa
+        return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value)
diff --git a/star.py b/star.py
index cd132d5..ad07ede 100644
--- a/star.py
+++ b/star.py
@@ -12,5 +12,6 @@ class Star(object):
             classification = ' or '.join(self.label)
         else:
             classification = self.label
-        return ("Star {} {} of spectral type {}"
-                .format(self.name, self.data, classification))
+
+        s = "Star {} {} of spectral type {}"
+        return s.format(self.name, self.data, classification)
diff --git a/star_reader.py b/star_reader.py
index 2ffc947..1568504 100644
--- a/star_reader.py
+++ b/star_reader.py
@@ -32,11 +32,14 @@ def make_star(header, row, fields=None):
             return None
 
         type_list = value.split('/')
-        types = []
-        for star_type in type_list:
-            for sp_type in STAR_CLASSES:
-                if star_type and sp_type in star_type.upper():
-                    types.append(sp_type)
+
+        # Collect every spectral class letter that appears in any of the
+        # listed types, skipping empty entries (e.g. "G/K" yields G and K).
+        types = [sp_type
+                 for star_type in type_list
+                 for sp_type in STAR_CLASSES
+                 if star_type and sp_type in star_type.upper()]
+
         value = ''.join(set(types))
         if value == '':
             return None
@@ -69,7 +72,7 @@ def read_stars(fields=KEPT_DATA):
 
     t_end = timer()
 
-    print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start))  # noqa
+    print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start))
 
     return star_list, fields or header
 
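
Note: assuming STAR_CLASSES is a collection of single-letter spectral classes (something like 'O', 'B', 'A', 'F', 'G', 'K', 'M'; it is not shown in this patch), the rewritten comprehension behaves like this on a typical catalogue value:

    STAR_CLASSES = ['O', 'B', 'A', 'F', 'G', 'K', 'M']  # assumed, for illustration

    value = "G8/K0 III"
    type_list = value.split('/')  # ['G8', 'K0 III']
    types = [sp_type
             for star_type in type_list
             for sp_type in STAR_CLASSES
             if star_type and sp_type in star_type.upper()]
    print(''.join(set(types)))  # 'GK' (set order is not guaranteed)
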
diff --git a/tree.py b/tree.py
index 65eed4b..2c5cf59 100644
--- a/tree.py
+++ b/tree.py
@@ -1,4 +1,3 @@
-import random
 import multiprocessing as mp
 
 from question import Question
@@ -71,8 +70,7 @@ def find_best_split(fields, dataset, uncertainty=None):
             # Parallelize best split search
             cpus = mp.cpu_count()
             if i == 0:
-                print("-- Using {} CPUs to parallelize the split search."
-                      .format(cpus))
+                print("-- Using {} CPUs to parallelize the split search.".format(cpus))
             splits = []
             for value in values:
                 question = Question(fields, i, value)
@@ -80,13 +78,11 @@ def find_best_split(fields, dataset, uncertainty=None):
 
             chunk = max(int(len(splits)/(cpus*4)), 1)
             with mp.Pool(cpus) as p:
-                for split in p.imap_unordered(splitter, splits,
-                                              chunksize=chunk):
+                for split in p.imap_unordered(splitter, splits, chunksize=chunk):
                     if split is not None:
                         gain, question, branches = split
                         if gain > best_gain:
-                            best_gain, best_question, best_split = \
-                                gain, question, branches
+                            best_gain, best_question, best_split = gain, question, branches
         else:
             for value in values:
                 question = Question(fields, i, value)
@@ -126,7 +122,7 @@ class Node(object):
 
         print("Found a level {} split:".format(level))
         print(question)
-        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))  # noqa
+        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
 
         self.left_branch = Node(self.fields, left, level + 1)
         self.right_branch = Node(self.fields, right, level + 1)
@@ -144,11 +140,18 @@ class Node(object):
             return self.right_branch.classify(entry)
 
     def predict(self, entry):
-        predict = self.classify(entry).predictions
-        choices = []
-        for label, count in predict.items():
-            choices.extend([label]*count)
-        return random.choice(choices)
+        successes = []
+        predict = self.classify(entry).predictions.copy()
+        total = float(sum(predict.values()))
+        for key, value in predict.items():
+            predict[key] = float(predict[key]) / total
+
+        for label in entry.label:
+            if label in predict:
+                success = predict[label]
+                successes.append(success)
+
+        return sum(successes), predict
 
     def print(self, spacing=''):
         if self.is_leaf:
@@ -160,7 +163,7 @@ class Node(object):
                 probs[label] = "{:.2f}%".format(prob)
             return s + str(probs)
 
-        s = spacing + str(self.question) + '\n'
+        s = spacing + "(Gini: {:.2f}) ".format(self.gini) + str(self.question) + '\n'
         s += spacing + "├─ True:\n"
         s += self.left_branch.print(spacing + "│ ") + '\n'
         s += spacing + "└─ False:\n"
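
Note: the new predict() returns the probability mass the reached leaf assigns to the entry's true label(s) instead of sampling one label at random, so repeated evaluations give the same score. Worked through on a hypothetical leaf:

    predictions = {'G': 3, 'K': 1}  # label counts at the leaf the entry reaches
    total = float(sum(predictions.values()))
    probs = {key: count / total for key, count in predictions.items()}  # {'G': 0.75, 'K': 0.25}

    entry_labels = ['G']  # the entry's true label(s)
    success = sum(probs[label] for label in entry_labels if label in probs)  # 0.75
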
diff --git a/tree_bootstrapped.py b/tree_bootstrapped.py
index 27ac8fb..121c482 100644
--- a/tree_bootstrapped.py
+++ b/tree_bootstrapped.py
@@ -34,7 +34,7 @@ def gini(dataset, indices):
 
     impurity = 1
     for label in counts:
-        prob = counts[label] / float(len(dataset))
+        prob = counts[label] / float(len(indices))
         impurity -= prob**2
 
     return impurity
@@ -55,87 +55,89 @@ def splitter(info):
     return (gain, question, (matching, non_matching))
 
 
-def find_best_split(fields, dataset, indices, uncertainty=None):
-    print("Splitting {} entries.".format(len(dataset)))
-    best_gain, best_question, best_split = 0, None, None
-
-    uncertainty = uncertainty or gini(dataset)
-
-    columns = len(fields)
-
-    for i in range(columns):
-        values = unique_vals(dataset, indices, i)
-
-        if len(indices) > 400:
-            # Parallelize best split search
-            cpus = mp.cpu_count()
-            if i == 0:
-                print("-- Using {} CPUs to parallelize the split search."
-                      .format(cpus))
-            splits = []
-            for value in values:
-                question = Question(fields, i, value)
-                splits.append((question, dataset, indices, uncertainty))
-
-            chunk = max(int(len(splits)/(cpus*4)), 1)
-            with mp.Pool(cpus) as p:
-                for split in p.imap_unordered(splitter, splits,
-                                              chunksize=chunk):
-                    if split is not None:
-                        gain, question, branches = split
-                        if gain > best_gain:
-                            best_gain, best_question, best_split = \
-                                gain, question, branches
-        else:
-            for value in values:
-                question = Question(fields, i, value)
-
-                matching, non_matching = partition(dataset, indices, question)
-
-                if not matching or not non_matching:
-                    continue
-
-                gain = info_gain(dataset, matching, non_matching, uncertainty)
-
-                if gain > best_gain:
-                    best_gain, best_question = gain, question
-                    best_split = (matching, non_matching)
-
-    return best_gain, best_question, best_split
-
-
 class Node(object):
-    def __init__(self, fields, dataset, bootstrap, level=0):
+    def __init__(self, fields, dataset, bootstrap, level=0, out=True):
         self.fields = fields
         self.dataset = dataset
-        self.bootstrap = bootstrap
-        self.gini = gini(dataset, self.bootstrap)
-        self.build(level)
+        self.indices = bootstrap
+        self.out = out
+        self.gini = gini(dataset, self.indices)
+        self.build(level, out)
 
-    def build(self, level):
-        best_split = find_best_split(self.fields, self.dataset,
-                                     self.bootstrap, self.gini)
+    def build(self, level, out=True):
+        best_split = self.split(out)
        gain, question, branches = best_split
 
         if not branches:
             # Means we got 0 gain
-            print("Found a leaf at level {}".format(level))
-            self.predictions = count_labels(self.dataset, self.bootstrap)
+            if out:
+                print("Found a leaf at level {}".format(level))
+            self.predictions = count_labels(self.dataset, self.indices)
             self.is_leaf = True
             return
 
         left, right = branches
 
-        print("Found a level {} split:".format(level))
-        print(question)
-        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))  # noqa
+        if out:
+            print("Found a level {} split:".format(level))
+            print(question)
+            print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
 
-        self.left_branch = Node(self.fields, self.dataset, left, level + 1)
-        self.right_branch = Node(self.fields, self.dataset, right, level + 1)
+        self.left_branch = Node(self.fields, self.dataset, left, level + 1, out)
+        self.right_branch = Node(self.fields, self.dataset, right, level + 1, out)
 
         self.question = question
         self.is_leaf = False
         return
 
+    def split(self, out=True):
+        if out:
+            print("Splitting {} entries.".format(len(self.indices)))
+        best_gain, best_question, best_split = 0, None, None
+
+        uncertainty = self.gini or gini(self.dataset, self.indices)
+
+        cpus = mp.cpu_count()
+        columns = len(self.fields)
+
+        parallelize = len(self.indices) > 1000
+
+        if parallelize and out:
+            print("\n-- Using {} CPUs to parallelize the split search\n".format(cpus))
+
+        for i in range(columns):
+            values = unique_vals(self.dataset, self.indices, i)
+
+            if parallelize:
+                # Parallelize best split search
+                splits = []
+                for value in values:
+                    question = Question(self.fields, i, value)
+                    splits.append((question, self.dataset, self.indices, uncertainty))
+
+                chunk = max(int(len(splits)/(cpus*4)), 1)
+                with mp.Pool(cpus) as p:
+                    for split in p.imap_unordered(splitter, splits, chunksize=chunk):
+                        if split is not None:
+                            gain, question, branches = split
+                            if gain > best_gain:
+                                best_gain, best_question, best_split = gain, question, branches
+            else:
+                for value in values:
+                    question = Question(self.fields, i, value)
+
+                    matching, non_matching = partition(self.dataset, self.indices, question)
+
+                    if not matching or not non_matching:
+                        continue
+
+                    gain = info_gain(self.dataset, matching, non_matching, uncertainty)
+
+                    if gain > best_gain:
+                        best_gain, best_question = gain, question
+                        best_split = (matching, non_matching)
+
+        return best_gain, best_question, best_split
+
     def classify(self, entry):
         if self.is_leaf:
             return self
@@ -145,6 +147,20 @@ class Node(object):
         else:
             return self.right_branch.classify(entry)
 
+    def predict(self, entry):
+        successes = []
+        predict = self.classify(entry).predictions.copy()
+        total = float(sum(predict.values()))
+        for key, value in predict.items():
+            predict[key] = float(predict[key]) / total
+
+        for label in entry.label:
+            if label in predict:
+                success = predict[label]
+                successes.append(success)
+
+        return sum(successes), predict
+
     def print(self, spacing=''):
         if self.is_leaf:
             s = spacing + "Predict: "
@@ -160,7 +176,7 @@ class Node(object):
         s += spacing + "├─ True:\n"
         s += self.left_branch.print(spacing + "│ ") + '\n'
         s += spacing + "└─ False:\n"
-        s += self.right_branch.print(spacing + "│ ")
+        s += self.right_branch.print(spacing + "  ")
 
         return s
 
@@ -169,14 +185,20 @@ class Node(object):
 
 
 class Tree(object):
-    def __init__(self, fields, dataset, bootstrap):
+    def __init__(self, fields, dataset, bootstrap, out=True):
         self.fields = fields
         self.dataset = dataset
-        self.bootstrap = bootstrap
-        self.root = Node(self.fields, self.dataset, self.bootstrap)
+        self.indices = bootstrap
+        # Out of bag
+        self.oob = [i for i in range(len(dataset)) if i not in bootstrap]
+
+        self.root = Node(self.fields, self.dataset, self.indices, out=out)
 
     def classify(self, entry):
         return self.root.classify(entry)
 
+    def predict(self, entry):
+        return self.root.predict(entry)
+
     def __str__(self):
         return str(self.root)
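
Note: dividing by len(indices) instead of len(dataset) in gini() matters because a bootstrapped node only sees the (possibly repeated) indices it was handed, so the class shares must be taken over that sample. A standalone sketch of the same formula, written over a plain list of labels rather than (dataset, indices):

    def gini(labels):
        # Gini impurity: 1 minus the sum of squared class shares.
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        impurity = 1.0
        for label in counts:
            prob = counts[label] / float(len(labels))
            impurity -= prob ** 2
        return impurity

    print(gini(['G', 'G', 'K', 'M']))  # 0.625
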
diff --git a/tree_tester.py b/tree_tester.py
index 076266e..52175db 100644
--- a/tree_tester.py
+++ b/tree_tester.py
@@ -1,7 +1,10 @@
 import os
 from timeit import default_timer as timer
 from star_reader import read_stars
-from tree import Tree
+from tree_bootstrapped import Tree
+
+
+OUTPUT_FOLDER = "output/tree"
 
 
 def log(s, open_file):
@@ -10,13 +13,14 @@ def log(s, open_file):
 
 
 if __name__ == '__main__':
-    if not os.path.exists("output"):
-        os.mkdir("output")
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.mkdir(OUTPUT_FOLDER)
 
-    if not os.path.exists("output/tree_testing.txt"):
-        output = open("output/tree_testing.txt", 'w', encoding="utf-8")
+    if not os.path.exists(OUTPUT_FOLDER + "/testing.txt"):
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'w',
+                      encoding="utf-8")
     else:
-        output = open("output/tree_testing.txt", 'a', encoding="utf-8")
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8")
 
     dataset, fields = read_stars()
 
@@ -25,36 +29,34 @@ if __name__ == '__main__':
     log("Training Tree...", output)
     t_start = timer()
 
-    split = int(len(dataset) * 0.65)
+    cut = 0.02
+
+    split = int(len(dataset) * cut)
     training, testing = dataset[:split], dataset[split + 1:]
 
+    log("Dataset split: Training with {}% of the set".format(cut*100), output)
     log("Training set: {} entries.".format(len(training)), output)
     log("Testing set: {} entries.".format(len(testing)), output)
 
-    tree = Tree(fields, training)
+    tree = Tree(fields, training, [i for i in range(len(training))])
 
     t_end = timer()
-    timestamp = "Training complete.\nElapsed time: {:.3f}\n"
-    log(timestamp.format(t_end - t_start), output)
+    log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
 
     log(tree, output)
 
     log("\n-- TEST --\n", output)
 
-    failures = 0
+    total_success = 0
 
     for entry in testing:
-        label = entry.label
-        predict = tree.predict(entry)
-        if predict not in label:
-            print("Actual: {}\tPredicted: {}".format(label, predict))
-            failures += 1
+        success, predict = tree.predict(entry)
+        print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
+        total_success += success
 
     tested = len(testing)
-    success = tested - failures
-    s_rate = float(success)*100/float(tested)
+    s_rate = float(total_success)*100/float(tested)
 
-    log("\nSuccessfully predicted {} out of {} entries."
-        .format(success, tested), output)
+    log("\nTested {} entries.".format(tested), output)
 
     log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
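
Note: putting the pieces together, a minimal end-to-end run that mirrors the two testers (the 0.4 cutoff and forest size of 10 are illustrative; read_stars() and the entry format come from star_reader.py):

    import random
    from star_reader import read_stars
    from forest import Forest

    dataset, fields = read_stars()
    random.shuffle(dataset)

    split = int(len(dataset) * 0.4)
    training, testing = dataset[:split], dataset[split:]

    # Train a bagged forest quietly, then report test accuracy and OOB error.
    forest = Forest(fields, training, 10, out=False)
    hits = sum(1 for entry in testing if forest.predict(entry) in entry.label)
    print("Accuracy: {:.2f}%".format(100.0 * hits / len(testing)))
    print("Out-of-bag error: {:.2f}%".format(100.0 * forest.error_oob()))
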