📝 Added the Forest class
📝 Changed some format and definitions 📝 Added Out-of-Bag error calculation
This commit is contained in:
parent
b01e30a34d
commit
6f4987e179
8 changed files with 305 additions and 111 deletions
59
forest.py
Normal file
59
forest.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
import random
|
||||
import operator
|
||||
from tree_bootstrapped import Tree
|
||||
|
||||
|
||||
class Forest(object):
    """Ensemble of decision trees, each grown on a bootstrap sample.

    Every tree receives the full ``dataset`` plus a list of row indices
    drawn with replacement. A prediction is made by summing the leaf label
    counts reported by the trees and returning the label with the highest
    total.
    """

    def __init__(self, fields, dataset, size, tree_out=False, out=True):
        """Grow ``size`` trees on independent bootstrap samples.

        Args:
            fields: Field descriptors, passed through to every ``Tree``.
            dataset: Sized, indexable collection of entries.
            size: Number of trees to grow.
            tree_out: Let the trees print their own progress; only
                honoured when ``out`` is also true.
            out: Print one line after each tree has been planted.
        """
        self.fields = fields
        self.dataset = dataset
        self.size = size
        self.trees = []

        n = len(dataset)
        verbose_trees = tree_out and out

        for i in range(size):
            # Bootstrap: n row indices drawn with replacement.
            bootstrap = [random.randrange(n) for _ in range(n)]
            self.trees.append(
                Tree(self.fields, self.dataset, bootstrap, verbose_trees))

            if out:
                print("\nPlanted tree {}".format(i))

    @staticmethod
    def _tally(entry, trees):
        """Sum the leaf label counts that ``trees`` report for ``entry``."""
        votes = {}
        for tree in trees:
            leaf_counts = tree.classify(entry).predictions
            for label, count in leaf_counts.items():
                votes[label] = votes.get(label, 0) + count
        return votes

    @staticmethod
    def _winner(votes):
        """Return the label with the largest vote total."""
        return max(votes.items(), key=operator.itemgetter(1))[0]

    def error_oob(self):
        """Return the out-of-bag error of the forest as a fraction.

        Every entry that is out-of-bag for at least one tree is classified
        using only the trees whose bootstrap sample does not contain it.
        The result is the share of those entries whose winning label is
        not found in ``entry.label``.

        Raises:
            ZeroDivisionError: If no entry is out-of-bag for any tree
                (for example, when the forest has no trees).
        """
        # Sets make the in-bag membership test O(1) instead of a list scan.
        bags = [(tree, set(tree.indices)) for tree in self.trees]

        oob = set()
        for tree in self.trees:
            oob.update(tree.oob)

        successes = 0
        for i in oob:
            entry = self.dataset[i]
            voters = [tree for tree, bag in bags if i not in bag]

            # A fresh tally per entry: votes cast for one entry must never
            # influence the majority computed for another.
            votes = self._tally(entry, voters)
            if votes and self._winner(votes) in entry.label:
                successes += 1

        return 1 - (float(successes) / float(len(oob)))

    def predict(self, entry):
        """Return the label that collects the most votes across all trees.

        Raises:
            ValueError: If the forest has no trees, so there are no votes.
        """
        return self._winner(self._tally(entry, self.trees))
|
104
forest_tester.py
Normal file
104
forest_tester.py
Normal file
|
@ -0,0 +1,104 @@
|
|||
import os
|
||||
import random
|
||||
from timeit import default_timer as timer
|
||||
from star_reader import read_stars
|
||||
# from tree_bootstrapped import Tree
|
||||
from forest import Forest
|
||||
|
||||
|
||||
OUTPUT_FOLDER = "output/forest"
|
||||
|
||||
|
||||
def log(s, open_file):
    """Echo ``s`` to stdout and append it to ``open_file``.

    The value is converted with ``str`` and written followed by a single
    newline; ``open_file`` only needs a ``write`` method.
    """
    print(s)
    line = "%s\n" % (s,)
    open_file.write(line)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Train a forest on part of the star dataset, test it on the rest and
    # append the results to OUTPUT_FOLDER/testing.txt.

    # makedirs creates missing parent folders too ("output/" may not exist).
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Append mode creates the file when it is missing, and the context
    # manager closes it even if training or testing raises.
    with open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8") as output:
        dataset, fields = read_stars()

        random.shuffle(dataset)

        cutoff = 0.4
        forest_size = 10

        split = int(len(dataset) * cutoff)
        # Slice at the same index on both sides so that no entry is dropped.
        training, testing = dataset[:split], dataset[split:]

        log("\n----------\n", output)

        # Disabled single-tree baseline. Re-enabling it also requires the
        # commented-out `from tree_bootstrapped import Tree` import.
        #
        # log("\n-- TREE TRAINING --\n", output)
        #
        # log("Training Tree...", output)
        # t_start = timer()
        #
        # log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
        # log("Training set: {} entries.".format(len(training)), output)
        # log("Testing set: {} entries.".format(len(testing)), output)
        #
        # tree = Tree(fields, training, [i for i in range(len(training))])
        #
        # t_end = timer()
        # log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
        #
        # log("\n-- TREE TEST --\n", output)
        #
        # total_success = 0
        #
        # for entry in testing:
        #     success, predict = tree.predict(entry)
        #     # print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
        #     total_success += success
        #
        # tested = len(testing)
        # s_rate = float(total_success)*100/float(tested)
        #
        # log("\nTested {} entries.".format(tested), output)
        #
        # log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)

        log("\n-- FOREST TRAINING --\n", output)

        log("Training Forest...", output)
        t_start = timer()

        log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
        log("Training set: {} entries.".format(len(training)), output)
        log("Testing set: {} entries.".format(len(testing)), output)

        forest = Forest(fields, training, forest_size)

        t_end = timer()
        log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)

        log("\n-- FOREST TEST --\n", output)

        total_success = 0

        for entry in testing:
            # A prediction counts as correct when it appears in the label.
            if forest.predict(entry) in entry.label:
                total_success += 1

        tested = len(testing)
        s_rate = float(total_success)*100/float(tested)

        log("\nTested {} entries.".format(tested), output)

        log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)

        error = forest.error_oob()

        log("\nAverage error Out-of-Bag: {:.2f}%".format(error*100), output)
|
|
@ -23,4 +23,4 @@ class Question(object):
|
|||
condition = self.numeric and ">" or "="
|
||||
field = self.fields[self.pos]
|
||||
|
||||
return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value) # noqa
|
||||
return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value)
|
||||
|
|
5
star.py
5
star.py
|
@ -12,5 +12,6 @@ class Star(object):
|
|||
classification = ' or '.join(self.label)
|
||||
else:
|
||||
classification = self.label
|
||||
return ("Star {} {} of spectral type {}"
|
||||
.format(self.name, self.data, classification))
|
||||
|
||||
s = "Star {} {} of spectral type {}"
|
||||
return s.format(self.name, self.data, classification)
|
||||
|
|
|
@ -32,11 +32,14 @@ def make_star(header, row, fields=None):
|
|||
return None
|
||||
|
||||
type_list = value.split('/')
|
||||
types = []
|
||||
for star_type in type_list:
|
||||
for sp_type in STAR_CLASSES:
|
||||
if star_type and sp_type in star_type.upper():
|
||||
types.append(sp_type)
|
||||
|
||||
# now, for each star_type in typelist
|
||||
# and each sp_type in STAR_CLASSES
|
||||
# add sp_type to the list if
|
||||
# sp_type in star_type.upper()
|
||||
|
||||
# Basically, look if the star class letters appear in each possible star type, and add them, ignoring empty star types
|
||||
types = [sp_type if star_type and sp_type in star_type.upper() else '' for star_type in type_list for sp_type in STAR_CLASSES]
|
||||
value = ''.join(set(types))
|
||||
if value == '':
|
||||
return None
|
||||
|
@ -69,7 +72,7 @@ def read_stars(fields=KEPT_DATA):
|
|||
|
||||
t_end = timer()
|
||||
|
||||
print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start)) # noqa
|
||||
print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start))
|
||||
|
||||
return star_list, fields or header
|
||||
|
||||
|
|
31
tree.py
31
tree.py
|
@ -1,4 +1,3 @@
|
|||
import random
|
||||
import multiprocessing as mp
|
||||
from question import Question
|
||||
|
||||
|
@ -71,8 +70,7 @@ def find_best_split(fields, dataset, uncertainty=None):
|
|||
# Parallelize best split search
|
||||
cpus = mp.cpu_count()
|
||||
if i == 0:
|
||||
print("-- Using {} CPUs to parallelize the split search."
|
||||
.format(cpus))
|
||||
print("-- Using {} CPUs to parallelize the split search.".format(cpus))
|
||||
splits = []
|
||||
for value in values:
|
||||
question = Question(fields, i, value)
|
||||
|
@ -80,13 +78,11 @@ def find_best_split(fields, dataset, uncertainty=None):
|
|||
|
||||
chunk = max(int(len(splits)/(cpus*4)), 1)
|
||||
with mp.Pool(cpus) as p:
|
||||
for split in p.imap_unordered(splitter, splits,
|
||||
chunksize=chunk):
|
||||
for split in p.imap_unordered(splitter, splits, chunksize=chunk):
|
||||
if split is not None:
|
||||
gain, question, branches = split
|
||||
if gain > best_gain:
|
||||
best_gain, best_question, best_split = \
|
||||
gain, question, branches
|
||||
best_gain, best_question, best_split = gain, question, branches
|
||||
else:
|
||||
for value in values:
|
||||
question = Question(fields, i, value)
|
||||
|
@ -126,7 +122,7 @@ class Node(object):
|
|||
|
||||
print("Found a level {} split:".format(level))
|
||||
print(question)
|
||||
print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right))) # noqa
|
||||
print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
|
||||
|
||||
self.left_branch = Node(self.fields, left, level + 1)
|
||||
self.right_branch = Node(self.fields, right, level + 1)
|
||||
|
@ -144,11 +140,18 @@ class Node(object):
|
|||
return self.right_branch.classify(entry)
|
||||
|
||||
def predict(self, entry):
|
||||
predict = self.classify(entry).predictions
|
||||
choices = []
|
||||
for label, count in predict.items():
|
||||
choices.extend([label]*count)
|
||||
return random.choice(choices)
|
||||
successes = []
|
||||
predict = self.classify(entry).predictions.copy()
|
||||
|
||||
for label in entry.label:
|
||||
total = float(sum(predict.values()))
|
||||
for key, value in predict.items():
|
||||
predict[key] = float(predict[key]) / total
|
||||
if label in predict:
|
||||
success = float(predict[label]) / total
|
||||
successes.append(success)
|
||||
|
||||
return sum(successes), predict
|
||||
|
||||
def print(self, spacing=''):
|
||||
if self.is_leaf:
|
||||
|
@ -160,7 +163,7 @@ class Node(object):
|
|||
probs[label] = "{:.2f}%".format(prob)
|
||||
return s + str(probs)
|
||||
|
||||
s = spacing + str(self.question) + '\n'
|
||||
s = spacing + "(Gini: {:.2f}) ".format(self.gini) + str(self.question) + '\n'
|
||||
s += spacing + "├─ True:\n"
|
||||
s += self.left_branch.print(spacing + "│ ") + '\n'
|
||||
s += spacing + "└─ False:\n"
|
||||
|
|
|
@ -34,7 +34,7 @@ def gini(dataset, indices):
|
|||
impurity = 1
|
||||
|
||||
for label in counts:
|
||||
prob = counts[label] / float(len(dataset))
|
||||
prob = counts[label] / float(len(indices))
|
||||
impurity -= prob**2
|
||||
|
||||
return impurity
|
||||
|
@ -55,47 +55,82 @@ def splitter(info):
|
|||
return (gain, question, (matching, non_matching))
|
||||
|
||||
|
||||
def find_best_split(fields, dataset, indices, uncertainty=None):
|
||||
print("Splitting {} entries.".format(len(dataset)))
|
||||
class Node(object):
|
||||
def __init__(self, fields, dataset, bootstrap, level=0, out=True):
|
||||
self.fields = fields
|
||||
self.dataset = dataset
|
||||
self.indices = bootstrap
|
||||
self.out = out
|
||||
self.gini = gini(dataset, self.indices)
|
||||
self.build(level, out)
|
||||
|
||||
def build(self, level, out=True):
|
||||
best_split = self.split(out)
|
||||
gain, question, branches = best_split
|
||||
|
||||
if not branches:
|
||||
# Means we got 0 gain
|
||||
if out:
|
||||
print("Found a leaf at level {}".format(level))
|
||||
self.predictions = count_labels(self.dataset, self.indices)
|
||||
self.is_leaf = True
|
||||
return
|
||||
|
||||
left, right = branches
|
||||
|
||||
if out:
|
||||
print("Found a level {} split:".format(level))
|
||||
print(question)
|
||||
print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
|
||||
|
||||
self.left_branch = Node(self.fields, self.dataset, left, level + 1, out)
|
||||
self.right_branch = Node(self.fields, self.dataset, right, level + 1, out)
|
||||
self.question = question
|
||||
self.is_leaf = False
|
||||
return
|
||||
|
||||
def split(self, out=True):
|
||||
if out:
|
||||
print("Splitting {} entries.".format(len(self.indices)))
|
||||
best_gain, best_question, best_split = 0, None, None
|
||||
|
||||
uncertainty = uncertainty or gini(dataset)
|
||||
uncertainty = self.gini or gini(self.dataset, self.indices)
|
||||
|
||||
columns = len(fields)
|
||||
cpus = mp.cpu_count()
|
||||
columns = len(self.fields)
|
||||
|
||||
parallelize = len(self.indices) > 1000
|
||||
|
||||
if parallelize and out:
|
||||
print("\n-- Using {} CPUs to parallelize the split search\n".format(cpus))
|
||||
|
||||
for i in range(columns):
|
||||
values = unique_vals(dataset, indices, i)
|
||||
values = unique_vals(self.dataset, self.indices, i)
|
||||
|
||||
if len(indices) > 400:
|
||||
if parallelize:
|
||||
# Parallelize best split search
|
||||
cpus = mp.cpu_count()
|
||||
if i == 0:
|
||||
print("-- Using {} CPUs to parallelize the split search."
|
||||
.format(cpus))
|
||||
splits = []
|
||||
for value in values:
|
||||
question = Question(fields, i, value)
|
||||
splits.append((question, dataset, indices, uncertainty))
|
||||
question = Question(self.fields, i, value)
|
||||
splits.append((question, self.dataset, self.indices, uncertainty))
|
||||
|
||||
chunk = max(int(len(splits)/(cpus*4)), 1)
|
||||
with mp.Pool(cpus) as p:
|
||||
for split in p.imap_unordered(splitter, splits,
|
||||
chunksize=chunk):
|
||||
for split in p.imap_unordered(splitter, splits, chunksize=chunk):
|
||||
if split is not None:
|
||||
gain, question, branches = split
|
||||
if gain > best_gain:
|
||||
best_gain, best_question, best_split = \
|
||||
gain, question, branches
|
||||
best_gain, best_question, best_split = gain, question, branches
|
||||
else:
|
||||
for value in values:
|
||||
question = Question(fields, i, value)
|
||||
question = Question(self.fields, i, value)
|
||||
|
||||
matching, non_matching = partition(dataset, indices, question)
|
||||
matching, non_matching = partition(self.dataset, self.indices, question)
|
||||
|
||||
if not matching or not non_matching:
|
||||
continue
|
||||
|
||||
gain = info_gain(dataset, matching, non_matching, uncertainty)
|
||||
gain = info_gain(self.dataset, matching, non_matching, uncertainty)
|
||||
|
||||
if gain > best_gain:
|
||||
best_gain, best_question = gain, question
|
||||
|
@ -103,39 +138,6 @@ def find_best_split(fields, dataset, indices, uncertainty=None):
|
|||
|
||||
return best_gain, best_question, best_split
|
||||
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, fields, dataset, bootstrap, level=0):
|
||||
self.fields = fields
|
||||
self.dataset = dataset
|
||||
self.bootstrap = bootstrap
|
||||
self.gini = gini(dataset, self.bootstrap)
|
||||
self.build(level)
|
||||
|
||||
def build(self, level):
|
||||
best_split = find_best_split(self.fields, self.dataset,
|
||||
self.bootstrap, self.gini)
|
||||
gain, question, branches = best_split
|
||||
|
||||
if not branches:
|
||||
# Means we got 0 gain
|
||||
print("Found a leaf at level {}".format(level))
|
||||
self.predictions = count_labels(self.dataset, self.bootstrap)
|
||||
self.is_leaf = True
|
||||
return
|
||||
|
||||
left, right = branches
|
||||
|
||||
print("Found a level {} split:".format(level))
|
||||
print(question)
|
||||
print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right))) # noqa
|
||||
|
||||
self.left_branch = Node(self.fields, self.dataset, left, level + 1)
|
||||
self.right_branch = Node(self.fields, self.dataset, right, level + 1)
|
||||
self.question = question
|
||||
self.is_leaf = False
|
||||
return
|
||||
|
||||
def classify(self, entry):
|
||||
if self.is_leaf:
|
||||
return self
|
||||
|
@ -145,6 +147,20 @@ class Node(object):
|
|||
else:
|
||||
return self.right_branch.classify(entry)
|
||||
|
||||
def predict(self, entry):
|
||||
successes = []
|
||||
predict = self.classify(entry).predictions.copy()
|
||||
total = float(sum(predict.values()))
|
||||
for key, value in predict.items():
|
||||
predict[key] = float(predict[key]) / total
|
||||
|
||||
for label in entry.label:
|
||||
if label in predict:
|
||||
success = predict[label]
|
||||
successes.append(success)
|
||||
|
||||
return sum(successes), predict
|
||||
|
||||
def print(self, spacing=''):
|
||||
if self.is_leaf:
|
||||
s = spacing + "Predict: "
|
||||
|
@ -160,7 +176,7 @@ class Node(object):
|
|||
s += spacing + "├─ True:\n"
|
||||
s += self.left_branch.print(spacing + "│ ") + '\n'
|
||||
s += spacing + "└─ False:\n"
|
||||
s += self.right_branch.print(spacing + "│ ")
|
||||
s += self.right_branch.print(spacing + " ")
|
||||
|
||||
return s
|
||||
|
||||
|
@ -169,14 +185,20 @@ class Node(object):
|
|||
|
||||
|
||||
class Tree(object):
|
||||
def __init__(self, fields, dataset, bootstrap):
|
||||
def __init__(self, fields, dataset, bootstrap, out=True):
|
||||
self.fields = fields
|
||||
self.dataset = dataset
|
||||
self.bootstrap = bootstrap
|
||||
self.root = Node(self.fields, self.dataset, self.bootstrap)
|
||||
self.indices = bootstrap
|
||||
# Out of bag
|
||||
self.oob = [i for i in range(len(dataset)) if i not in bootstrap]
|
||||
|
||||
self.root = Node(self.fields, self.dataset, self.indices, out=out)
|
||||
|
||||
def classify(self, entry):
|
||||
return self.root.classify(entry)
|
||||
|
||||
def predict(self, entry):
|
||||
return self.root.predict(entry)
|
||||
|
||||
def __str__(self):
|
||||
return str(self.root)
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
import os
|
||||
from timeit import default_timer as timer
|
||||
from star_reader import read_stars
|
||||
from tree import Tree
|
||||
from tree_bootstrapped import Tree
|
||||
|
||||
|
||||
OUTPUT_FOLDER = "output/tree"
|
||||
|
||||
|
||||
def log(s, open_file):
|
||||
|
@ -10,13 +13,14 @@ def log(s, open_file):
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if not os.path.exists("output"):
|
||||
os.mkdir("output")
|
||||
if not os.path.exists(OUTPUT_FOLDER):
|
||||
os.mkdir(OUTPUT_FOLDER)
|
||||
|
||||
if not os.path.exists("output/tree_testing.txt"):
|
||||
output = open("output/tree_testing.txt", 'w', encoding="utf-8")
|
||||
if not os.path.exists(OUTPUT_FOLDER + "/testing.txt"):
|
||||
output = open(OUTPUT_FOLDER + "/testing.txt", 'w',
|
||||
encoding="utf-8")
|
||||
else:
|
||||
output = open("output/tree_testing.txt", 'a', encoding="utf-8")
|
||||
output = open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8")
|
||||
|
||||
dataset, fields = read_stars()
|
||||
|
||||
|
@ -25,36 +29,34 @@ if __name__ == '__main__':
|
|||
log("Training Tree...", output)
|
||||
t_start = timer()
|
||||
|
||||
split = int(len(dataset) * 0.65)
|
||||
cut = 0.02
|
||||
|
||||
split = int(len(dataset) * cut)
|
||||
training, testing = dataset[:split], dataset[split + 1:]
|
||||
log("Dataset split: Training with {}% of the set".format(cut*100), output)
|
||||
log("Training set: {} entries.".format(len(training)), output)
|
||||
log("Testing set: {} entries.".format(len(testing)), output)
|
||||
|
||||
tree = Tree(fields, training)
|
||||
tree = Tree(fields, training, [i for i in range(len(training))])
|
||||
|
||||
t_end = timer()
|
||||
timestamp = "Training complete.\nElapsed time: {:.3f}\n"
|
||||
log(timestamp.format(t_end - t_start), output)
|
||||
log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
|
||||
|
||||
log(tree, output)
|
||||
|
||||
log("\n-- TEST --\n", output)
|
||||
|
||||
failures = 0
|
||||
total_success = 0
|
||||
|
||||
for entry in testing:
|
||||
label = entry.label
|
||||
predict = tree.predict(entry)
|
||||
if predict not in label:
|
||||
print("Actual: {}\tPredicted: {}".format(label, predict))
|
||||
failures += 1
|
||||
success, predict = tree.predict(entry)
|
||||
print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
|
||||
total_success += success
|
||||
|
||||
tested = len(testing)
|
||||
success = tested - failures
|
||||
s_rate = float(success)*100/float(tested)
|
||||
s_rate = float(total_success)*100/float(tested)
|
||||
|
||||
log("\nSuccessfully predicted {} out of {} entries."
|
||||
.format(success, tested), output)
|
||||
log("\nTested {} entries.".format(tested), output)
|
||||
|
||||
log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
|
||||
|
||||
|
|
Loading…
Reference in a new issue