random-forest/tree_bootstrapped.py
Vylion 6f4987e179 📝 Added the Forest class
📝 Changed some format and definitions
📝 Added Out-of-Bag error calculation
2019-04-26 13:07:16 +02:00

204 lines
6.3 KiB
Python

import multiprocessing as mp
from collections import Counter

from question import Question
def unique_vals(dataset, indices, column):
    """Return the distinct values found in one column of the selected entries.

    Args:
        dataset: Indexable collection whose items expose a ``data`` sequence.
        indices: Iterable of positions in ``dataset`` to inspect.
        column: Position inside each entry's ``data`` to read.

    Returns:
        A ``set`` holding every value seen at ``data[column]`` for the
        selected entries (empty when ``indices`` is empty).
    """
    # A set comprehension avoids materialising an intermediate list first.
    return {dataset[i].data[column] for i in indices}
def count_labels(dataset, indices):
    """Count how often each label occurs among the selected entries.

    Each entry's ``label`` attribute is iterated, so a single entry may
    contribute several labels. An index that appears more than once in
    ``indices`` is counted once per appearance.

    Args:
        dataset: Indexable collection whose items expose an iterable ``label``.
        indices: Iterable of positions in ``dataset`` to count.

    Returns:
        A plain ``dict`` mapping each label to its number of occurrences,
        with keys in first-seen order (empty when ``indices`` is empty).
    """
    tally = Counter(label for i in indices for label in dataset[i].label)
    # Hand back a plain dict so callers keep getting the type they always got.
    return dict(tally)
def partition(dataset, indices, question):
    """Divide the selected indices by whether their entry satisfies a question.

    Args:
        dataset: Indexable collection of entries.
        indices: Iterable of positions in ``dataset`` to distribute.
        question: Object whose ``match(entry)`` result is used as a truth value.

    Returns:
        A ``(matching, non_matching)`` pair of index lists, each preserving
        the order in which the indices were supplied.
    """
    hits, misses = [], []
    for idx in indices:
        # Pick the destination list first, then append exactly once.
        bucket = hits if question.match(dataset[idx]) else misses
        bucket.append(idx)
    return hits, misses
def gini(dataset, indices):
    """Compute the Gini impurity of the labels of the selected entries.

    Starting from 1, the squared relative frequency of every label (its
    count divided by ``len(indices)``) is subtracted in turn.

    Args:
        dataset: Indexable collection of labelled entries.
        indices: Sized collection of positions in ``dataset`` to evaluate.

    Returns:
        The impurity value; ``1`` when no labels are counted.
    """
    size = float(len(indices))
    impurity = 1
    for occurrences in count_labels(dataset, indices).values():
        share = occurrences / size
        impurity -= share ** 2
    return impurity
def info_gain(dataset, lid, rid, uncertainty):
    """Return the impurity reduction achieved by a two-way split.

    The gain is the parent's uncertainty minus the Gini impurity of each
    side, with each side weighted by its share of the indices.

    Args:
        dataset: Indexable collection of labelled entries.
        lid: Indices that fall on the left side of the split.
        rid: Indices that fall on the right side of the split.
        uncertainty: Impurity of the node before splitting.

    Returns:
        The information gain as a float.
    """
    left_size = len(lid)
    weight = float(left_size) / float(left_size + len(rid))
    left_term = weight * gini(dataset, lid)
    right_term = (1 - weight) * gini(dataset, rid)
    return uncertainty - left_term - right_term
def splitter(info):
    """Evaluate one candidate split; usable as a worker for a process pool.

    Args:
        info: A ``(question, dataset, indices, uncertainty)`` tuple.

    Returns:
        ``(gain, question, (matching, non_matching))`` when the question
        leaves entries on both sides, otherwise ``None``.
    """
    question, dataset, indices, uncertainty = info
    matching, non_matching = partition(dataset, indices, question)
    if matching and non_matching:
        gain = info_gain(dataset, matching, non_matching, uncertainty)
        return (gain, question, (matching, non_matching))
    # A question that leaves one side empty does not split anything.
    return None
class Node(object):
    """A node of a decision tree grown over a bootstrap sample.

    Creating a node immediately grows the complete subtree beneath it: the
    constructor searches for the best split and recursively creates the
    child nodes. After construction a node is either a leaf (``is_leaf`` is
    true and ``predictions`` holds the label counts) or an inner node
    (``question``, ``left_branch`` and ``right_branch`` are set).
    """

    def __init__(self, fields, dataset, bootstrap, level=0, out=True):
        """Store the sample and grow the subtree rooted at this node.

        Args:
            fields: Column descriptions; one split column per item.
            dataset: Indexable collection of entries.
            bootstrap: Indices into ``dataset`` that belong to this node.
            level: Depth of this node, used only in progress messages.
            out: When true, progress messages are printed.
        """
        self.fields = fields
        self.dataset = dataset
        self.indices = bootstrap
        self.out = out
        # Cached impurity of this node's own sample.
        self.gini = gini(dataset, self.indices)
        self.build(level, out)

    def build(self, level, out=True):
        """Turn this node into a leaf or split it and grow both children.

        Args:
            level: Depth of this node, used only in progress messages.
            out: When true, progress messages are printed.
        """
        _, question, branches = self.split(out)

        if not branches:
            # No question produced a strictly positive gain.
            if out:
                print("Found a leaf at level {}".format(level))
            self.predictions = count_labels(self.dataset, self.indices)
            self.is_leaf = True
            return

        left, right = branches
        if out:
            print("Found a level {} split:".format(level))
            print(question)
            print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))

        self.left_branch = Node(self.fields, self.dataset, left, level + 1, out)
        self.right_branch = Node(self.fields, self.dataset, right, level + 1, out)
        self.question = question
        self.is_leaf = False

    def split(self, out=True):
        """Search every column and value for the question with the best gain.

        When the node holds more than 1000 indices the candidate questions
        are evaluated in a process pool; the pool is created once per call
        and shared by all columns.

        Args:
            out: When true, progress messages are printed.

        Returns:
            ``(best_gain, best_question, best_split)`` where ``best_split``
            is a ``(matching, non_matching)`` pair of index lists. When no
            candidate has a gain above zero the result is ``(0, None, None)``.
        """
        if out:
            print("Splitting {} entries.".format(len(self.indices)))

        best_gain, best_question, best_split = 0, None, None
        # Falls back to recomputing when the cached value is falsy (zero).
        uncertainty = self.gini or gini(self.dataset, self.indices)

        cpus = mp.cpu_count()
        parallelize = len(self.indices) > 1000
        if parallelize and out:
            print("\n-- Using {} CPUs to parallelize the split search\n".format(cpus))

        # One pool for the whole search instead of one per column.
        pool = mp.Pool(cpus) if parallelize else None
        try:
            for column in range(len(self.fields)):
                values = unique_vals(self.dataset, self.indices, column)
                tasks = [
                    (Question(self.fields, column, value),
                     self.dataset, self.indices, uncertainty)
                    for value in values
                ]

                if pool is not None:
                    chunk = max(len(tasks) // (cpus * 4), 1)
                    results = pool.imap_unordered(splitter, tasks, chunksize=chunk)
                else:
                    results = map(splitter, tasks)

                for result in results:
                    if result is None:
                        # The question left one side empty.
                        continue
                    gain, question, branches = result
                    if gain > best_gain:
                        best_gain, best_question, best_split = gain, question, branches
        finally:
            if pool is not None:
                # Same teardown a ``with mp.Pool(...)`` block performs on exit.
                pool.terminate()

        return best_gain, best_question, best_split

    def classify(self, entry):
        """Walk down the tree and return the leaf node that ``entry`` reaches."""
        if self.is_leaf:
            return self
        if self.question.match(entry):
            return self.left_branch.classify(entry)
        return self.right_branch.classify(entry)

    def predict(self, entry):
        """Score an entry against the label distribution of its leaf.

        Args:
            entry: Object accepted by the questions of this tree and exposing
                an iterable ``label``.

        Returns:
            ``(score, distribution)`` where ``distribution`` maps each label
            of the reached leaf to its relative frequency and ``score`` is
            the sum of the frequencies of the entry's own labels.
        """
        counts = self.classify(entry).predictions
        total = float(sum(counts.values()))
        # A fresh dict is built so the leaf's stored counts stay untouched.
        distribution = {
            label: float(count) / total for label, count in counts.items()
        }
        score = sum(
            distribution[label] for label in entry.label if label in distribution
        )
        return score, distribution

    def print(self, spacing=''):
        """Render the subtree rooted at this node as indented text.

        Args:
            spacing: Prefix put in front of every line of this subtree.

        Returns:
            The rendered text, without a trailing newline.
        """
        if self.is_leaf:
            total = float(sum(self.predictions.values()))
            probs = {
                label: "{:.2f}%".format(count * 100 / total)
                for label, count in self.predictions.items()
            }
            return spacing + "Predict: " + str(probs)

        s = spacing + ("(Gini: {:.2f}) {}\n"
                       .format(self.gini, str(self.question)))
        s += spacing + "├─ True:\n"
        # The True subtree gets a vertical rule so it is indented by the same
        # width as the False subtree and nesting stays readable.
        s += self.left_branch.print(spacing + "│") + '\n'
        s += spacing + "└─ False:\n"
        s += self.right_branch.print(spacing + " ")
        return s

    def __str__(self):
        """Return the text rendering produced by :meth:`print`."""
        return self.print()
class Tree(object):
    """A decision tree trained on a bootstrap sample of a dataset.

    Besides the root node, the tree remembers which dataset positions were
    left out of the bootstrap sample (the out-of-bag entries).
    """

    def __init__(self, fields, dataset, bootstrap, out=True):
        """Record the sample, derive the out-of-bag indices and grow the tree.

        Args:
            fields: Column descriptions handed on to the nodes.
            dataset: Sized, indexable collection of entries.
            bootstrap: Indices into ``dataset`` used for training; repeated
                indices are allowed.
            out: When true, progress messages are printed while growing.
        """
        self.fields = fields
        self.dataset = dataset
        self.indices = bootstrap
        # Out of bag: every dataset position absent from the bootstrap
        # sample. Membership is tested against a set built once, so the scan
        # does not re-walk the whole sample for every position.
        in_bag = set(bootstrap)
        self.oob = [i for i in range(len(dataset)) if i not in in_bag]
        self.root = Node(self.fields, self.dataset, self.indices, out=out)

    def classify(self, entry):
        """Return the leaf node that ``entry`` reaches, as given by the root."""
        return self.root.classify(entry)

    def predict(self, entry):
        """Return the root node's prediction result for ``entry``."""
        return self.root.predict(entry)

    def __str__(self):
        """Return the text rendering of the root node."""
        return str(self.root)