diff --git a/forest.py b/forest.py
new file mode 100644
index 0000000..5cab1b0
--- /dev/null
+++ b/forest.py
@@ -0,0 +1,60 @@
+import random
+import operator
+from tree_bootstrapped import Tree
+
+
+class Forest(object):
+    def __init__(self, fields, dataset, size, tree_out=False, out=True):
+        self.fields = fields
+        self.dataset = dataset
+        self.size = size
+
+        self.trees = []
+        for i in range(size):
+            n = len(dataset)
+            bootstrap = [random.randrange(n) for j in range(n)]
+            tree = Tree(self.fields, self.dataset, bootstrap, (tree_out and out))
+            self.trees.append(tree)
+
+            if out:
+                print("\nPlanted tree {}".format(i))
+
+    def error_oob(self):
+        oob = []
+        for tree in self.trees:
+            oob.extend(tree.oob)
+
+        oob = set(oob)
+
+        successes = 0
+
+        for i in oob:
+            entry = self.dataset[i]
+            # Reset the vote tally for each out-of-bag entry.
+            votes = {}
+
+            for tree in self.trees:
+                if i not in tree.indices:
+                    predict = tree.classify(entry).predictions
+                    for key, value in predict.items():
+                        if key not in votes:
+                            votes[key] = predict[key]
+                        else:
+                            votes[key] += predict[key]
+            majority = max(votes.items(), key=operator.itemgetter(1))[0]
+            if majority in entry.label:
+                successes += 1
+
+        return 1-(float(successes)/float(len(oob)))
+
+    def predict(self, entry):
+        votes = {}
+        for tree in self.trees:
+            predict = tree.classify(entry).predictions
+            for key, value in predict.items():
+                if key not in votes:
+                    votes[key] = predict[key]
+                else:
+                    votes[key] += predict[key]
+        majority = max(votes.items(), key=operator.itemgetter(1))[0]
+        return majority
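
Note: a minimal sketch of the bagging scheme Forest relies on. Each tree trains on indices drawn with replacement, and the indices that are never drawn form that tree's out-of-bag set (the names below are illustrative, not part of the patch):

    import random

    def bootstrap_sample(n):
        # Draw n indices with replacement; the complement is the out-of-bag set.
        indices = [random.randrange(n) for _ in range(n)]
        oob = [i for i in range(n) if i not in indices]
        return indices, oob

    # Roughly a third of the entries are out-of-bag for any single tree,
    # which is what lets error_oob() score each entry only with trees
    # that never saw it during training.
    indices, oob = bootstrap_sample(10)
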
diff --git a/forest_tester.py b/forest_tester.py
new file mode 100644
index 0000000..df2e530
--- /dev/null
+++ b/forest_tester.py
@@ -0,0 +1,104 @@
+import os
+import random
+from timeit import default_timer as timer
+from star_reader import read_stars
+# from tree_bootstrapped import Tree
+from forest import Forest
+
+
+OUTPUT_FOLDER = "output/forest"
+
+
+def log(s, open_file):
+    print(s)
+    open_file.write(str(s) + '\n')
+
+
+if __name__ == '__main__':
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.mkdir(OUTPUT_FOLDER)
+
+    if not os.path.exists(OUTPUT_FOLDER + "/testing.txt"):
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'w',
+                      encoding="utf-8")
+    else:
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8")
+
+    dataset, fields = read_stars()
+
+    random.shuffle(dataset)
+
+    cutoff = 0.4
+    forest_size = 10
+
+    split = int(len(dataset) * cutoff)
+    training, testing = dataset[:split], dataset[split:]
+
+    log("\n----------\n", output)
+
+    """
+    log("\n-- TREE TRAINING --\n", output)
+
+    log("Training Tree...", output)
+    t_start = timer()
+
+    log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
+    log("Training set: {} entries.".format(len(training)), output)
+    log("Testing set: {} entries.".format(len(testing)), output)
+
+    tree = Tree(fields, training, [i for i in range(len(training))])
+
+    t_end = timer()
+    log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
+
+    log("\n-- TREE TEST --\n", output)
+
+    total_success = 0
+
+    for entry in testing:
+        success, predict = tree.predict(entry)
+        # print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
+        total_success += success
+
+    tested = len(testing)
+    s_rate = float(total_success)*100/float(tested)
+
+    log("\nTested {} entries.".format(tested), output)
+
+    log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
+    """
+
+    log("\n-- FOREST TRAINING --\n", output)
+
+    log("Training Forest...", output)
+    t_start = timer()
+
+    log("Dataset split: Training with {}% of the set".format(cutoff*100), output)
+    log("Training set: {} entries.".format(len(training)), output)
+    log("Testing set: {} entries.".format(len(testing)), output)
+
+    forest = Forest(fields, training, forest_size)
+
+    log("\n-- FOREST TEST --\n", output)
+
+    total_success = 0
+
+    for entry in testing:
+        label = entry.label
+        majority = forest.predict(entry)
+        if majority in label:
+            # print("Actual: {}\tPredicted: {}".format(label, predict))
+            total_success += 1
+
+    tested = len(testing)
+    s_rate = float(total_success)*100/float(tested)
+
+    log("\nTested {} entries.".format(tested), output)
+
+    log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
+
+    error = forest.error_oob()
+
+    log("\nAverage error Out-of-Bag: {:.2f}%".format(error*100), output)
+
+    output.close()
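
Note: the forest test loop counts a prediction as a hit when the majority vote appears in the entry's (possibly multi-valued) label. A toy version of that accuracy computation, with made-up labels and votes:

    # Hypothetical true label sets and forest majority votes.
    labels = [{'G'}, {'K', 'M'}, {'A'}, {'F', 'G'}]
    majorities = ['G', 'K', 'B', 'G']

    hits = sum(1 for label, vote in zip(labels, majorities) if vote in label)
    accuracy = 100.0 * hits / len(labels)  # 75.0 for this toy data
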
diff --git a/question.py b/question.py
index de40f25..afb30ab 100644
--- a/question.py
+++ b/question.py
@@ -23,4 +23,4 @@ class Question(object):
         condition = self.numeric and ">" or "="
         field = self.fields[self.pos]
 
-        return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value)  # noqa
+        return "Is {f} {cond} {val}?".format(f=field, cond=condition, val=self.value)
diff --git a/star.py b/star.py
index cd132d5..ad07ede 100644
--- a/star.py
+++ b/star.py
@@ -12,5 +12,6 @@ class Star(object):
             classification = ' or '.join(self.label)
         else:
             classification = self.label
-        return ("Star {} {} of spectral type {}"
-                .format(self.name, self.data, classification))
+
+        s = "Star {} {} of spectral type {}"
+        return s.format(self.name, self.data, classification)
diff --git a/star_reader.py b/star_reader.py
index 2ffc947..1568504 100644
--- a/star_reader.py
+++ b/star_reader.py
@@ -32,11 +32,14 @@ def make_star(header, row, fields=None):
             return None
 
         type_list = value.split('/')
-        types = []
-        for star_type in type_list:
-            for sp_type in STAR_CLASSES:
-                if star_type and sp_type in star_type.upper():
-                    types.append(sp_type)
+
+        # Collect every spectral class letter that appears in any of the
+        # listed types, skipping empty entries (e.g. "G/K" yields G and K).
+        types = [sp_type
+                 for star_type in type_list
+                 for sp_type in STAR_CLASSES
+                 if star_type and sp_type in star_type.upper()]
+
         value = ''.join(set(types))
         if value == '':
             return None
@@ -69,7 +72,7 @@ def read_stars(fields=KEPT_DATA):
 
     t_end = timer()
 
-    print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start))  # noqa
+    print("Parsed {} stars.\nElapsed time: {:.3f}\n".format(len(star_list), t_end-t_start))
 
     return star_list, fields or header
 
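
Note: assuming STAR_CLASSES is a collection of single-letter spectral classes (something like 'O', 'B', 'A', 'F', 'G', 'K', 'M'; it is not shown in this patch), the rewritten comprehension behaves like this on a typical catalogue value:

    STAR_CLASSES = ['O', 'B', 'A', 'F', 'G', 'K', 'M']  # assumed, for illustration

    value = "G8/K0 III"
    type_list = value.split('/')  # ['G8', 'K0 III']
    types = [sp_type
             for star_type in type_list
             for sp_type in STAR_CLASSES
             if star_type and sp_type in star_type.upper()]
    print(''.join(set(types)))  # 'GK' (set order is not guaranteed)
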
diff --git a/tree.py b/tree.py
index 65eed4b..2c5cf59 100644
--- a/tree.py
+++ b/tree.py
@@ -1,4 +1,3 @@
-import random
 import multiprocessing as mp
 
 from question import Question
@@ -71,8 +70,7 @@ def find_best_split(fields, dataset, uncertainty=None):
             # Parallelize best split search
             cpus = mp.cpu_count()
             if i == 0:
-                print("-- Using {} CPUs to parallelize the split search."
-                      .format(cpus))
+                print("-- Using {} CPUs to parallelize the split search.".format(cpus))
             splits = []
             for value in values:
                 question = Question(fields, i, value)
@@ -80,13 +78,11 @@ def find_best_split(fields, dataset, uncertainty=None):
 
             chunk = max(int(len(splits)/(cpus*4)), 1)
             with mp.Pool(cpus) as p:
-                for split in p.imap_unordered(splitter, splits,
-                                              chunksize=chunk):
+                for split in p.imap_unordered(splitter, splits, chunksize=chunk):
                     if split is not None:
                         gain, question, branches = split
                         if gain > best_gain:
-                            best_gain, best_question, best_split = \
-                                gain, question, branches
+                            best_gain, best_question, best_split = gain, question, branches
         else:
             for value in values:
                 question = Question(fields, i, value)
@@ -126,7 +122,7 @@ class Node(object):
 
         print("Found a level {} split:".format(level))
         print(question)
-        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))  # noqa
+        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
 
         self.left_branch = Node(self.fields, left, level + 1)
         self.right_branch = Node(self.fields, right, level + 1)
@@ -144,11 +140,18 @@ class Node(object):
             return self.right_branch.classify(entry)
 
     def predict(self, entry):
-        predict = self.classify(entry).predictions
-        choices = []
-        for label, count in predict.items():
-            choices.extend([label]*count)
-        return random.choice(choices)
+        successes = []
+        predict = self.classify(entry).predictions.copy()
+        total = float(sum(predict.values()))
+        for key, value in predict.items():
+            predict[key] = float(predict[key]) / total
+
+        for label in entry.label:
+            if label in predict:
+                success = predict[label]
+                successes.append(success)
+
+        return sum(successes), predict
 
     def print(self, spacing=''):
         if self.is_leaf:
@@ -160,7 +163,7 @@ class Node(object):
                 probs[label] = "{:.2f}%".format(prob)
             return s + str(probs)
 
-        s = spacing + str(self.question) + '\n'
+        s = spacing + "(Gini: {:.2f}) ".format(self.gini) + str(self.question) + '\n'
         s += spacing + "├─ True:\n"
         s += self.left_branch.print(spacing + "│ ") + '\n'
         s += spacing + "└─ False:\n"
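
Note: the new predict() returns the probability mass the reached leaf assigns to the entry's true label(s) instead of sampling one label at random, so repeated evaluations give the same score. Worked through on a hypothetical leaf:

    predictions = {'G': 3, 'K': 1}  # label counts at the leaf the entry reaches
    total = float(sum(predictions.values()))
    probs = {key: count / total for key, count in predictions.items()}  # {'G': 0.75, 'K': 0.25}

    entry_labels = ['G']  # the entry's true label(s)
    success = sum(probs[label] for label in entry_labels if label in probs)  # 0.75
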
diff --git a/tree_bootstrapped.py b/tree_bootstrapped.py
index 27ac8fb..121c482 100644
--- a/tree_bootstrapped.py
+++ b/tree_bootstrapped.py
@@ -34,7 +34,7 @@ def gini(dataset, indices):
 
     impurity = 1
     for label in counts:
-        prob = counts[label] / float(len(dataset))
+        prob = counts[label] / float(len(indices))
         impurity -= prob**2
 
     return impurity
@@ -55,87 +55,89 @@ def splitter(info):
     return (gain, question, (matching, non_matching))
 
 
-def find_best_split(fields, dataset, indices, uncertainty=None):
-    print("Splitting {} entries.".format(len(dataset)))
-    best_gain, best_question, best_split = 0, None, None
-
-    uncertainty = uncertainty or gini(dataset)
-
-    columns = len(fields)
-
-    for i in range(columns):
-        values = unique_vals(dataset, indices, i)
-
-        if len(indices) > 400:
-            # Parallelize best split search
-            cpus = mp.cpu_count()
-            if i == 0:
-                print("-- Using {} CPUs to parallelize the split search."
-                      .format(cpus))
-            splits = []
-            for value in values:
-                question = Question(fields, i, value)
-                splits.append((question, dataset, indices, uncertainty))
-
-            chunk = max(int(len(splits)/(cpus*4)), 1)
-            with mp.Pool(cpus) as p:
-                for split in p.imap_unordered(splitter, splits,
-                                              chunksize=chunk):
-                    if split is not None:
-                        gain, question, branches = split
-                        if gain > best_gain:
-                            best_gain, best_question, best_split = \
-                                gain, question, branches
-        else:
-            for value in values:
-                question = Question(fields, i, value)
-
-                matching, non_matching = partition(dataset, indices, question)
-
-                if not matching or not non_matching:
-                    continue
-
-                gain = info_gain(dataset, matching, non_matching, uncertainty)
-
-                if gain > best_gain:
-                    best_gain, best_question = gain, question
-                    best_split = (matching, non_matching)
-
-    return best_gain, best_question, best_split
-
-
 class Node(object):
-    def __init__(self, fields, dataset, bootstrap, level=0):
+    def __init__(self, fields, dataset, bootstrap, level=0, out=True):
         self.fields = fields
         self.dataset = dataset
-        self.bootstrap = bootstrap
-        self.gini = gini(dataset, self.bootstrap)
-        self.build(level)
+        self.indices = bootstrap
+        self.out = out
+        self.gini = gini(dataset, self.indices)
+        self.build(level, out)
 
-    def build(self, level):
-        best_split = find_best_split(self.fields, self.dataset,
-                                     self.bootstrap, self.gini)
+    def build(self, level, out=True):
+        best_split = self.split(out)
        gain, question, branches = best_split
 
         if not branches:
             # Means we got 0 gain
-            print("Found a leaf at level {}".format(level))
-            self.predictions = count_labels(self.dataset, self.bootstrap)
+            if out:
+                print("Found a leaf at level {}".format(level))
+            self.predictions = count_labels(self.dataset, self.indices)
             self.is_leaf = True
             return
 
         left, right = branches
 
-        print("Found a level {} split:".format(level))
-        print(question)
-        print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))  # noqa
+        if out:
+            print("Found a level {} split:".format(level))
+            print(question)
+            print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right)))
 
-        self.left_branch = Node(self.fields, self.dataset, left, level + 1)
-        self.right_branch = Node(self.fields, self.dataset, right, level + 1)
+        self.left_branch = Node(self.fields, self.dataset, left, level + 1, out)
+        self.right_branch = Node(self.fields, self.dataset, right, level + 1, out)
 
         self.question = question
         self.is_leaf = False
         return
 
+    def split(self, out=True):
+        if out:
+            print("Splitting {} entries.".format(len(self.indices)))
+        best_gain, best_question, best_split = 0, None, None
+
+        uncertainty = self.gini or gini(self.dataset, self.indices)
+
+        cpus = mp.cpu_count()
+        columns = len(self.fields)
+
+        parallelize = len(self.indices) > 1000
+
+        if parallelize and out:
+            print("\n-- Using {} CPUs to parallelize the split search\n".format(cpus))
+
+        for i in range(columns):
+            values = unique_vals(self.dataset, self.indices, i)
+
+            if parallelize:
+                # Parallelize best split search
+                splits = []
+                for value in values:
+                    question = Question(self.fields, i, value)
+                    splits.append((question, self.dataset, self.indices, uncertainty))
+
+                chunk = max(int(len(splits)/(cpus*4)), 1)
+                with mp.Pool(cpus) as p:
+                    for split in p.imap_unordered(splitter, splits, chunksize=chunk):
+                        if split is not None:
+                            gain, question, branches = split
+                            if gain > best_gain:
+                                best_gain, best_question, best_split = gain, question, branches
+            else:
+                for value in values:
+                    question = Question(self.fields, i, value)
+
+                    matching, non_matching = partition(self.dataset, self.indices, question)
+
+                    if not matching or not non_matching:
+                        continue
+
+                    gain = info_gain(self.dataset, matching, non_matching, uncertainty)
+
+                    if gain > best_gain:
+                        best_gain, best_question = gain, question
+                        best_split = (matching, non_matching)
+
+        return best_gain, best_question, best_split
+
     def classify(self, entry):
         if self.is_leaf:
             return self
@@ -145,6 +147,20 @@ class Node(object):
         else:
             return self.right_branch.classify(entry)
 
+    def predict(self, entry):
+        successes = []
+        predict = self.classify(entry).predictions.copy()
+        total = float(sum(predict.values()))
+        for key, value in predict.items():
+            predict[key] = float(predict[key]) / total
+
+        for label in entry.label:
+            if label in predict:
+                success = predict[label]
+                successes.append(success)
+
+        return sum(successes), predict
+
     def print(self, spacing=''):
         if self.is_leaf:
             s = spacing + "Predict: "
@@ -160,7 +176,7 @@ class Node(object):
         s += spacing + "├─ True:\n"
         s += self.left_branch.print(spacing + "│ ") + '\n'
         s += spacing + "└─ False:\n"
-        s += self.right_branch.print(spacing + "│ ")
+        s += self.right_branch.print(spacing + "  ")
 
         return s
 
@@ -169,14 +185,20 @@ class Node(object):
 
 
 class Tree(object):
-    def __init__(self, fields, dataset, bootstrap):
+    def __init__(self, fields, dataset, bootstrap, out=True):
         self.fields = fields
         self.dataset = dataset
-        self.bootstrap = bootstrap
-        self.root = Node(self.fields, self.dataset, self.bootstrap)
+        self.indices = bootstrap
+        # Out of bag
+        self.oob = [i for i in range(len(dataset)) if i not in bootstrap]
+
+        self.root = Node(self.fields, self.dataset, self.indices, out=out)
 
     def classify(self, entry):
         return self.root.classify(entry)
 
+    def predict(self, entry):
+        return self.root.predict(entry)
+
     def __str__(self):
         return str(self.root)
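
Note: dividing by len(indices) instead of len(dataset) in gini() matters because a bootstrapped node only sees the (possibly repeated) indices it was handed, so the class shares must be taken over that sample. A standalone sketch of the same formula, written over a plain list of labels rather than (dataset, indices):

    def gini(labels):
        # Gini impurity: 1 minus the sum of squared class shares.
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        impurity = 1.0
        for label in counts:
            prob = counts[label] / float(len(labels))
            impurity -= prob ** 2
        return impurity

    print(gini(['G', 'G', 'K', 'M']))  # 0.625
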
diff --git a/tree_tester.py b/tree_tester.py
index 076266e..52175db 100644
--- a/tree_tester.py
+++ b/tree_tester.py
@@ -1,7 +1,10 @@
 import os
 from timeit import default_timer as timer
 from star_reader import read_stars
-from tree import Tree
+from tree_bootstrapped import Tree
+
+
+OUTPUT_FOLDER = "output/tree"
 
 
 def log(s, open_file):
@@ -10,13 +13,14 @@ def log(s, open_file):
 
 
 if __name__ == '__main__':
-    if not os.path.exists("output"):
-        os.mkdir("output")
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.mkdir(OUTPUT_FOLDER)
 
-    if not os.path.exists("output/tree_testing.txt"):
-        output = open("output/tree_testing.txt", 'w', encoding="utf-8")
+    if not os.path.exists(OUTPUT_FOLDER + "/testing.txt"):
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'w',
+                      encoding="utf-8")
     else:
-        output = open("output/tree_testing.txt", 'a', encoding="utf-8")
+        output = open(OUTPUT_FOLDER + "/testing.txt", 'a', encoding="utf-8")
 
     dataset, fields = read_stars()
 
@@ -25,36 +29,34 @@ if __name__ == '__main__':
     log("Training Tree...", output)
     t_start = timer()
 
-    split = int(len(dataset) * 0.65)
+    cut = 0.02
+
+    split = int(len(dataset) * cut)
     training, testing = dataset[:split], dataset[split + 1:]
 
+    log("Dataset split: Training with {}% of the set".format(cut*100), output)
     log("Training set: {} entries.".format(len(training)), output)
     log("Testing set: {} entries.".format(len(testing)), output)
 
-    tree = Tree(fields, training)
+    tree = Tree(fields, training, [i for i in range(len(training))])
 
     t_end = timer()
-    timestamp = "Training complete.\nElapsed time: {:.3f}\n"
-    log(timestamp.format(t_end - t_start), output)
+    log("Training complete.\nElapsed time: {:.3f}\n".format(t_end - t_start), output)
 
     log(tree, output)
 
     log("\n-- TEST --\n", output)
 
-    failures = 0
+    total_success = 0
 
     for entry in testing:
-        label = entry.label
-        predict = tree.predict(entry)
-        if predict not in label:
-            print("Actual: {}\tPredicted: {}".format(label, predict))
-            failures += 1
+        success, predict = tree.predict(entry)
+        print("Actual: {}\tPredicted: {}.\tSuccess: {}".format(entry.label, predict, success))
+        total_success += success
 
     tested = len(testing)
-    success = tested - failures
-    s_rate = float(success)*100/float(tested)
+    s_rate = float(total_success)*100/float(tested)
 
-    log("\nSuccessfully predicted {} out of {} entries."
-        .format(success, tested), output)
+    log("\nTested {} entries.".format(tested), output)
 
     log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output)
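
Note: putting the pieces together, a minimal end-to-end run that mirrors the two testers (the 0.4 cutoff and forest size of 10 are illustrative; read_stars() and the entry format come from star_reader.py):

    import random
    from star_reader import read_stars
    from forest import Forest

    dataset, fields = read_stars()
    random.shuffle(dataset)

    split = int(len(dataset) * 0.4)
    training, testing = dataset[:split], dataset[split:]

    # Train a bagged forest quietly, then report test accuracy and OOB error.
    forest = Forest(fields, training, 10, out=False)
    hits = sum(1 for entry in testing if forest.predict(entry) in entry.label)
    print("Accuracy: {:.2f}%".format(100.0 * hits / len(testing)))
    print("Out-of-bag error: {:.2f}%".format(100.0 * forest.error_oob()))
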