diff --git a/.gitignore b/.gitignore index 895e92c..fa354c7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__/* output/* .vscode/* +hygdata_v3.csv \ No newline at end of file diff --git a/star.py b/star.py index 68c70f7..cd132d5 100644 --- a/star.py +++ b/star.py @@ -12,4 +12,5 @@ class Star(object): classification = ' or '.join(self.label) else: classification = self.label - return 'Star {} {} of spectral type {}'.format(self.name, self.data, classification) + return ("Star {} {} of spectral type {}" + .format(self.name, self.data, classification)) diff --git a/star_reader.py b/star_reader.py index cb2de74..2ffc947 100644 --- a/star_reader.py +++ b/star_reader.py @@ -15,6 +15,8 @@ def make_star(header, row, fields=None): num = float(value) if num == int(num): num = int(num) + else: + num = round(num, 2) value = num except ValueError: if value == '': @@ -31,10 +33,13 @@ def make_star(header, row, fields=None): type_list = value.split('/') types = [] - for sp_type in type_list: - if sp_type and sp_type[0] in STAR_CLASSES: - types.append(sp_type[0]) + for star_type in type_list: + for sp_type in STAR_CLASSES: + if star_type and sp_type in star_type.upper(): + types.append(sp_type) value = ''.join(set(types)) + if value == '': + return None data[field] = value diff --git a/tree.py b/tree.py index f04e380..65eed4b 100644 --- a/tree.py +++ b/tree.py @@ -1,4 +1,5 @@ - +import random +import multiprocessing as mp from question import Question @@ -46,7 +47,17 @@ def info_gain(left_set, right_set, uncertainty): return uncertainty - p * gini(left_set) - (1-p) * gini(right_set) +def splitter(info): + question, dataset, uncertainty = info + matching, non_matching = partition(dataset, question) + if not matching or not non_matching: + return None + gain = info_gain(matching, non_matching, uncertainty) + return (gain, question, (matching, non_matching)) + + def find_best_split(fields, dataset, uncertainty=None): + print("Splitting {} 
entries.".format(len(dataset))) best_gain, best_question, best_split = 0, None, None uncertainty = uncertainty or gini(dataset) @@ -55,19 +66,41 @@ def find_best_split(fields, dataset, uncertainty=None): for i in range(columns): values = unique_vals(dataset, i) - for value in values: - question = Question(fields, i, value) - matching, non_matching = partition(dataset, question) + if len(dataset) > 400: + # Parallelize best split search + cpus = mp.cpu_count() + if i == 0: + print("-- Using {} CPUs to parallelize the split search." + .format(cpus)) + splits = [] + for value in values: + question = Question(fields, i, value) + splits.append((question, dataset, uncertainty)) - if not matching or not non_matching: - continue + chunk = max(int(len(splits)/(cpus*4)), 1) + with mp.Pool(cpus) as p: + for split in p.imap_unordered(splitter, splits, + chunksize=chunk): + if split is not None: + gain, question, branches = split + if gain > best_gain: + best_gain, best_question, best_split = \ + gain, question, branches + else: + for value in values: + question = Question(fields, i, value) - gain = info_gain(matching, non_matching, uncertainty) + matching, non_matching = partition(dataset, question) - if gain > best_gain: - best_gain, best_question = gain, question - best_split = (matching, non_matching) + if not matching or not non_matching: + continue + + gain = info_gain(matching, non_matching, uncertainty) + + if gain > best_gain: + best_gain, best_question = gain, question + best_split = (matching, non_matching) return best_gain, best_question, best_split @@ -110,6 +143,13 @@ class Node(object): else: return self.right_branch.classify(entry) + def predict(self, entry): + predict = self.classify(entry).predictions + choices = [] + for label, count in predict.items(): + choices.extend([label]*count) + return random.choice(choices) + def print(self, spacing=''): if self.is_leaf: s = spacing + "Predict: " @@ -121,10 +161,10 @@ class Node(object): return s + str(probs) s = 
spacing + str(self.question) + '\n' - s += spacing + "-> True:\n" - s += self.left_branch.print(spacing + " ") + '\n' - s += spacing + "-> False:\n" - s += self.right_branch.print(spacing + " ") + s += spacing + "├─ True:\n" + s += self.left_branch.print(spacing + "│ ") + '\n' + s += spacing + "└─ False:\n" + s += self.right_branch.print(spacing + " ") return s @@ -141,5 +181,8 @@ class Tree(object): def classify(self, entry): return self.root.classify(entry) + def predict(self, entry): + return self.root.predict(entry) + def __str__(self): return str(self.root) diff --git a/tree_bootstrapped.py b/tree_bootstrapped.py new file mode 100644 index 0000000..27ac8fb --- /dev/null +++ b/tree_bootstrapped.py @@ -0,0 +1,182 @@ +import multiprocessing as mp +from question import Question + + +def unique_vals(dataset, indices, column): + return set([dataset[i].data[column] for i in indices]) + + +def count_labels(dataset, indices): + counts = {} + for i in indices: + for label in dataset[i].label: + if label not in counts: + counts[label] = 1 + else: + counts[label] += 1 + return counts + + +def partition(dataset, indices, question): + matching, non_matching = [], [] + + for i in indices: + if question.match(dataset[i]): + matching.append(i) + else: + non_matching.append(i) + + return matching, non_matching + + +def gini(dataset, indices): + counts = count_labels(dataset, indices) + impurity = 1 + + for label in counts: + prob = counts[label] / float(len(indices)) + impurity -= prob**2 + + return impurity + + +def info_gain(dataset, lid, rid, uncertainty): + p = float(len(lid)) / float(len(lid) + len(rid)) + + return uncertainty - p * gini(dataset, lid) - (1-p) * gini(dataset, rid) + + +def splitter(info): + question, dataset, indices, uncertainty = info + matching, non_matching = partition(dataset, indices, question) + if not matching or not non_matching: + return None + gain = info_gain(dataset, matching, non_matching, uncertainty) + return (gain, question, (matching, 
non_matching)) + + +def find_best_split(fields, dataset, indices, uncertainty=None): + print("Splitting {} entries.".format(len(dataset))) + best_gain, best_question, best_split = 0, None, None + + uncertainty = uncertainty or gini(dataset, indices) + + columns = len(fields) + + for i in range(columns): + values = unique_vals(dataset, indices, i) + + if len(indices) > 400: + # Parallelize best split search + cpus = mp.cpu_count() + if i == 0: + print("-- Using {} CPUs to parallelize the split search." + .format(cpus)) + splits = [] + for value in values: + question = Question(fields, i, value) + splits.append((question, dataset, indices, uncertainty)) + + chunk = max(int(len(splits)/(cpus*4)), 1) + with mp.Pool(cpus) as p: + for split in p.imap_unordered(splitter, splits, + chunksize=chunk): + if split is not None: + gain, question, branches = split + if gain > best_gain: + best_gain, best_question, best_split = \ + gain, question, branches + else: + for value in values: + question = Question(fields, i, value) + + matching, non_matching = partition(dataset, indices, question) + + if not matching or not non_matching: + continue + + gain = info_gain(dataset, matching, non_matching, uncertainty) + + if gain > best_gain: + best_gain, best_question = gain, question + best_split = (matching, non_matching) + + return best_gain, best_question, best_split + + +class Node(object): + def __init__(self, fields, dataset, bootstrap, level=0): + self.fields = fields + self.dataset = dataset + self.bootstrap = bootstrap + self.gini = gini(dataset, self.bootstrap) + self.build(level) + + def build(self, level): + best_split = find_best_split(self.fields, self.dataset, + self.bootstrap, self.gini) + gain, question, branches = best_split + + if not branches: + # Means we got 0 gain + print("Found a leaf at level {}".format(level)) + self.predictions = count_labels(self.dataset, self.bootstrap) + self.is_leaf = True + return + + left, right = branches + + print("Found a level {} 
split:".format(level)) + print(question) + print("Matching: {} entries\tNon-matching: {} entries".format(len(left), len(right))) # noqa + + self.left_branch = Node(self.fields, self.dataset, left, level + 1) + self.right_branch = Node(self.fields, self.dataset, right, level + 1) + self.question = question + self.is_leaf = False + return + + def classify(self, entry): + if self.is_leaf: + return self + + if self.question.match(entry): + return self.left_branch.classify(entry) + else: + return self.right_branch.classify(entry) + + def print(self, spacing=''): + if self.is_leaf: + s = spacing + "Predict: " + total = float(sum(self.predictions.values())) + probs = {} + for label in self.predictions: + prob = self.predictions[label] * 100 / total + probs[label] = "{:.2f}%".format(prob) + return s + str(probs) + + s = spacing + ("(Gini: {:.2f}) {}\n" + .format(self.gini, str(self.question))) + s += spacing + "├─ True:\n" + s += self.left_branch.print(spacing + "│ ") + '\n' + s += spacing + "└─ False:\n" + s += self.right_branch.print(spacing + "│ ") + + return s + + def __str__(self): + return self.print() + + +class Tree(object): + def __init__(self, fields, dataset, bootstrap): + self.fields = fields + self.dataset = dataset + self.bootstrap = bootstrap + self.root = Node(self.fields, self.dataset, self.bootstrap) + + def classify(self, entry): + return self.root.classify(entry) + + def __str__(self): + return str(self.root) diff --git a/tree_tester.py b/tree_tester.py index 9b5cfa8..076266e 100644 --- a/tree_tester.py +++ b/tree_tester.py @@ -14,9 +14,9 @@ if __name__ == '__main__': os.mkdir("output") if not os.path.exists("output/tree_testing.txt"): - output = open("output/tree_testing.txt", 'w') + output = open("output/tree_testing.txt", 'w', encoding="utf-8") else: - output = open("output/tree_testing.txt", 'a') + output = open("output/tree_testing.txt", 'a', encoding="utf-8") dataset, fields = read_stars() @@ -26,7 +26,6 @@ if __name__ == '__main__': t_start = 
timer() split = int(len(dataset) * 0.65) - split = 500 training, testing = dataset[:split], dataset[split + 1:] log("Training set: {} entries.".format(len(training)), output) log("Testing set: {} entries.".format(len(testing)), output) @@ -41,9 +40,22 @@ if __name__ == '__main__': log("\n-- TEST --\n", output) + failures = 0 + for entry in testing: label = entry.label - predict = tree.classify(entry) - log("Actual: {}\tPredicted: {}".format(label, predict), output) + predict = tree.predict(entry) + if predict not in label: + print("Actual: {}\tPredicted: {}".format(label, predict)) + failures += 1 + + tested = len(testing) + success = tested - failures + s_rate = float(success)*100/float(tested) + + log("\nSuccessfully predicted {} out of {} entries." + .format(success, tested), output) + + log("Accuracy: {:.2f}%\nError: {:.2f}%".format(s_rate, 100-s_rate), output) output.close()