# velascobot/markov.py

#!/usr/bin/env python3

import random
import json

def getkey(w1, w2):
    """Return the string form of the normalized ``(w1, w2)`` pair.

    Each word has surrounding whitespace removed and is case-folded; the
    resulting 2-tuple is then rendered with ``str`` so the key is a plain
    string.
    """
    normalized = tuple(word.strip().casefold() for word in (w1, w2))
    return str(normalized)

def getwords(key):
    """Split a key string back into its list of words.

    A key that is the ``str`` of a tuple of strings (for example
    ``"('hello', 'world')"``) is parsed as a Python literal, so quoting and
    escaping are undone exactly. Any other input falls back to trimming the
    surrounding parentheses, splitting on ``', '`` and removing the single
    quotes around each piece.

    Args:
        key: The key string to decode.

    Returns:
        A list with the words contained in ``key``.
    """
    import ast

    try:
        parsed = ast.literal_eval(key)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal at all: use the plain textual split below.
        parsed = None

    if isinstance(parsed, tuple) and all(isinstance(w, str) for w in parsed):
        return list(parsed)

    # str.strip returns a new string, so the stripped values must be kept.
    return [piece.strip('\'') for piece in key.strip('()').split(', ')]

def triples(wordlist):
    """Yield every run of three consecutive items of ``wordlist``.

    For the words of "What a lovely day" this produces
    (What, a, lovely) followed by (a, lovely, day). Inputs with fewer than
    three items produce nothing.
    """
    window_count = len(wordlist) - 2
    start = 0
    while start < window_count:
        yield (wordlist[start], wordlist[start + 1], wordlist[start + 2])
        start += 1

class Markov(object):
    """Word-level Markov chain keyed on pairs of consecutive words.

    ``cache`` is a dict that maps the string returned by ``getkey(w1, w2)``
    to the list of words observed right after that pair. Repeated
    observations are kept as repeated list entries. One extra entry, stored
    under ``Markov.Head``, lists every word seen directly after a ``Head``
    marker and is used to pick the first word of generated text.
    """

    # Values accepted by the ``mode`` argument of ``__init__``.
    ModeJson = "MODE_JSON"
    ModeList = "MODE_LIST"
    ModeChatData = "MODE_CHAT_DATA"  # declared here; not handled by __init__

    # Marker words for the start and the end of a learned sequence.
    Head = "\n^MESSAGE_SEPARATOR^"
    Tail = "^MESSAGE_SEPARATOR^"

    def __init__(self, load=None, mode=None):
        """Create a chain, optionally pre-filled from ``load``.

        Args:
            load: A JSON string when ``mode`` is ``ModeJson``; an iterable of
                text lines when ``mode`` is ``ModeList``. Ignored otherwise.
            mode: One of the ``Mode*`` constants, or ``None`` for an empty
                chain.
        """
        # Start empty so an unhandled mode still leaves a usable object
        # instead of one without a ``cache`` attribute.
        self.cache = {}
        if mode == Markov.ModeJson:
            self.cache = json.loads(load)
        elif mode == Markov.ModeList:
            self.loadList(load)

    def loadList(self, lines):
        """Learn every line of ``lines``, each prefixed with ``Head``.

        Lines are split on whitespace. No ``Tail`` marker is appended here.
        """
        for line in lines:
            words = [Markov.Head]
            words.extend(line.split())
            self.learn_words(words)

    def dumps(self):
        """Return ``cache`` serialized as a JSON string."""
        return json.dumps(self.cache)

    @staticmethod
    def loads(dump):
        """Build a ``Markov`` from a string produced by ``dumps``.

        An empty ``dump`` yields an empty chain. Declared as a static method
        so it works both as ``Markov.loads(s)`` and on an instance.
        """
        if len(dump) == 0:
            return Markov()
        return Markov(load=dump, mode=Markov.ModeJson)

    def learn_words(self, words):
        """Add the word sequence ``words`` to the chain."""
        self.database(words)

    def database(self, wordlist):
        """Record every consecutive word triple of ``wordlist`` in ``cache``.

        For each triple ``(w1, w2, w3)``, ``w3`` is appended to the list
        stored under ``getkey(w1, w2)``. When ``w1`` is ``Head``, ``w2`` is
        also appended to the list stored under ``Head``.
        """
        for w1, w2, w3 in triples(wordlist):
            if w1 == Markov.Head:
                self.cache.setdefault(Markov.Head, []).append(w2)
            self.cache.setdefault(getkey(w1, w2), []).append(w3)

    def generate_markov_text(self, size=50, silence=False):
        """Generate text by walking the chain from a random starting word.

        Args:
            size: Maximum number of words to emit.
            silence: When true, every ``@`` in a word that starts with ``@``
                and is longer than one character is replaced with ``(@)``.

        Returns:
            The generated words joined by single spaces, or ``""`` when the
            chain has no starting words.
        """
        starters = self.cache.get(Markov.Head)
        if not starters:
            # Covers an empty cache as well as one that has no Head entry.
            return ""
        w1 = random.choice(starters)
        openers = self.cache.get(getkey(Markov.Head, w1))
        # Without a recorded follower, treat the first word as the last one.
        w2 = random.choice(openers) if openers else Markov.Tail
        gen_words = []
        for _ in range(size):
            if silence and w1.startswith("@") and len(w1) > 1:
                gen_words.append(w1.replace("@", "(@)"))
            else:
                gen_words.append(w1)
            if w2 == Markov.Tail:
                break
            followers = self.cache.get(getkey(w1, w2))
            if not followers:
                # Dead end: nothing was ever observed after this pair.
                break
            w1, w2 = w2, random.choice(followers)
        return ' '.join(gen_words)

    def cross(self, gen):
        """Merge the observations of another chain ``gen`` into this one.

        Follower lists of keys present in both chains are concatenated; keys
        only present in ``gen`` are copied, so later changes to this chain do
        not alter ``gen``'s lists.
        """
        for key, words in gen.cache.items():
            self.cache.setdefault(key, []).extend(words)

    def new_count(self):
        """Return how many ``Tail`` entries are stored across ``cache``."""
        return sum(words.count(Markov.Tail) for words in self.cache.values())