velascobot/markov.py
vylion 1d1bd6034e Uploading Velasco v1.4
- It now saves the dictionary of vocabulary directly into the file. No
need to deal with the full list of messages.
- This also means that the amount of elements kept in memory has been
reduced, since there is no need to keep a list with all the words one
after another, apart from the dictionary.
- Modularized some constants, like the frequency of saves if the
frequency of speaking is too large, or the stop words that mark the
start and end of a message.
2017-09-21 15:39:53 +02:00

75 lines
2.1 KiB
Python

#!/usr/bin/env python3
import random
import json
# Marker prepended to every learned text (see Markov.add_text); it is also the
# cache key under which the first word of each learned text is stored.
# The leading newline survives tokenising because trim_and_split only strips
# spaces and tabs.
HEAD = "\n!kvl"
# Marker that makes Markov.generate_markov_text stop when it is drawn as the
# next word. NOTE(review): nothing in this file appends it to learned text;
# presumably the caller does - confirm.
TAIL = "!kvl"
def trim_and_split(text):
    """Split ``text`` on single spaces and trim every resulting piece.

    Only the space character separates pieces. Spaces and tabs are then
    stripped from both ends of each piece; newlines are left untouched, so a
    marker such as ``HEAD`` (which starts with a newline) comes through
    unchanged. Consecutive spaces yield empty strings, which are kept.

    Args:
        text: The string to tokenise.

    Returns:
        A list with the trimmed pieces in their original order.
    """
    return [piece.strip(' \t') for piece in text.split(' ')]
def getkey(w1, w2):
    """Build the cache key for a pair of consecutive words.

    Both words are stripped of surrounding whitespace and case-folded, so
    lookups ignore case and padding. The pair is returned as the string form
    of a 2-tuple rather than the tuple itself.
    """
    normalized = tuple(word.strip().casefold() for word in (w1, w2))
    return str(normalized)
def triples(wordlist):
    """Yield every run of three consecutive items of ``wordlist``.

    For ``["What", "a", "lovely", "day"]`` this produces
    ``("What", "a", "lovely")`` and then ``("a", "lovely", "day")``.
    A list with fewer than three items produces nothing.
    """
    # zip stops at the shortest slice, which is exactly the last full window.
    yield from zip(wordlist, wordlist[1:], wordlist[2:])
class Markov(object):
    """Word-level Markov chain keyed on pairs of words, stored in a dict.

    ``self.cache`` maps string keys to lists of follower words:

    * ``HEAD`` maps to every word that has opened a learned text.
    * ``getkey(w1, w2)`` maps to every word seen right after the pair
      ``(w1, w2)``.

    Followers are appended every time they are seen, so duplicates stay in
    the lists and ``random.choice`` draws them in proportion to their count.
    """

    def __init__(self, text=None, from_json=False):
        """Create a chain, optionally seeded with data.

        Args:
            text: When ``from_json`` is false, an iterable of strings, each
                learned through ``add_text`` (``None`` leaves the chain
                empty). When ``from_json`` is true, a JSON document such as
                the one returned by ``to_json``.
            from_json: Selects how ``text`` is interpreted.
        """
        if from_json:
            self.cache = json.loads(text)
            return
        self.cache = {}
        if text is not None:
            # NOTE: a bare string here is iterated character by character.
            for line in text:
                self.add_text(line)

    def to_json(self):
        """Return the whole cache serialised as a JSON string."""
        return json.dumps(self.cache)

    @classmethod
    def from_json(cls, string):
        """Alternate constructor: rebuild a chain from ``to_json`` output.

        Declared as a classmethod so it works both as ``Markov.from_json(s)``
        and when called on an existing instance.
        """
        return cls(string, True)

    def add_text(self, text):
        """Learn one text, prefixing it with ``HEAD`` to mark its start.

        NOTE(review): ``TAIL`` is not appended here; presumably the caller
        adds it when it wants generation to stop at the end of a text.
        """
        self.database(trim_and_split(HEAD + " " + text))

    def database(self, wordlist):
        """Record every consecutive word triple of ``wordlist`` in the cache.

        Lists shorter than three words contribute nothing.
        """
        for w1, w2, w3 in triples(wordlist):
            if w1 == HEAD:
                # Remember w2 as a possible opening word.
                self.cache.setdefault(HEAD, []).append(w2)
            self.cache.setdefault(getkey(w1, w2), []).append(w3)

    def generate_markov_text(self, size=50):
        """Generate a text of at most ``size`` words by walking the chain.

        Starts from a random opening word and keeps drawing a follower for
        the current word pair. Generation stops when ``TAIL`` is drawn
        (``TAIL`` itself is never emitted), when the current pair has no
        recorded followers, or once ``size`` words have been produced.

        Returns:
            The generated words joined by single spaces.

        Raises:
            KeyError: If the cache has no ``HEAD`` entry, i.e. nothing with
                at least two words after ``HEAD`` has been learned.
        """
        w1 = random.choice(self.cache[HEAD])
        w2 = random.choice(self.cache[getkey(HEAD, w1)])
        gen_words = []
        for _ in range(size):
            gen_words.append(w1)
            key = getkey(w1, w2)
            if w2 == TAIL or key not in self.cache:
                # At a dead end w2 is a real word that was already chosen as
                # the follower of w1, so emit it instead of dropping it
                # (while still respecting the size limit).
                if w2 != TAIL and len(gen_words) < size:
                    gen_words.append(w2)
                print("Generated text")
                break
            w1, w2 = w2, random.choice(self.cache[key])
        return ' '.join(gen_words)