mirror of
https://gitlab.com/vylion/velascobot.git
synced 2025-04-19 21:46:35 +02:00
- Generator (Markov) ✔️ - ChatCard (Chatlog) ✔️ - ChatReader (Scribe) 🚧 - Speaker 🚧 - - Speaker->get_reader()... 🚧
166 lines
5.9 KiB
Python
166 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import random
|
|
import json
|
|
|
|
|
|
def rewrite(text):
|
|
# This splits strings into lists of words delimited by space.
|
|
# Other whitespaces are appended space characters so they are included
|
|
# as their own Markov chain element, so as not to pollude with
|
|
# "different" words that would only differ in having a whitespace
|
|
# attached or not
|
|
words = text.replace('\n', '\n ').split(' ')
|
|
i = 0
|
|
while i < len(words):
|
|
w = words[i].strip(' \t')
|
|
if len(w) > 0:
|
|
words[i] = w
|
|
else:
|
|
del words[i]
|
|
i -= 1
|
|
i += 1
|
|
return words
|
|
|
|
|
|
def getkey(w1, w2):
|
|
# This gives a dictionary key from 2 words, ignoring case
|
|
key = (w1.strip().casefold(), w2.strip().casefold())
|
|
return str(key)
|
|
|
|
|
|
def getwords(key):
|
|
# This turns a dictionary key back into 2 separate words
|
|
words = key.strip('()').split(', ')
|
|
for i in range(len(words)):
|
|
words[i].strip('\'')
|
|
return words
|
|
|
|
|
|
def triplets(wordlist):
|
|
# Generates triplets of words from the given data string. So if our string
|
|
# were "What a lovely day", we'd generate (What, a, lovely) and then
|
|
# (a, lovely, day).
|
|
if len(wordlist) < 3:
|
|
return
|
|
|
|
for i in range(len(wordlist) - 2):
|
|
yield (wordlist[i], wordlist[i+1], wordlist[i+2])
|
|
|
|
|
|
class Generator(object):
|
|
MODE_JSON = "MODE_JSON"
|
|
# This is to mark when we want to create a Generator object from a given JSON
|
|
|
|
MODE_LIST = "MODE_LIST"
|
|
# This is to mark when we want to create a Generator object from a given list of words
|
|
|
|
MODE_CHAT_DATA = "MODE_CHAT_DATA"
|
|
# This is to mark when we want to create a Generator object from Chat data (WIP)
|
|
|
|
HEAD = "\n^MESSAGE_SEPARATOR^"
|
|
TAIL = "^MESSAGE_SEPARATOR^"
|
|
|
|
def __init__(self, load=None, mode=None):
|
|
if mode is not None:
|
|
# We ain't creating a new Generator from scratch
|
|
if mode == Generator.MODE_JSON:
|
|
self.cache = json.loads(load)
|
|
elif mode == Generator.MODE_LIST:
|
|
self.cache = {}
|
|
self.load_list(load)
|
|
else:
|
|
self.cache = {}
|
|
# The cache is where we store our words
|
|
|
|
def load_list(self, many):
|
|
# Takes a list of strings and adds them to the cache one by one
|
|
for one in many:
|
|
self.add(one)
|
|
|
|
def dumps(self):
|
|
# Dumps the cache dictionary into a JSON-formatted string
|
|
return json.dumps(self.cache)
|
|
|
|
def loads(dump):
|
|
# Loads the cache dictionary from a JSON-formatted string
|
|
if len(dump) == 0:
|
|
# faulty dump gives default Generator
|
|
return Generator()
|
|
# otherwise
|
|
return Generator(load=dump, mode=Generator.MODE_JSON)
|
|
|
|
def add(self, text):
|
|
# This takes a string and stores it in the cache, preceding it
|
|
# with the HEAD that marks the beginning of a new message and
|
|
# following it with the TAIL that marks the end
|
|
words = [Generator.HEAD]
|
|
text = text + " " + Generator.TAIL
|
|
words.extend(text.split())
|
|
self.database(rewrite(text))
|
|
|
|
def database(self, words):
|
|
# This takes a list of words and stores it in the cache, adding
|
|
# a special entry for the first word (the HEAD marker)
|
|
for w1, w2, w3 in triplets(words):
|
|
if w1 == Generator.HEAD:
|
|
if w1 in self.cache:
|
|
self.cache[Generator.HEAD].append(w2)
|
|
else:
|
|
self.cache[Generator.HEAD] = [w2]
|
|
key = getkey(w1, w2)
|
|
if key in self.cache:
|
|
# if the key exists, add the new word to the end of the chain
|
|
self.cache[key].append(w3)
|
|
else:
|
|
# otherwise, create a new entry for the new key starting with
|
|
# the new end of chain
|
|
self.cache[key] = [w3]
|
|
|
|
def generate(self, size=50, silence=False):
|
|
# This generates the Markov text/word chain
|
|
# silence tells if mentions should be silenced
|
|
if len(self.cache) == 0:
|
|
# If there is nothing in the cache we cannot generate anything
|
|
return ""
|
|
|
|
w1 = random.choice(self.cache[Generator.HEAD])
|
|
w2 = random.choice(self.cache[getkey(Generator.HEAD, w1)])
|
|
# Start with a message HEAD and a random message starting word
|
|
gen_words = []
|
|
for i in range(size):
|
|
# As long as we don't go over the size value (max. message length)...
|
|
if silence and w1.startswith("@") and len(w1) > 1:
|
|
gen_words.append(w1.replace("@", "(@)"))
|
|
# ...append the first word, silencing any possible username mention
|
|
else:
|
|
gen_words.append(w1)
|
|
# ..append the first word
|
|
if w2 == Generator.TAIL or not getkey(w1, w2) in self.cache:
|
|
# When there's no key from the last 2 words to follow the chain,
|
|
# or we reached a separation between messages, stop
|
|
break
|
|
else:
|
|
w1, w2 = w2, random.choice(self.cache[getkey(w1, w2)])
|
|
# Make the second word to be the new first word, and
|
|
# make a new random word that follows the chain to be
|
|
# the new second word
|
|
return ' '.join(gen_words)
|
|
|
|
def cross(self, gen):
|
|
# cross 2 Generators into this one
|
|
for key in gen.cache:
|
|
if key in self.cache:
|
|
self.cache[key].extend(gen.cache[key])
|
|
else:
|
|
self.cache[key] = list(gen.cache[key])
|
|
|
|
def new_count(self):
|
|
# Count again the number of messages if the current number is unreliable
|
|
count = 0
|
|
for key in self.cache:
|
|
for word in self.cache[key]:
|
|
if word == Generator.TAIL:
|
|
count += 1
|
|
# by just counting message separators
|
|
return count
|