Overhaul 2 WIP

- Generator (Markov) ✔️ - ChatCard (Chatlog) ✔️ - ChatReader (Scribe) 🚧 - Speaker 🚧 - - Speaker->get_reader()... 🚧
2025-06-06 20:44:38 +02:00 · 2020-10-07 23:32:10 +02:00 · 2020-10-07 23:32:10 +02:00 · 328bd6adbf
commit 328bd6adbf
parent 950bbfbabd
11 changed files with 548 additions and 475 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 chatlogs/*
 __pycache__/*
 misc/*
+test/*
--- a/archivist.py
+++ b/archivist.py
@ -1,13 +1,14 @@

 import os, errno, random, pickle
-from scribe import Scribe
-from markov import Markov
+from chatreader import ChatReader as Reader
+from generator import Generator
+

 class Archivist(object):

    def __init__(self, logger, chatdir=None, chatext=None, admin=0,
-            freqIncrement=5, saveCount=15, maxFreq=100000, maxLen=50,
-            readOnly=False, filterCids=None, bypass=False
+                 freq_increment=5, save_count=15, max_period=100000, max_len=50,
+                 read_only=False, filter_cids=None, bypass=False
                 ):
        if chatdir is None or len(chatdir) == 0:
            raise ValueError("Chatlog directory name is empty")
@ -17,43 +18,46 @@ class Archivist(object):
        self.chatdir = chatdir
        self.chatext = chatext
        self.admin = admin
-        self.freqIncrement = freqIncrement
-        self.saveCount = saveCount
-        self.maxFreq = maxFreq
-        self.maxLen = maxLen
-        self.readOnly = readOnly
-        self.filterCids = filterCids
+        self.freq_increment = freq_increment
+        self.save_count = save_count
+        self.max_period = max_period
+        self.max_len = max_len
+        self.read_only = read_only
+        self.filter_cids = filter_cids
        self.bypass = bypass
-        self.scribeFolder = chatdir + "chat_{tag}"
-        self.scribePath = chatdir + "chat_{tag}/{file}{ext}"
+    
+    def chat_folder(self, *formatting, **key_format):
+        return (self.chatdir + "chat_{tag}").format(*formatting, **key_format)
+
+    def chat_file(self, *formatting, **key_format):
+        return (self.chatdir + "chat_{tag}/{file}{ext}").format(*formatting, **key_format)

    def store(self, tag, log, gen):
-        scribefolder = self.scribeFolder.format(tag=tag)
-        cardfile = self.scribePath.format(tag=tag, file="card", ext=".txt")
-        if self.readOnly:
+        chat_folder = self.chat_folder(tag=tag)
+        chat_card = self.chat_file(tag=tag, file="card", ext=".txt")
+        if self.read_only:
            return
        try:
-            if not os.path.exists(scribefolder):
-                os.makedirs(scribefolder, exist_ok=True)
-                self.logger.info("Storing a new chat. Folder {} created.".format(scribefolder))
+            if not os.path.exists(chat_folder):
+                os.makedirs(chat_folder, exist_ok=True)
+                self.logger.info("Storing a new chat. Folder {} created.".format(chat_folder))
        except:
-            self.logger.error("Failed creating {} folder.".format(scribefolder))
+            self.logger.error("Failed creating {} folder.".format(chat_folder))
            return
-        file = open(cardfile, 'w')
+        file = open(chat_card, 'w')
        file.write(log)
        file.close()
        if gen is not None:
-            recordfile = self.scribePath.format(tag=tag, file="record", ext=self.chatext)
-            file = open(recordfile, 'w')
+            chat_record = self.chat_file(tag=tag, file="record", ext=self.chatext)
+            file = open(chat_record, 'w')
            file.write(gen)
            file.close()

-    def recall(self, filename):
-        #print("Loading chat: " + path)
+    def get_reader(self, filename):
        file = open(self.chatdir + filename, 'rb')
        scribe = None
        try:
-            scribe = Scribe.Recall(pickle.load(file), self)
+            reader, vocab = Reader.FromFile(pickle.load(file), self)
            self.logger.info("Unpickled {}{}".format(self.chatdir, filename))
        except pickle.UnpicklingError:
            file.close()
@ -68,27 +72,24 @@ class Archivist(object):
        file.close()
        return scribe

-    def wakeScribe(self, filepath):
+    def load_reader(self, filepath):
        file = open(filepath.format(filename="card", ext=".txt"), 'r')
        card = file.read()
        file.close()
-        return Scribe.FromFile(card, self)
+        return Reader.FromCard(card, self)

    def wakeParrot(self, tag):
-        filepath = self.scribePath.format(tag=tag, file="record", ext=self.chatext)
+        filepath = self.chat_file(tag=tag, file="record", ext=self.chatext)
        try:
            file = open(filepath, 'r')
-            #print("\nOPening " + filepath + "\n")
            record = file.read()
            file.close()
-            return Markov.loads(record)
+            return Generator.loads(record)
        except:
-            self.logger.error("Parrot file {} not found.".format(filepath))
+            self.logger.error("Record file {} not found.".format(filepath))
            return None

-    def wakeScriptorium(self):
-        scriptorium = {}
-
+    def readers_pass(self):
        directory = os.fsencode(self.chatdir)
        for subdir in os.scandir(directory):
            dirname = subdir.name.decode("utf-8")
@ -96,17 +97,16 @@ class Archivist(object):
                cid = dirname[5:]
                try:
                    filepath = self.chatdir + dirname + "/{filename}{ext}"
-                    scriptorium[cid] = self.wakeScribe(filepath)
-                    self.logger.info("Chat {} contents:\n".format(cid) + scriptorium[cid].chat.dumps())
+                    reader = self.load_reader(filepath)
+                    self.logger.info("Chat {} contents:\n".format(cid) + reader.card.dumps())
                    if self.bypass:
-                        scriptorium[cid].setFreq(random.randint(self.maxFreq//2, self.maxFreq))
-                    elif scriptorium[cid].freq() > self.maxFreq:
-                        scriptorium[cid].setFreq(self.maxFreq)
+                        reader.set_period(random.randint(self.max_period//2, self.max_period))
+                    elif scriptorium[cid].freq() > self.max_period:
+                        scriptorium[cid].setFreq(self.max_period)
                except Exception as e:
                    self.logger.error("Failed reading {}".format(dirname))
                    self.logger.exception(e)
                    raise e
-        return scriptorium

    """
    def wake_old(self):
@ -117,17 +117,17 @@ class Archivist(object):
            filename = os.fsdecode(file)
            if filename.endswith(self.chatext):
                cid = filename[:-(len(self.chatext))]
-                if self.filterCids is not None:
+                if self.filter_cids is not None:
                    #self.logger.info("CID " + cid)
-                    if not cid in self.filterCids:
+                    if not cid in self.filter_cids:
                        continue
                scriptorium[cid] = self.recall(filename)
                scribe = scriptorium[cid]
                if scribe is not None:
                    if self.bypass:
-                        scribe.setFreq(random.randint(self.maxFreq//2, self.maxFreq))
-                    elif scribe.freq() > self.maxFreq:
-                        scribe.setFreq(self.maxFreq)
+                        scribe.setFreq(random.randint(self.max_period//2, self.max_period))
+                    elif scribe.freq() > self.max_period:
+                        scribe.setFreq(self.max_period)
                    self.logger.info("Loaded chat " + scribe.title() + " [" + scribe.cid() + "]"
                                     "\n" + "\n".join(scribe.chat.dumps()))
            else:
--- a/brain.py
+++ b/brain.py
@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+
+import random
+from chatreader import ChatReader as Reader
+
--- a/chatcard.py
+++ b/chatcard.py
@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+def parse_card_line(line):
+    # This reads a line in the format 'VARIABLE=value' and gives me the value.
+    # See ChatCard.loadl(...) for more details
+    s = line.split('=', 1)
+    if len(s) < 2:
+        return ""
+    else:
+        return s[1]
+
+
+class ChatCard(object):
+    def __init__(self, cid, ctype, title, count=0, period=None, answer=0.5, restricted=False, silenced=False):
+        self.id = str(cid)
+        # The Telegram chat's ID
+        self.type = ctype
+        # The type of chat
+        self.title = title
+        # The title of the chat
+        if period is None:
+            if "group" in ctype:
+                period = 10
+                # Default period for groups and supergroups
+            else:
+                period = 2
+                # Default period for private or channel chats
+        self.count = count
+        # The number of messages read
+        self.period = period
+        # This chat's configured period
+        self.answer = answer
+        # This chat's configured answer probability
+        self.restricted = restricted
+        # Whether some interactions are restricted to admins only
+        self.silenced = silenced
+        # Whether messages should silence user mentions
+
+    def set_period(self, period):
+        if period < 1:
+            raise ValueError('Tried to set period a value less than 1.')
+        else:
+            self.period = period
+        return self.period
+
+    def set_answer(self, prob):
+        if prob > 1:
+            raise ValueError('Tried to set answer probability higher than 1.')
+        elif prob < 0:
+            raise ValueError('Tried to set answer probability lower than 0.')
+        else:
+            self.answer = prob
+        return self.answer
+
+    def dumps(self):
+        lines = ["CARD=v5"]
+        lines.append("CHAT_ID=" + self.id)
+        lines.append("CHAT_TYPE=" + self.type)
+        lines.append("CHAT_NAME=" + self.title)
+        lines.append("WORD_COUNT=" + str(self.count))
+        lines.append("MESSAGE_PERIOD=" + str(self.period))
+        lines.append("ANSWER_PROB=" + str(self.answer))
+        lines.append("RESTRICTED=" + str(self.restricted))
+        lines.append("SILENCED=" + str(self.silenced))
+        # lines.append("WORD_DICT=")
+        return ('\n'.join(lines)) + '\n'
+
+    def loads(text):
+        lines = text.splitlines()
+        return ChatCard.loadl(lines)
+
+    def loadl(lines):
+        # In a perfect world, I would get both the variable name and its corresponding value
+        # from each side of the lines, but I know the order in which the lines are written in
+        # the file because I hardcoded it. So I can afford to also hardcode reading it back
+        # in the same order, and nobody can stop me
+        version = parse_card_line(lines[0]).strip()
+        version = version if len(version.strip()) > 1 else (lines[4] if len(lines) > 4 else "LOG_ZERO")
+        if version == "v4" or version == "v5":
+            return ChatCard(cid=parse_card_line(lines[1]),
+                            ctype=parse_card_line(lines[2]),
+                            title=parse_card_line(lines[3]),
+                            count=int(parse_card_line(lines[4])),
+                            period=int(parse_card_line(lines[5])),
+                            answer=float(parse_card_line(lines[6])),
+                            restricted=(parse_card_line(lines[7]) == 'True'),
+                            silenced=(parse_card_line(lines[8]) == 'True')
+                            )
+        elif version == "v3":
+            return ChatCard(cid=parse_card_line(lines[1]),
+                            ctype=parse_card_line(lines[2]),
+                            title=parse_card_line(lines[3]),
+                            count=int(parse_card_line(lines[7])),
+                            period=int(parse_card_line(lines[4])),
+                            answer=float(parse_card_line(lines[5])),
+                            restricted=(parse_card_line(lines[6]) == 'True')
+                            )
+        elif version == "v2":
+            return ChatCard(cid=parse_card_line(lines[1]),
+                            ctype=parse_card_line(lines[2]),
+                            title=parse_card_line(lines[3]),
+                            count=int(parse_card_line(lines[6])),
+                            period=int(parse_card_line(lines[4])),
+                            answer=float(parse_card_line(lines[5]))
+                            )
+        elif version == "dict:":
+            # At some point I decided to number the versions of each dictionary format,
+            # but this was not always the case. This is what you get if you try to read
+            # whatever there is in very old files where the version should be
+            return ChatCard(cid=lines[0],
+                            ctype=lines[1],
+                            title=lines[2],
+                            count=int(lines[5]),
+                            period=int(lines[3])
+                            )
+        else:
+            # This is for the oldest of files
+            return ChatCard(cid=lines[0],
+                            ctype=lines[1],
+                            title=lines[2],
+                            period=int(lines[3])
+                            )
--- a/chatlog.py
+++ b/chatlog.py
@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-def parse(l):
-    s = l.split('=', 1)
-    if len(s) < 2:
-        return ""
-    else:
-        return s[1]
-
-class Chatlog(object):
-    def __init__(self, cid, ctype, title, count=0, freq=None, answer=0.5, restricted=False, silenced=False):
-        self.id = str(cid)
-        self.type = ctype
-        self.title = title
-        if freq is None:
-            if "group" in ctype:
-                freq = 10
-            #elif ctype is "private":
-            else:
-                freq = 2
-        self.count = count
-        self.freq = freq
-        self.answer = answer
-        self.restricted = restricted
-        self.silenced = silenced
-
-    def add_msg(self, message):
-        self.gen.add_text(message)
-        self.count += 1
-
-    def set_freq(self, freq):
-        if freq < 1:
-            raise ValueError('Tried to set freq a value less than 1.')
-        else:
-            self.freq = freq
-        return self.freq
-
-    def set_answer(self, afreq):
-        if afreq > 1:
-            raise ValueError('Tried to set answer probability higher than 1.')
-        elif afreq < 0:
-            raise ValueError('Tried to set answer probability lower than 0.')
-        else:
-            self.answer = afreq
-        return self.answer
-
-    def dumps(self):
-        lines = ["LOG=v4"]
-        lines.append("CHAT_ID=" + self.id)
-        lines.append("CHAT_TYPE=" + self.type)
-        lines.append("CHAT_NAME=" + self.title)
-        lines.append("WORD_COUNT=" + str(self.count))
-        lines.append("MESSAGE_FREQ=" + str(self.freq))
-        lines.append("ANSWER_FREQ=" + str(self.answer))
-        lines.append("RESTRICTED=" + str(self.restricted))
-        lines.append("SILENCED=" + str(self.silenced))
-        #lines.append("WORD_DICT=")
-        return '\n'.join(lines)
-
-    def loads(text):
-        lines = text.splitlines()
-        return Chatlog.loadl(lines)
-
-    def loadl(lines):
-        version = parse(lines[0]).strip()
-        version = version if len(version.strip()) > 1 else (lines[4] if len(lines) > 4 else "LOG_ZERO")
-        if version == "v4":
-            return Chatlog(cid=parse(lines[1]),
-                           ctype=parse(lines[2]),
-                           title=parse(lines[3]),
-                           count=int(parse(lines[4])),
-                           freq=int(parse(lines[5])),
-                           answer=float(parse(lines[6])),
-                           restricted=(parse(lines[7]) == 'True'),
-                           silenced=(parse(lines[8]) == 'True')
-                     )
-        elif version == "v3":
-            return Chatlog(cid=parse(lines[1]),
-                           ctype=parse(lines[2]),
-                           title=parse(lines[3]),
-                           count=int(parse(lines[7])),
-                           freq=int(parse(lines[4])),
-                           answer=float(parse(lines[5])),
-                           restricted=(parse(lines[6]) == 'True')
-                      )
-        elif version == "v2":
-            return Chatlog(cid=parse(lines[1]),
-                           ctype=parse(lines[2]),
-                           title=parse(lines[3]),
-                           count=int(parse(lines[6])),
-                           freq=int(parse(lines[4])),
-                           answer=float(parse(lines[5]))
-                      )
-        elif version == "dict:":
-            return Chatlog(cid=lines[0],
-                           ctype=lines[1],
-                           title=lines[2],
-                           count=int(lines[5]),
-                           freq=int(lines[3])
-                      )
-        else:
-            return Chatlog(cid=lines[0],
-                           ctype=lines[1],
-                           title=lines[2],
-                           freq=int(lines[3])
-                      )
--- a/chatreader.py
+++ b/chatreader.py
@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+
+import random
+from chatcard import ChatCard, parse_card_line
+from generator import Generator
+
+
+def get_chat_title(chat):
+    # This gives me the chat title, or the first and maybe last
+    # name of the user as fallback if it's a private chat
+    if chat.title is not None:
+        return chat.title
+    elif chat.first_name is not None:
+        if chat.last_name is not None:
+            return chat.first_name + " " + chat.last_name
+        else:
+            return chat.first_name
+    else:
+        return ""
+
+
+class Memory(object):
+    def __init__(self, mid, content):
+        self.id = mid
+        self.content = content
+
+
+class ChatReader(object):
+    TAG_PREFIX = "^IS_"
+    STICKER_TAG = "^IS_STICKER^"
+    ANIM_TAG = "^IS_ANIMATION^"
+    VIDEO_TAG = "^IS_VIDEO^"
+
+    def __init__(self, chatcard, max_period, logger):
+        self.card = chatcard
+        self.max_period = max_period
+        self.short_term_mem = []
+        self.countdown = self.card.period
+        self.logger = logger
+
+    def FromChat(chat, max_period, logger, newchat=False):
+        # Create a new ChatReader from a Chat object
+        card = ChatCard(chat.id, chat.type, get_chat_title(chat))
+        return ChatReader(card, max_period, logger)
+
+    def FromData(data, max_period, logger):
+        # Create a new ChatReader from a whole Chat history (WIP)
+        return None
+
+    def FromCard(card, max_period, logger):
+        # Create a new ChatReader from a card's file dump
+        chatcard = ChatCard.loads(card)
+        return ChatReader(chatcard, max_period, logger)
+
+    def FromFile(text, max_period, logger):
+        # Load a ChatReader from a file's text string
+        lines = text.splitlines()
+        version = parse_card_line(lines[0]).strip()
+        version = version if len(version.strip()) > 1 else lines[4]
+        logger.info("Dictionary version: {} ({} lines)".format(version, len(lines)))
+        vocab = None
+        if version == "v4" or version == "v5":
+            return ChatReader.FromCard(text, max_period, logger)
+            # I stopped saving the chat metadata and the cache together
+        elif version == "v3":
+            card = ChatCard.loadl(lines[0:8])
+            cache = '\n'.join(lines[9:])
+            vocab = Generator.loads(cache)
+        elif version == "v2":
+            card = ChatCard.loadl(lines[0:7])
+            cache = '\n'.join(lines[8:])
+            vocab = Generator.loads(cache)
+        elif version == "dict:":
+            card = ChatCard.loadl(lines[0:6])
+            cache = '\n'.join(lines[6:])
+            vocab = Generator.loads(cache)
+        else:
+            card = ChatCard.loadl(lines[0:4])
+            cache = lines[4:]
+            vocab = Generator(load=cache, mode=Generator.MODE_LIST)
+            # raise SyntaxError("ChatReader: ChatCard format unrecognized.")
+        s = ChatReader(card, max_period, logger)
+        return (s, vocab)
+
+    def archive(self, vocab):
+        # Returns a nice little tuple package for the archivist to save to file.
+        # Also commits to long term memory any pending short term memories
+        self.commit_long_term(vocab)
+        return (self.card.id, self.card.dumps(), vocab)
+
+    def check_type(self, t):
+        # Checks type. Returns "True" for "group" even if it's supergroup
+        return t in self.card.type
+
+    def exactly_type(self, t):
+        # Hard check
+        return t == self.card.type
+
+    def set_title(self, title):
+        self.card.title = title
+
+    def set_period(self, period):
+        if period < self.countdown:
+            self.countdown = max(period, 1)
+        return self.card.set_period(min(period, self.max_period))
+
+    def set_answer(self, prob):
+        return self.card.set_answer(prob)
+
+    def cid(self):
+        return str(self.card.id)
+
+    def count(self):
+        return self.card.count
+
+    def period(self):
+        return self.card.period
+
+    def title(self):
+        return self.card.title
+
+    def answer(self):
+        return self.card.answer
+
+    def ctype(self):
+        return self.card.type
+
+    def is_restricted(self):
+        return self.card.restricted
+
+    def toggle_restrict(self):
+        self.card.restricted = (not self.card.restricted)
+
+    def is_silenced(self):
+        return self.card.silenced
+
+    def toggle_silence(self):
+        self.card.silenced = (not self.card.silenced)
+
+    def is_answering(self):
+        rand = random.random()
+        chance = self.answer()
+        if chance == 1:
+            return True
+        elif chance == 0:
+            return False
+        return rand <= chance
+
+    def add_memory(self, mid, content):
+        mem = Memory(mid, content)
+        self.short_term_mem.append(mem)
+
+    def random_memory(self):
+        mem = random.choice(self.short_term_mem)
+        return mem.id
+
+    def reset_countdown(self):
+        self.countdown = self.card.period
+
+    def read(self, message):
+        mid = str(message.message_id)
+
+        if message.text is not None:
+            self.read(mid, message.text)
+        elif message.sticker is not None:
+            self.learn_drawing(mid, ChatReader.STICKER_TAG, message.sticker.file_id)
+        elif message.animation is not None:
+            self.learn_drawing(mid, ChatReader.ANIM_TAG, message.animation.file_id)
+        elif message.video is not None:
+            self.learn_drawing(mid, ChatReader.VIDEO_TAG, message.video.file_id)
+        self.card.count += 1
+
+    def learn_drawing(self, mid, tag, drawing):
+        self.learn(mid, tag + " " + drawing)
+
+    def learn(self, mid, text):
+        if "velasco" in text.casefold() and len(text.split()) <= 3:
+            return
+        self.add_memory(mid, text)
+
+    def commit_long_term(self, vocab):
+        for mem in self.short_term_mem:
+            vocab.add(mem.content)
+        self.short_term_mem = []
+
+    """
+    def learnFrom(self, scribe):
+        self.card.count += scribe.chat.count
+        self.vocab.cross(scribe.vocab)
+    """
--- a/generator.py
+++ b/generator.py
@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+import random
+import json
+
+
+def rewrite(text):
+    # This splits strings into lists of words delimited by space.
+    # Other whitespace characters get a space appended so they are included
+    # as their own Markov chain element, so as not to pollute the chain with
+    # "different" words that would only differ in having a whitespace
+    # attached or not
+    words = text.replace('\n', '\n ').split(' ')
+    i = 0
+    while i < len(words):
+        w = words[i].strip(' \t')
+        if len(w) > 0:
+            words[i] = w
+        else:
+            del words[i]
+            i -= 1
+        i += 1
+    return words
+
+
+def getkey(w1, w2):
+    # This gives a dictionary key from 2 words, ignoring case
+    key = (w1.strip().casefold(), w2.strip().casefold())
+    return str(key)
+
+
+def getwords(key):
+    # This turns a dictionary key back into 2 separate words
+    words = key.strip('()').split(', ')
+    for i in range(len(words)):
+        words[i].strip('\'')
+    return words
+
+
+def triplets(wordlist):
+    # Generates triplets of words from the given data string. So if our string
+    # were "What a lovely day", we'd generate (What, a, lovely) and then
+    # (a, lovely, day).
+    if len(wordlist) < 3:
+        return
+
+    for i in range(len(wordlist) - 2):
+        yield (wordlist[i], wordlist[i+1], wordlist[i+2])
+
+
+class Generator(object):
+    MODE_JSON = "MODE_JSON"
+    # This is to mark when we want to create a Generator object from a given JSON
+
+    MODE_LIST = "MODE_LIST"
+    # This is to mark when we want to create a Generator object from a given list of words
+
+    MODE_CHAT_DATA = "MODE_CHAT_DATA"
+    # This is to mark when we want to create a Generator object from Chat data (WIP)
+
+    HEAD = "\n^MESSAGE_SEPARATOR^"
+    TAIL = "^MESSAGE_SEPARATOR^"
+
+    def __init__(self, load=None, mode=None):
+        if mode is not None:
+            # We ain't creating a new Generator from scratch
+            if mode == Generator.MODE_JSON:
+                self.cache = json.loads(load)
+            elif mode == Generator.MODE_LIST:
+                self.cache = {}
+                self.load_list(load)
+        else:
+            self.cache = {}
+            # The cache is where we store our words
+
+    def load_list(self, many):
+        # Takes a list of strings and adds them to the cache one by one
+        for one in many:
+            self.add(one)
+
+    def dumps(self):
+        # Dumps the cache dictionary into a JSON-formatted string
+        return json.dumps(self.cache)
+
+    def loads(dump):
+        # Loads the cache dictionary from a JSON-formatted string
+        if len(dump) == 0:
+            # faulty dump gives default Generator
+            return Generator()
+        # otherwise
+        return Generator(load=dump, mode=Generator.MODE_JSON)
+
+    def add(self, text):
+        # This takes a string and stores it in the cache, preceding it
+        # with the HEAD that marks the beginning of a new message and
+        # following it with the TAIL that marks the end
+        words = [Generator.HEAD]
+        text = text + " " + Generator.TAIL
+        words.extend(text.split())
+        self.database(rewrite(text))
+
+    def database(self, words):
+        # This takes a list of words and stores it in the cache, adding
+        # a special entry for the first word (the HEAD marker)
+        for w1, w2, w3 in triplets(words):
+            if w1 == Generator.HEAD:
+                if w1 in self.cache:
+                    self.cache[Generator.HEAD].append(w2)
+                else:
+                    self.cache[Generator.HEAD] = [w2]
+            key = getkey(w1, w2)
+            if key in self.cache:
+                # if the key exists, add the new word to the end of the chain
+                self.cache[key].append(w3)
+            else:
+                # otherwise, create a new entry for the new key starting with
+                # the new end of chain
+                self.cache[key] = [w3]
+
+    def generate(self, size=50, silence=False):
+        # This generates the Markov text/word chain
+        # silence tells if mentions should be silenced
+        if len(self.cache) == 0:
+            # If there is nothing in the cache we cannot generate anything
+            return ""
+
+        w1 = random.choice(self.cache[Generator.HEAD])
+        w2 = random.choice(self.cache[getkey(Generator.HEAD, w1)])
+        # Start with a message HEAD and a random message starting word
+        gen_words = []
+        for i in range(size):
+            # As long as we don't go over the size value (max. message length)...
+            if silence and w1.startswith("@") and len(w1) > 1:
+                gen_words.append(w1.replace("@", "(@)"))
+                # ...append the first word, silencing any possible username mention
+            else:
+                gen_words.append(w1)
+                # ...append the first word
+            if w2 == Generator.TAIL or not getkey(w1, w2) in self.cache:
+                # When there's no key from the last 2 words to follow the chain,
+                # or we reached a separation between messages, stop
+                break
+            else:
+                w1, w2 = w2, random.choice(self.cache[getkey(w1, w2)])
+                # Make the second word to be the new first word, and
+                # make a new random word that follows the chain to be
+                # the new second word
+        return ' '.join(gen_words)
+
+    def cross(self, gen):
+        # cross 2 Generators into this one
+        for key in gen.cache:
+            if key in self.cache:
+                self.cache[key].extend(gen.cache[key])
+            else:
+                self.cache[key] = list(gen.cache[key])
+
+    def new_count(self):
+        # Count again the number of messages if the current number is unreliable
+        count = 0
+        for key in self.cache:
+            for word in self.cache[key]:
+                if word == Generator.TAIL:
+                    count += 1
+                    # by just counting message separators
+        return count
--- a/markov.py
+++ b/markov.py
@ -1,105 +0,0 @@
-#!/usr/bin/env python3
-
-import random
-import json
-
-def getkey(w1, w2):
-    key = (w1.strip().casefold(), w2.strip().casefold())
-    return str(key)
-
-def getwords(key):
-    words = key.strip('()').split(', ')
-    for i in range(len(words)):
-        words[i].strip('\'')
-    return words
-
-def triples(wordlist):
-    # Generates triples from the given data string. So if our string were
-    # "What a lovely day", we'd generate (What, a, lovely) and then
-    # (a, lovely, day).
-    if len(wordlist) < 3:
-        return
-
-    for i in range(len(wordlist) - 2):
-        yield (wordlist[i], wordlist[i+1], wordlist[i+2])
-
-class Markov(object):
-    ModeJson = "MODE_JSON"
-    ModeList = "MODE_LIST"
-    ModeChatData = "MODE_CHAT_DATA"
-
-    Head = "\n^MESSAGE_SEPARATOR^"
-    Tail = "^MESSAGE_SEPARATOR^"
-
-    def __init__(self, load=None, mode=None):
-        if mode is not None:
-            if mode == Markov.ModeJson:
-                self.cache = json.loads(load)
-            elif mode == Markov.ModeList:
-                self.cache = {}
-                self.loadList(load)
-        else:
-            self.cache = {}
-
-    def loadList(self, lines):
-        for line in lines:
-            words = [Markov.Head]
-            words.extend(line.split())
-            self.learn_words(words)
-
-    def dumps(self):
-        return json.dumps(self.cache)
-
-    def loads(dump):
-        if len(dump) == 0:
-            return Markov()
-        return Markov(load=dump, mode=Markov.ModeJson)
-
-    def learn_words(self, words):
-        self.database(words)
-
-    def database(self, wordlist):
-        for w1, w2, w3 in triples(wordlist):
-            if w1 == Markov.Head:
-                if w1 in self.cache:
-                    self.cache[Markov.Head].append(w2)
-                else:
-                    self.cache[Markov.Head] = [w2]
-            key = getkey(w1, w2)
-            if key in self.cache:
-                self.cache[key].append(w3)
-            else:
-                self.cache[key] = [w3]
-
-    def generate_markov_text(self, size=50, silence=False):
-        if len(self.cache) == 0:
-            return ""
-        w1 = random.choice(self.cache[Markov.Head])
-        w2 = random.choice(self.cache[getkey(Markov.Head, w1)])
-        gen_words = []
-        for i in range(size):
-            if silence and w1.startswith("@") and len(w1) > 1:
-                gen_words.append(w1.replace("@", "(@)"))
-            else:
-                gen_words.append(w1)
-            if w2 == Markov.Tail or not getkey(w1, w2) in self.cache:
-                # print("Generated text")
-                break
-            else:
-                w1, w2 = w2, random.choice(self.cache[getkey(w1, w2)])
-        return ' '.join(gen_words)
-
-    def cross(self, gen):
-        for key in gen.cache:
-            if key in self.cache:
-                self.cache[key].extend(d[key])
-            else:
-                self.cache[key] = list(d[key])
-
-    def new_count(self):
-        count = 0
-        for key in self.cache:
-            for word in self.cache[key]:
-                if word == Markov.Tail:
-                    count += 1
-        return count
--- a/scribe.py
+++ b/scribe.py
@ -1,194 +0,0 @@
-#!/usr/bin/env python3
-
-import random
-from chatlog import *
-from markov import Markov
-
-def getTitle(chat):
-    if chat.title is not None:
-        return chat.title
-    elif chat.first_name is not None:
-        if chat.last_name is not None:
-            return chat.first_name + " " + chat.last_name
-        else:
-            return chat.first_name
-    else:
-        return ""
-
-def rewrite(text):
-    words = text.replace('\n', '\n ').split(' ')
-    i = 0
-    while i < len(words):
-        w = words[i].strip(' \t')
-        if len(w) > 0:
-            words[i] = w
-        else:
-            del words[i]
-            i -= 1
-        i += 1
-    return words
-
-class Page(object):
-    def __init__(self, mid, content):
-        self.id = mid
-        self.content = content
-
-class Scribe(object):
-    TagPrefix = "^IS_"
-    StickerTag = "^IS_STICKER^"
-    AnimTag = "^IS_ANIMATION^"
-    VideoTag = "^IS_VIDEO^"
-
-    def __init__(self, chatlog, archivist):
-        self.chat = chatlog
-        self.archivist = archivist
-        self.pages = []
-        self.countdown = self.chat.freq
-        self.logger = self.archivist.logger
-
-    def FromChat(chat, archivist, newchat=False):
-        chatlog = Chatlog(chat.id, chat.type, getTitle(chat))
-        scribe = Scribe(chatlog, archivist)
-        return scribe
-
-    def FromData(data, archivist):
-        return None
-
-    def FromFile(log, archivist):
-        chatlog = Chatlog.loads(log)
-        return Scribe(chatlog, archivist)
-
-    def Recall(text, archivist):
-        lines = text.splitlines()
-        version = parse(lines[0]).strip()
-        version = version if len(version.strip()) > 1 else lines[4]
-        archivist.logger.info( "Dictionary version: {} ({} lines)".format(version, len(lines)) )
-        if version == "v4":
-            chatlog = Chatlog.loadl(lines[0:9])
-            cache = '\n'.join(lines[10:])
-            parrot = Markov.loads(cache)
-        elif version == "v3":
-            chatlog = Chatlog.loadl(lines[0:8])
-            cache = '\n'.join(lines[9:])
-            parrot = Markov.loads(cache)
-        elif version == "v2":
-            chatlog = Chatlog.loadl(lines[0:7])
-            cache = '\n'.join(lines[8:])
-            parrot = Markov.loads(cache)
-        elif version == "dict:":
-            chatlog = Chatlog.loadl(lines[0:6])
-            cache = '\n'.join(lines[6:])
-            parrot = Markov.loads(cache)
-        else:
-            chatlog = Chatlog.loadl(lines[0:4])
-            cache = lines[4:]
-            parrot = Markov(load=cache, mode=Markov.ModeList)
-            #raise SyntaxError("Scribe: Chatlog format unrecognized.")
-        s = Scribe(chatlog, archivist)
-        s.parrot = parrot
-        return s
-
-    def store(self, parrot):
-        self.archivist.store(self.chat.id, self.chat.dumps(), parrot)
-
-    def checkType(self, t):
-        return t in self.chat.type
-
-    def compareType(self, t):
-        return t == self.chat.type
-
-    def setTitle(self, title):
-        self.chat.title = title
-
-    def setFreq(self, freq):
-        if freq < self.countdown:
-            self.countdown = max(freq, 1)
-        return self.chat.set_freq(min(freq, self.archivist.maxFreq))
-
-    def setAnswer(self, afreq):
-        return self.chat.set_answer(afreq)
-
-    def cid(self):
-        return str(self.chat.id)
-
-    def count(self):
-        return self.chat.count
-
-    def freq(self):
-        return self.chat.freq
-
-    def title(self):
-        return self.chat.title
-
-    def answer(self):
-        return self.chat.answer
-
-    def type(self):
-        return self.chat.type
-
-    def isRestricted(self):
-        return self.chat.restricted
-
-    def restrict(self):
-        self.chat.restricted = (not self.chat.restricted)
-
-    def isSilenced(self):
-        return self.chat.silenced
-
-    def silence(self):
-        self.chat.silenced = (not self.chat.silenced)
-
-    def isAnswering(self):
-        rand = random.random()
-        chance = self.answer()
-        if chance == 1:
-            return True
-        elif chance == 0:
-            return False
-        return rand <= chance
-
-    def addPage(self, mid, content):
-        page = Page(mid, content)
-        self.pages.append(page)
-
-    def getReference(self):
-        page = random.choice(self.pages)
-        return page.id
-
-    def resetCountdown(self):
-        self.countdown = self.chat.freq
-
-    def learn(self, message):
-        mid = str(message.message_id)
-
-        if message.text is not None:
-            self.read(mid, message.text)
-        elif message.sticker is not None:
-            self.learnDrawing(mid, Scribe.StickerTag, message.sticker.file_id)
-        elif message.animation is not None:
-            self.learnDrawing(mid, Scribe.AnimTag, message.animation.file_id)
-        elif message.video is not None:
-            self.learnDrawing(mid, Scribe.VideoTag, message.video.file_id)
-        self.chat.count += 1
-
-    def learnDrawing(self, mid, tag, drawing):
-        self.read(mid, tag + " " + drawing)
-
-    def read(self, mid, text):
-        if "velasco" in text.casefold() and len(text.split()) <= 3:
-            return
-        words = [Markov.Head]
-        text = text + " " + Markov.Tail
-        words.extend(rewrite(text))
-        self.addPage(mid, words)
-
-    def teachParrot(self, parrot):
-        for page in self.pages:
-            parrot.learn_words(page.content)
-        self.pages = []
-
-"""
-    def learnFrom(self, scribe):
-        self.chat.count += scribe.chat.count
-        self.parrot.cross(scribe.parrot)
-"""
--- a/speaker.py
+++ b/speaker.py
@ -1,24 +1,25 @@
 #!/usr/bin/env python3

 import random
-from scribe import Scribe
-from markov import Markov
+from chatreader import ChatReader as Reader
 from telegram.error import *

-def send(bot, cid, text, replying=None, format=None, logger=None, **kwargs):
-    kwargs["parse_mode"] = format
+
+def send(bot, cid, text, replying=None, formatting=None, logger=None, **kwargs):
+    kwargs["parse_mode"] = formatting
    kwargs["reply_to_message_id"] = replying

-    if text.startswith(Scribe.TagPrefix):
+    if text.startswith(Reader.TAG_PREFIX):
        words = text.split(maxsplit=1)
        if logger:
            logger.info('Sending {} "{}" to {}'.format(words[0][4:-1], words[1], cid))
+            # Logs something like 'Sending VIDEO "VIDEO_ID" to CHAT_ID'

-        if words[0] == Scribe.StickerTag:
+        if words[0] == Reader.STICKER_TAG:
            return bot.send_sticker(cid, words[1], **kwargs)
-        elif words[0] == Scribe.AnimTag:
+        elif words[0] == Reader.ANIM_TAG:
            return bot.send_animation(cid, words[1], **kwargs)
-        elif words[0] == Scribe.VideoTag:
+        elif words[0] == Reader.VIDEO_TAG:
            return bot.send_video(cid, words[1], **kwargs)
    else:
        text
@ -27,17 +28,6 @@ def send(bot, cid, text, replying=None, format=None, logger=None, **kwargs):
            logger.info("Sending a {} to {}: '{}'".format(mtype, cid, text))
        return bot.send_message(cid, text, **kwargs)

-def getTitle(chat):
-    if chat.title:
-        return chat.title
-    else:
-        last = chat.last_name if chat.last_name else ""
-        first = chat.first_name if chat.first_name else ""
-        name = " ".join([first, last]).strip()
-        if len(name) == 0:
-            return "Unknown"
-        else:
-            return name

 class Speaker(object):
    ModeFixed = "FIXED_MODE"
@ -79,7 +69,7 @@ class Speaker(object):
    def getScribe(self, chat):
        cid = str(chat.id)
        if not cid in self.scriptorium:
-            scribe = Scribe.FromChat(chat, self.archivist, newchat=True)
+            scribe = Reader.FromChat(chat, self.archivist, newchat=True)
            self.scriptorium[cid] = scribe
            return scribe
        else:
--- a/velasco.py
+++ b/velasco.py
@ -49,20 +49,24 @@ about_msg = "I am yet another Markov Bot experiment. I read everything you type

 explanation = "I decompose every message I read in groups of 3 consecutive words, so for each consecutive pair I save the word that can follow them. I then use this to make my own messages. At first I will only repeat your messages because for each 2 words I will have very few possible following words.\n\nI also separate my vocabulary by chats, so anything I learn in one chat I will only say in that chat. For privacy, you know. Also, I save my vocabulary in the form of a json dictionary, so no logs are kept.\n\nMy default frequency in private chats is one message of mine from each 2 messages received, and in group chats it\'s 10 messages I read for each message I send."

+
 def static_reply(text, format=None):
    def reply(bot, update):
        update.message.reply_text(text, parse_mode=format)
    return reply

+
 def error(bot, update, error):
    logger.warning('Update "{}" caused error "{}"'.format(update, error))

+
 def stop(bot, update):
    scribe = speakerbot.getScribe(update.message.chat.id)
    #del chatlogs[chatlog.id]
    #os.remove(LOG_DIR + chatlog.id + LOG_EXT)
    logger.warning("I got blocked by user {} [{}]".format(scribe.title(), scribe.cid()))

+
 def main():
    global speakerbot
    parser = argparse.ArgumentParser(description='A Telegram markov bot.')