From 1d1bd6034e02519cc3c21a4cedf18f568855d90e Mon Sep 17 00:00:00 2001 From: vylion Date: Thu, 21 Sep 2017 15:39:53 +0200 Subject: [PATCH] Uploading Velasco v1.4 - It now saves the dictionary of vocabulary directly into the file. No need to deal with the full list of messages. - This also means that the amount of elements kept in memory has been reduced, since there is no need to keep a list with all the words one after another,apart from the dictionary. - Modularized some constants, like the frequency of saves if the frequency of speaking is too large, or the stop words that mark the start and end of a message. --- __pycache__/chatlog.cpython-36.pyc | Bin 2207 -> 2176 bytes __pycache__/markov.cpython-36.pyc | Bin 1833 -> 2478 bytes chatlog.py | 47 ++++++++--------- markov.py | 81 +++++++++++++++++++---------- velasco.py | 20 ++++--- 5 files changed, 86 insertions(+), 62 deletions(-) diff --git a/__pycache__/chatlog.cpython-36.pyc b/__pycache__/chatlog.cpython-36.pyc index 201f5dc36a4b327b6aaf6d518404b913a2e82340..2783f7e1bfb07dee82fc92b8341bfe838452a044 100644 GIT binary patch literal 2176 zcmZ`)&5j#I5bo~z@pyJO*=$0VMGzc*B25&$kpLkfgd#-<2?-xaln5UfP23%4?D34# z)4Q_YXcidvG3xI?3~Tt3H!S6i02Pe~>NSC6(@5Wy*qw&@jufXs=}Yf4XadOn z#)rJCzcNANyW3$IN*|KYbk;kOWEA(n24h_v&Cf2w*cqAD93<&luQu9>Tw33p%#{sH zZVF}nr&>L?UZF}GjH;v3*zoI2Ppf0=ja8Y1)~{9Z)CRd!r5Q#e1W4C)45iSdelU6q z?C(rp#q{?_htH~+Iy^op@~S+1Tq>>hG0gS02V$-xN|uXdKV(cJC9f7 z8uo0rRwi|*>|T=Y;}(Y^x)KErFo!uQ#dbF6(f5mI~tz*FH6 z_6CcfcXN7NoVjxIG}WJtGAUD6Bs*BHowc2rs_Sg5UUj%br)Tw;OxE<)8Q{%py9`c+ zj3z$TUCDkHw0G*60N~($BhpSI9x!N&|4$ok;%|d)p4a+d)oHc{XXNf0&BgWj8ff3No3=CUZ8b5kb}9- zOQY$!^@zJkiU`_a_o=RCE-@}miLdua_AbdfrTRWa??ubcM$i894kLFJiTW6aYr3{d zRK(xodq|frqI2z|J)DTaAlaf`US>0urZ!H~StXAO;FXI)Ypknuh;Z% zuHPbZgUI_tJ|Ob4d>ltz+QmvB)%1YT&aOmW6h%I$7j>OjVC`EQ0*9qXXO_iVxz@W8TG=~CBi=gSxsX~DUF(RNK80}i2M3M4x zZP)^RsnQ<%488Updg*KIwI{toPW@(SGL2yhJ3Hk5%zQK7j6UDk=>75KKhOUT82gv~ z@A05N#wg!|5KQolJ!L;I;RyE?6RzYdcjO4~XBPQu>|hZ${K1&C=5e7s#wedbNH*ev z;lz%;fsb6_3I7!vc_I)U%)SUk7jqzb;u_|T=!*@^q1Y7HF?Yq5xPiGRw#7S`uL%y@ zH%;fs%S30zWH<%_`_SxQJi#b`f~Y*fW?i*ac@>LU9!wa`npb?qp*fWU=~gbJSNYf* zs_zA_oLQj05ze`vd%`{UDfRE7Ww`` zp=OIL8RLV@l5$)$^NT|7(@hA%-r~%Jah&F$KW}X1b-+u|-byOM5S0-e?TA zt5;imfHM^reCCzg8jM@So|KQU> zp$55}Bsx8j0}B7(B*~8DUi#9bsp<^zP11vdA@lxyVv& zf@HCfxv&h0TpM=Nnai?FCepZ|-DQ6}i4ZZq--X4})`fL;b|$3)GF6XQ-~9(gr=i1migQ7-aSU&gUHLEPu|P>eJ8v9}~XGQ_>lcaVYQtp?KD zTU%s=kcLIJ4TUhR@QK(nbygRgcGU-1Q`GE6J1`|xqGe;6Tm$PZ-7hg68IsBD6C5ik zengAqM&puQd&`rS3xq;jA6CR+RFz}gOy+ftueQ}Ch~wfo*PoHS%T?8`wibzfZB|Ek zA?y=Q^#jI%cb0#(j{G4e&^5Z`)UUH_84$e1fKJPRFTen2+93u2fjd;UVZrcT6sj&X zMTu90vu+U~X|}j*qk9*x4VV2a&1I?HryZB#8|Wg|r}`n5%1sCga38-doB!oz6Y6?f zi=0rCL7VlbPzX8dj)T|E={@Q$g8k;~61%t0U8=^)B{onLj&2_?Fq#XjQ`fQ5gLDuZ zF8p2WTKT{n4A*4Hwx2bTY)v^?Q`jTl^lb6g&}=Mj(q39LZklTpz#aPVAqg9jtMG@1 zs+dy*zQVQ(?o-#<#_vtvX*QwqhEbpX0`g=o;!RT8o@jy4KVM0gpc!t3ZWxAs*al&YZx2`r>kM72Pxt!Iz3 z^4PF<(b7G1l1U!3Rx$Dxm!c)4)v}~5op-EdOIMaKI?|J6jP5HYD{|=_lS@%)YPCGc z_gSNQi8rz4*r|NkJB!CQ^=(dbcq}w>`2sEd7G1`=a607^K6}2(TW#q(azUTf}_~nM3JLxd)=-g4^;3&$iI2vF|tIn`t<+hGSSUOLX zOmp7(i>V5nzTe(DPWsW-*+tw*23rS->J8(tjfp4@({?f+-zIviAF5t*wmH1WYj|fr z2nRAS4$Bk*;AD(f_%gr8RTXwy&j#WZH=Oa` z{;OyZsZd9$Psqm@4^`6l^>O4MbUBtjC z22Rx_IQ5Q$S&noKvr5u4%qq(YF-tDVD#nt$BWvCN=ejmdt2 zmhPa-M7q&I-U7-T>y_2B(SwX~TANrXLY`Suj_iR-gQ^x?izr9CB`b;$jB1DN<0$t+ zDFZ_wyPGCpRh)z}1>@{AiVsx@zVb4qIWWt1+(8WxzphlL7X+O_M+bqA`IH#KxXoSe zfi!h7zFmZ~Ip4LYS<1N7tJkgmzY!V?|S zOoNdt$k_@Q2=PO#Bn-UBi)bpkI~0Pz$n77#f4-M{M5?#Y3QU{ODJ6E^{g#oQL! zOd;xgf&4tji<^;BEF}65{27gaLLtiWZ`MRitcjfo&3i)aW#*neS3RTqYUl;($<)`N znpsCCW18X6*-6Y@(H$G z1BXC1Wcw4wyeze~X41C!Z)*$b!}klxw2s)s9<{VHgRu!1?5rq5UH|f;b(id3g(y9 yW76Ln3LSwDen6nDL`67=W$QiK)`e1tNK4mI0hR4 literal 1833 zcmbtVOK;>v5bkb2$D3rA5JgWI67hiDx|ar0u*$ z_G$7v_Rf#sz?C!N&{t0L7xqL|duEhfxX~k5m#fRw_4umhV0G2|_KV+t`qL)lZ*pgu zz#qX=z7K>GPBRjc=Y%tE+!AgGIyFMZ=`jh+8EBAhf^@MAV2duka4{ZfR(1i>u21E|$aMvYRmY9vANTd<9Q=2qYyniAWzQt-<`z zm0O%Pl(U*{k^!6Qlgg;cfKDlBnZQ|7LLnG%+UP1Wu|67JW?o<`D~Xdcp{%Py@={sl zI7=&KWFl7`6h&n^5z6Ip6`#hX2$(WUk@b{WiRox==_lnH5JGMExzwa+gcw|XKs=uZy?_@ zUO<}8AD=+AAywAoH~+cFg^Vjv`jt$_8SFh-jQr|M`2F-!o-&l`M&f?0C{j7i#-GGIgM?0YTDfj#H+x#b}vg5})bEUwKbB?#>|FZUE zskDQpg59OY48ho)VZvsZy}*zTY|;XvEE(s0sHpVtji$PmEm=j0GmrN^yp%2wVy{t? zZPCfvyfSlzWns`PHjhT?55Qb^A>m|7Ao*YZY&5iCjZsr56{#809&5}R(aDCkJi@&; zx5>b$jVXmPc(u{Tl7aLYyg^YAzqA{eas#X@HvvfYii}IVRoU@)EOH)LZA9$qbH%Rq zLw0qbjG?%BBR<)IZyv%^V*G?|&uia;1?Fe_aJ=kBR7UXW7;Q*u1(4#}OU?jj ztYP4>Q8NuJ*trH)D{BE~0D>E%F&pdp8*bIc(2=+Q$IAKP34ttHVAXD%K55+RUS-zM z5Rmt<>D2DPs%^|D+~y4J@{igc*iZupX%GE%z=yvomus!>$@$CL0gpO(^1ALev{HM| z`AevWGw^D6ilfl&N<`QUDIe6f4fx~5NIAHtc_k5J z8k%w)xp$GggG5)?8bgOw%1+LTG!e=i6lorKZK+K?rC>4%9(IsMIJ?454wtp8w`BL_ zCfeKr5_X{7^m!@YMj=kKI~}mG=*FlWTM;_{G3e-j213{AQ-loN1&Twp%J;xd;NTd{S$g@pzY_Z# udr)_)OTCV};<{Q2jsF20D7!cvh@{f8JJD)8y6N3#tq+GdIZ8C>vVQn diff --git a/chatlog.py b/chatlog.py index c1c3186..a1eed2a 100644 --- a/chatlog.py +++ b/chatlog.py @@ -3,21 +3,22 @@ from markov import * class Chatlog(object): - def __init__(self, ident, chattype, title, msgs=None, freq=None): - if msgs is not None: - self.msgs = msgs - else: - self.msgs = [] + def __init__(self, ident, chattype, title, text=None, freq=None): self.id = str(ident) self.type = chattype self.title = title if freq is None: if "group" in chattype: - freq = 20 + freq = 15 #elif chattype is "private": else: - freq = 5 + freq = 2 self.freq = freq + if text is not None: + self.count = len(text) + else: + self.count = 0 + self.gen = Markov(text) def set_title(self, title): self.title = title @@ -31,36 +32,30 @@ class Chatlog(object): return self.freq def add_msg(self, message): - msg = message.split() - msg.append("!kvl") - self.msgs.append(msg) - - def get_markov_gen(self): - msgs = [] - for m in self.msgs: - msgs.append(' '.join(m)) - text = ' '.join(msgs) - self.gen = Markov(text) + self.gen.add_text(message + " !kvl") + self.count += 1 def speak(self): - self.get_markov_gen() return self.gen.generate_markov_text() def get_count(self): - return len(self.msgs) + return self.count def to_txt(self): lines = [self.id] lines.append(self.type) lines.append(self.title) lines.append(str(self.freq)) - for m in self.msgs: - lines.append(' '.join(m)) - return '\n'.join(lines) + lines.append("dict:") + txt = '\n'.join(lines) + return txt + '\n' + self.gen.to_json() def from_txt(text): lines = text.splitlines() - msgs = [] - for m in lines[4:]: - msgs.append(m.split()) - return Chatlog(lines[0], lines[1], lines[2], msgs, int(lines[3])) + if(lines[4] == "dict:"): + new_log = Chatlog(lines[0], lines[1], lines[2], None, int(lines[3])) + cache = '\n'.join(lines[5:]) + new_log.gen = Markov.from_json(cache) + return new_log + else: + return Chatlog(lines[0], lines[1], lines[2], lines[4:], int(lines[3])) diff --git a/markov.py b/markov.py index fe7007a..3e39383 100644 --- a/markov.py +++ b/markov.py @@ -1,50 +1,75 @@ #!/usr/bin/env python3 import random +import json + +HEAD = "\n!kvl" +TAIL = "!kvl" + +def trim_and_split(text): + words = text.split(' ') + for i in range(len(words)): + words[i] = words[i].strip(' \t') + return words + +def getkey(w1, w2): + key = (w1.strip().casefold(), w2.strip().casefold()) + return str(key) + +def triples(wordlist): + """ Generates triples from the given data string. So if our string were + "What a lovely day", we'd generate (What, a, lovely) and then + (a, lovely, day). + """ + + if len(wordlist) < 3: + return + + for i in range(len(wordlist) - 2): + yield (wordlist[i], wordlist[i+1], wordlist[i+2]) class Markov(object): - def __init__(self, text=None): - self.cache = {} - self.words = [] - if text is None: - text = "" - self.words = ("!kvl\n"+text).split() - self.word_size = len(self.words) - self.database() + def __init__(self, text=None, from_json=False): + if not from_json: + self.cache = {} + if text is not None: + for line in text: + self.add_text(line) + else: + self.cache = json.loads(text) - def triples(self): - """ Generates triples from the given data string. So if our string were - "What a lovely day", we'd generate (What, a, lovely) and then - (a, lovely, day). - """ + def to_json(self): + return json.dumps(self.cache) - if len(self.words) < 3: - return + def from_json(string): + return Markov(string, True) - for i in range(len(self.words) - 2): - yield (self.words[i], self.words[i+1], self.words[i+2]) + def add_text(self, text): + words = trim_and_split(HEAD + " " + text) + self.database(words) - def database(self): - for w1, w2, w3 in self.triples(): - key = (w1.casefold(), w2.casefold()) + def database(self, wordlist): + for w1, w2, w3 in triples(wordlist): + if w1 == HEAD: + if w1 in self.cache: + self.cache[HEAD].append(w2) + else: + self.cache[HEAD] = [w2] + key = getkey(w1, w2) if key in self.cache: self.cache[key].append(w3) else: self.cache[key] = [w3] def generate_markov_text(self, size=50): - seed = random.randint(0, self.word_size-4) - seed_word, next_word, next_word2 = self.words[seed], self.words[seed+1], self.words[seed+2] - while not "!kvl" in seed_word: - seed = random.randint(0, self.word_size-4) - seed_word, next_word, next_word2 = self.words[seed], self.words[seed+1], self.words[seed+2] - w1, w2 = next_word, next_word2 + w1 = random.choice(self.cache[HEAD]) + w2 = random.choice(self.cache[getkey(HEAD, w1)]) gen_words = [] for i in range(size): gen_words.append(w1) - if "!kvl" in w2 or not (w1.casefold(), w2.casefold()) in self.cache: + if w2 == TAIL or not getkey(w1, w2) in self.cache: print("Generated text") break else: - w1, w2 = w2, random.choice(self.cache[(w1.casefold(), w2.casefold())]) + w1, w2 = w2, random.choice(self.cache[getkey(w1, w2)]) return ' '.join(gen_words) diff --git a/velasco.py b/velasco.py index 803c72c..8d6ab73 100755 --- a/velasco.py +++ b/velasco.py @@ -2,6 +2,7 @@ import sys, os from telegram.ext import Updater, CommandHandler, MessageHandler, Filters +from telegram.error import * from chatlog import * import logging import argparse @@ -15,7 +16,9 @@ logger = logging.getLogger(__name__) chatlogs = {} disabled = {} -GUILLERMO_ID = 8379173 +GUILLERMO_ID = "8379173" +CHAT_INC = 5 +CHAT_SAVE = 15 def wake(bot): directory = os.fsencode("chatlogs/") @@ -25,7 +28,7 @@ def wake(bot): if filename.endswith(".txt"): chat = loadchat("chatlogs/" + filename) chatlogs[chat.id] = chat - print("loaded chat " + chat.id) + print("loaded chat " + chat.title + " [" + chat.id + "]") continue else: continue @@ -98,12 +101,14 @@ def read(bot, update): # TO DO: aƱadir % de que haga reply en vez de send try: bot.sendMessage(chatlog.id, msg) - except TelegramError: - chatlog.set_freq(chatlog.freq + 20) + except TimedOut: + chatlog.set_freq(chatlog.freq + CHAT_INC) + print("Increased freq for chat " + chatlog.title + " [" + chatlog.id + "]") if get_chatname(chat) != chatlog.title: chatlog.set_title(get_chatname(chat)) savechat(chatlog) - + elif chatlog.freq > CHAT_SAVE and chatlog.get_count()%CHAT_SAVE == 0: + savechat(chatlog) chatlogs[chatlog.id] = chatlog def speak(bot, update): @@ -121,12 +126,10 @@ def speak(bot, update): msg = chatlog.speak() update.message.reply_text(msg) savechat(chatlog) - chatlogs[chatlog.id] = chatlog def get_chatlogs(bot, update): - global GUILLERMO_ID - if update.message.chat.id is GUILLERMO_ID: + if str(update.message.chat.id) == GUILLERMO_ID: m = "I have these chatlogs:" for c in chatlogs: m += "\n" + chatlogs[c].id + " " + chatlogs[c].title @@ -157,6 +160,7 @@ def set_freq(bot, update): value = int(value) value = chatlogs[ident].set_freq(value) reply = "Frequency of speaking set to " + str(value) + savechat(chatlogs[ident]) except: reply = "Format was confusing; frequency not changed from " + str(chatlogs[ident].freq) update.message.reply_text(reply)