algorithm-in-python/string/markov.py

65 lines
1.8 KiB
Python

''' mbinary
#########################################################################
# File : markov.py
# Author: mbinary
# Mail: zhuheqin1@gmail.com
# Blog: https://mbinary.xyz
# Github: https://github.com/mbinary
# Created Time: 2018-07-06 15:57
# Description:
#########################################################################
'''
from random import randint
import re
class markov:
    """First-order, word-level Markov chain text generator.

    The source text is split into tokens, and for every token the class
    records how often each other token directly follows it. Sentences are
    generated by repeatedly sampling a successor in proportion to those
    counts.
    """

    def __init__(self, txt):
        """Tokenize *txt* and build the transition table from it."""
        # words: tokens of the text, in their original order.
        self.words = self.clean(txt)
        # dic: {word: {successor: number of times successor followed word}}.
        self.dic = self.getDic(self.words)

    def clean(self, text):
        """Return the list of tokens in *text*.

        Newlines become spaces and double quotes are removed. The text is
        then split on runs of spaces; empty tokens are discarded.
        """
        text = text.replace("\n", " ")
        text = text.replace("\"", "")
        # Append a space after every punctuation mark so that the mark stays
        # attached to the word before it and is kept in the Markov chain
        # instead of being dropped.
        for symbol in (',', '.', ';', ':'):
            text = text.replace(symbol, symbol + " ")
        # re.split yields '' when the text starts or ends with a space (which
        # is always the case after trailing punctuation); such empty tokens
        # must not become states of the chain.
        return [word for word in re.split(' +', text) if word]

    def getDic(self, words):
        """Build the transition table for the token sequence *words*.

        Returns a dict mapping each token that has at least one successor to
        a dict of ``{successor: occurrence count}``.
        """
        dic = {}
        for previous, current in zip(words, words[1:]):
            followers = dic.setdefault(previous, {})
            followers[current] = followers.get(current, 0) + 1
        return dic

    def getSum(self, dic):
        """Return the total of all counts in the successor table *dic*.

        The total is computed on demand and *dic* is left unmodified, so the
        transition table only ever contains real tokens as keys.
        """
        return sum(dic.values())

    def nextWord(self, word):
        """Randomly pick a successor of *word*, weighted by observed counts.

        Raises KeyError if *word* has no recorded successor.
        """
        followers = self.dic[word]
        # Draw a position in 1..total and walk the counts until it is used up;
        # each successor is thereby chosen with probability count / total.
        k = randint(1, self.getSum(followers))
        for candidate, count in followers.items():
            k -= count
            if k <= 0:
                return candidate

    def genSentence(self, begin='I', length=30):
        """Generate a space-joined sentence of at most *length* words.

        The sentence always starts with *begin*. Generation stops early when
        the current word has no recorded successor (for example when *begin*
        does not occur in the text, or the chain reaches the final word of
        the text).
        """
        li = [begin]
        current = begin
        for _ in range(1, length):
            if current not in self.dic:
                # Dead end: nothing ever followed this word in the text.
                break
            current = self.nextWord(current)
            li.append(current)
        return ' '.join(li)