Преглед на файлове

Make Markov chain degree-independent. Testing trigrams.

tags/v0.1^2
Ben Kurtovic преди 12 години
родител
ревизия
bf1ad08dc6
променени са 1 файла, в които са добавени 8 реда и са изтрити 9 реда
  1. +8
    -9
      earwigbot/wiki/copyvios/markov.py

+ 8
- 9
earwigbot/wiki/copyvios/markov.py Целия файл

@@ -26,22 +26,21 @@ from re import sub, UNICODE
__all__ = ["MarkovChain", "MarkovChainIntersection"] __all__ = ["MarkovChain", "MarkovChainIntersection"]


class MarkovChain(object): class MarkovChain(object):
"""Implements a basic bigram Markov chain of words."""
"""Implements a basic ngram Markov chain of words."""
START = -1 START = -1
END = -2 END = -2
degree = 3 # 2 for bigrams, 3 for trigrams, etc.


def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START
for word in words:
self.chain[prev][word] += 1
prev = word
try: # This won't work if the source text is completely blank
self.chain[word][self.END] += 1
except KeyError:
pass

padding = self.degree - 1
words = ([self.START] * padding) + words + ([self.END] * padding)
for i in range(len(words) - self.degree + 1):
last = i + self.degree - 1
self.chain[words[i:last]][last] += 1


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the MarkovChain.""" """Return the canonical string representation of the MarkovChain."""


Зареждане…
Отказ
Запис