From ee2b1133bb37a300a02fb0225fd81fb99b4c08ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2012 16:55:08 -0500 Subject: [PATCH] Algorithm for comparing article content against a suspected source using MarkovChains --- earwigbot/wiki/copyright.py | 70 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py index 4a1fb3e..c5f726b 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyright.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from collections import defaultdict from functools import partial from gzip import GzipFile from json import loads @@ -35,7 +36,7 @@ except ImportError: from earwigbot.wiki.exceptions import * -class CopyvioCheckResult(object): +class _CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries): self.violation = violation self.confidence = confidence @@ -43,10 +44,47 @@ class CopyvioCheckResult(object): self.queries = queries def __repr__(self): - r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" return r.format(self.violation, self.confidence, self.url, self.queries) +class _MarkovChain(object): + START = "MRKV_CHAIN_START" + END = "MRKV_CHAIN_END" + + def __init__(self, text): + self.text = text + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + words = text.split() + prev = self.START + for word in words: + self.chain[prev][word] += 1 + prev = word + self.chain[word][self.END] += 1 + + def size(self): + count = 0 + for node in self.chain.itervalues(): + for hits in node.itervalues(): + count += hits + return count + + +class _MarkovChainIntersection(_MarkovChain): + def __init__(self, mc1, mc2): + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + c1 = mc1.chain + c2 = mc2.chain + + for word, nodes1 in c1.iteritems(): + if word in c2: + nodes2 = c2[word] + for node, count1 in nodes1.iteritems(): + if node in nodes2: + count2 = nodes2[node] + self.chain[word][node] = min(count1, count2) + + class CopyrightMixin(object): """ EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -136,10 +174,13 @@ class CopyrightMixin(object): return [] return [result["url"] for result in results] - def _copyvio_strip_content(self, content): + def _copyvio_strip_html(self, html): + return html + + def _copyvio_strip_article(self, content): return content - def _copyvio_chunk_content(self, content): + def _copyvio_chunk_article(self, content): return [content] def _copyvio_compare_content(self, content, url): @@ -147,14 +188,17 @@ class CopyrightMixin(object): if not html: return 0 - confidence = 0 - return confidence + article = _MarkovChain(content) + source = _MarkovChain(self._copyvio_strip_html(html)) + delta = _MarkovChainIntersection(article, source) + + return delta.size() / min(article.size(), source.size()) def copyvio_check(self, engine, credentials, min_confidence=0.75, max_queries=-1, interquery_sleep=1, force=False): """Check the page for copyright violations. - Returns a CopyvioCheckResult object, with four useful attributes: + Returns a _CopyvioCheckResult object with four useful attributes: "violation", "confidence", "url", and "queries". "confidence" is a number between 0 and 1; if it is less than "min_confidence", we could not find any indication of a violation (so "violation" will be False @@ -182,8 +226,8 @@ class CopyrightMixin(object): best_match = None num_queries = 0 content = self.get(force) - clean = self._copyvio_strip_content(content) - chunks = self._copyvio_chunk_content(clean) + clean = self._copyvio_strip_article(content) + chunks = self._copyvio_chunk_article(clean) last_query = time() while (chunks and best_confidence < min_confidence and @@ -192,7 +236,7 @@ class CopyrightMixin(object): urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) - confidence = self._copyvio_compare_content(content, url) + confidence = self._copyvio_compare_content(clean, url) if confidence > best_confidence: best_confidence = confidence best_match = url @@ -203,7 +247,7 @@ class CopyrightMixin(object): last_query = time() if best_confidence >= min_confidence: # violation? - vi = True + v = True else: - vi = False - return CopyvioCheckResult(vi, best_confidence, best_match, num_queries) + v = False + return _CopyvioCheckResult(v, best_confidence, best_match, num_queries)