From df7868da3eff3cb05f67fc6618f535f9b4ba6218 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 11 Feb 2012 17:57:47 -0500 Subject: [PATCH] Updates to copyright violation stuff. --- earwigbot/wiki/copyright.py | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py index c5f726b..0376562 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyright.py @@ -37,20 +37,27 @@ except ImportError: from earwigbot.wiki.exceptions import * class _CopyvioCheckResult(object): - def __init__(self, violation, confidence, url, queries): + def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence self.url = url self.queries = queries + self.article_chain = article + self.source_chain = chains[0] + self.delta_chain = chains[1] def __repr__(self): - r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" - return r.format(self.violation, self.confidence, self.url, self.queries) + r = ", ".join(("_CopyvioCheckResult(violation={0!r}", + "confidence={1!r}", "url={2!r}", "queries={3|r}", + "article={4|r}", "chains={5!r})")) + return r.format(self.violation, self.confidence, self.url, + self.queries, self.article_chain, + (self.source_chain, self.delta_chain)) class _MarkovChain(object): - START = "MRKV_CHAIN_START" - END = "MRKV_CHAIN_END" + START = -1 + END = -2 def __init__(self, text): self.text = text @@ -60,7 +67,10 @@ class _MarkovChain(object): for word in words: self.chain[prev][word] += 1 prev = word - self.chain[word][self.END] += 1 + try: # This won't work if the source text is completely blank + self.chain[word][self.END] += 1 + except KeyError: + pass def size(self): count = 0 @@ -180,21 +190,19 @@ class CopyrightMixin(object): def _copyvio_strip_article(self, content): return content - def _copyvio_chunk_article(self, content): + def _copyvio_chunk_article(self, content, max_chunks): return [content] - def _copyvio_compare_content(self, content, url): + def _copyvio_compare_content(self, article, url): html = self._open_url_ignoring_errors(url) if not html: return 0 - article = _MarkovChain(content) source = _MarkovChain(self._copyvio_strip_html(html)) delta = _MarkovChainIntersection(article, source) + return delta.size() / article.size(), (source, delta) - return delta.size() / min(article.size(), source.size()) - - def copyvio_check(self, engine, credentials, min_confidence=0.75, + def copyvio_check(self, engine, credentials, min_confidence=0.5, max_queries=-1, interquery_sleep=1, force=False): """Check the page for copyright violations. @@ -225,9 +233,12 @@ class CopyrightMixin(object): best_confidence = 0 best_match = None num_queries = 0 + empty = _MarkovChain("") + best_chains = (empty, _MarkovChainIntersection(empty, empty)) content = self.get(force) clean = self._copyvio_strip_article(content) - chunks = self._copyvio_chunk_article(clean) + chunks = self._copyvio_chunk_article(clean, max_queries) + article_chain = _MarkovChain(clean) last_query = time() while (chunks and best_confidence < min_confidence and @@ -236,10 +247,11 @@ class CopyrightMixin(object): urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) - confidence = self._copyvio_compare_content(clean, url) - if confidence > best_confidence: - best_confidence = confidence + conf, chains = self._copyvio_compare_content(article_chain, url) + if conf > best_confidence: + best_confidence = conf best_match = url + best_chains = chains num_queries += 1 diff = time() - last_query if diff < interquery_sleep: @@ -250,4 +262,5 @@ class CopyrightMixin(object): v = True else: v = False - return _CopyvioCheckResult(v, best_confidence, best_match, num_queries) + return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, + article_chain, best_chains)