From e6a381f3f7eb4ac37171c44b5ee930bdf4dda354 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 20 Mar 2012 12:25:45 -0400 Subject: [PATCH] Restructuring copyvio stuff as its own package. --- .../wiki/{copyright.py => copyvios/__init__.py} | 83 ++++++---------------- earwigbot/wiki/copyvios/markov.py | 63 ++++++++++++++++ earwigbot/wiki/page.py | 18 ++--- 3 files changed, 93 insertions(+), 71 deletions(-) rename earwigbot/wiki/{copyright.py => copyvios/__init__.py} (82%) create mode 100644 earwigbot/wiki/copyvios/markov.py diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyvios/__init__.py similarity index 82% rename from earwigbot/wiki/copyright.py rename to earwigbot/wiki/copyvios/__init__.py index c003ebb..68b4134 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,11 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from collections import defaultdict from functools import partial from gzip import GzipFile from json import loads -from re import sub, UNICODE from StringIO import StringIO from time import sleep, time from urllib import quote_plus, urlencode @@ -36,8 +34,9 @@ except ImportError: oauth = None from earwigbot.wiki.exceptions import * +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -class _CopyvioCheckResult(object): +class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -48,51 +47,11 @@ class _CopyvioCheckResult(object): self.delta_chain = chains[1] def __repr__(self): - r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" return r.format(self.violation, self.confidence, self.url, self.queries) -class _MarkovChain(object): - START = -1 - END = -2 - - def __init__(self, text): - self.text = text - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() - prev = self.START - for word in words: - self.chain[prev][word] += 1 - prev = word - try: # This won't work if the source text is completely blank - self.chain[word][self.END] += 1 - except KeyError: - pass - - def size(self): - count = 0 - for node in self.chain.itervalues(): - for hits in node.itervalues(): - count += hits - return count - - -class _MarkovChainIntersection(_MarkovChain): - def __init__(self, mc1, mc2): - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - c1 = mc1.chain - c2 = mc2.chain - - for word, nodes1 in c1.iteritems(): - if word in c2: - nodes2 = c2[word] - for node, count1 in nodes1.iteritems(): - if node in nodes2: - count2 = nodes2[node] - self.chain[word][node] = min(count1, count2) - - -class CopyrightMixin(object): +class CopyvioMixin(object): """ 
EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -220,8 +179,8 @@ class CopyrightMixin(object): if not html: return 0 - source = _MarkovChain(self._copyvio_strip_html(html)) - delta = _MarkovChainIntersection(article, source) + source = MarkovChain(self._copyvio_strip_html(html)) + delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, @@ -255,17 +214,17 @@ class CopyrightMixin(object): best_confidence = 0 best_match = None num_queries = 0 - empty = _MarkovChain("") - best_chains = (empty, _MarkovChainIntersection(empty, empty)) + empty = MarkovChain("") + best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) clean = self._copyvio_strip_article(content) chunks = self._copyvio_chunk_article(clean, max_queries) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) last_query = time() if article_chain.size() < 20: # Auto-fail very small articles - return _CopyvioCheckResult(False, best_confidence, best_match, - num_queries, article_chain, best_chains) + return CopyvioCheckResult(False, best_confidence, best_match, + num_queries, article_chain, best_chains) while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): @@ -288,8 +247,8 @@ class CopyrightMixin(object): v = True else: v = False - return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + return CopyvioCheckResult(v, best_confidence, best_match, num_queries, + article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5, force=False): """Check the page like copyvio_check(), but against a specific URL. 
@@ -298,7 +257,7 @@ class CopyrightMixin(object): comparison is made using Markov chains and the result is returned in a _CopyvioCheckResult object - without using a search engine, as the suspected "violated" URL is supplied from the start. - + Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a search is done, the resulting URL is stored in a cache for 24 hours so @@ -313,12 +272,12 @@ class CopyrightMixin(object): """ content = self.get(force) clean = self._copyvio_strip_article(content) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) if confidence >= min_confidence: is_violation = True else: is_violation = False - return _CopyvioCheckResult(is_violation, confidence, url, 0, - article_chain, chains) + return CopyvioCheckResult(is_violation, confidence, url, 0, + article_chain, chains) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py new file mode 100644 index 0000000..4e77ebc --- /dev/null +++ b/earwigbot/wiki/copyvios/markov.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from re import sub, UNICODE

class MarkovChain(object):
    """Word-transition counts for a piece of text.

    ``chain`` is a two-level mapping: ``chain[a][b]`` is the number of times
    word ``b`` directly follows word ``a`` in the normalised text. The
    sentinels ``START`` and ``END`` stand in for "before the first word" and
    "after the last word" respectively.
    """
    START = -1
    END = -2

    def __init__(self, text):
        """Build the chain from ``text``.

        The text is lower-cased and every character that is not a word
        character, whitespace or a hyphen is removed before it is split on
        whitespace. Blank text (no words at all) produces an empty chain.
        """
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(int))
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        # Link the final word to END. Blank text has no words, so there is
        # nothing to link and the chain is left empty (size() == 0).
        if words:
            self.chain[prev][self.END] += 1

    def size(self):
        """Return the total number of transitions recorded in the chain."""
        return sum(sum(node.values()) for node in self.chain.values())


class MarkovChainIntersection(MarkovChain):
    """The transitions that two Markov chains have in common.

    Every transition present in both ``mc1`` and ``mc2`` is kept, with the
    smaller of the two counts. ``size()`` is inherited from MarkovChain.
    """

    def __init__(self, mc1, mc2):
        """Build the intersection of the chains ``mc1`` and ``mc2``."""
        # MarkovChain.__init__() is not called: there is no text to parse, so
        # only ``chain`` is set on the instance.
        self.chain = defaultdict(lambda: defaultdict(int))
        c1 = mc1.chain
        c2 = mc2.chain

        for word, nodes1 in c1.items():
            # Membership tests (rather than indexing) keep the defaultdicts
            # of the input chains from growing as a side effect.
            if word not in c2:
                continue
            nodes2 = c2[word]
            for node, count1 in nodes1.items():
                if node in nodes2:
                    self.chain[word][node] = min(count1, nodes2[node])
copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,10 +25,10 @@ import re from time import gmtime, strftime from urllib import quote -from earwigbot.wiki.copyright import CopyrightMixin +from earwigbot.wiki.copyvios import CopyvioMixin from earwigbot.wiki.exceptions import * -class Page(CopyrightMixin): +class Page(CopyvioMixin): """ EarwigBot's Wiki Toolset: Page Class @@ -264,7 +264,7 @@ class Page(CopyrightMixin): If `params` is given, we'll use it as our API query parameters. Otherwise, we'll build params using the given kwargs via _build_edit_params(). - + We'll then try to do the API query, and catch any errors the API raises in _handle_edit_errors(). We'll then throw these back as subclasses of EditError. @@ -275,7 +275,7 @@ class Page(CopyrightMixin): if not self._token: e = "You don't have permission to edit this page." raise PermissionsError(e) - + # Weed out invalid pages before we get too far: self._force_validity() @@ -336,7 +336,7 @@ class Page(CopyrightMixin): # Page does not exist; don't edit if it already exists: params["createonly"] = "true" else: - params["recreate"] = "true" + params["recreate"] = "true" return params