Restructuring copyvio stuff as its own package.

12 年之前 · e6a381f3f7
--- a/earwigbot/wiki/copyvios/init.py
+++ b/earwigbot/wiki/copyvios/init.py
@@ -1,17 +1,17 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
 # 
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is 
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 # 
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 # 
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,11 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 from collections import defaultdict
 from functools import partial
 from gzip import GzipFile
 from json import loads
 from re import sub, UNICODE
 from StringIO import StringIO
 from time import sleep, time
 from urllib import quote_plus, urlencode
@@ -36,8 +34,9 @@ except ImportError:
    oauth = None

 from earwigbot.wiki.exceptions import *
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection

 class _CopyvioCheckResult(object):
 class CopyvioCheckResult(object):
    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
@@ -48,51 +47,11 @@ class _CopyvioCheckResult(object):
        self.delta_chain = chains[1]

    def __repr__(self):
        r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
        r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
        return r.format(self.violation, self.confidence, self.url, self.queries)


 class _MarkovChain(object):
    START = -1
    END = -2

    def __init__(self, text):
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        try:  # This won't work if the source text is completely blank
            self.chain[word][self.END] += 1
        except KeyError:
            pass

    def size(self):
        count = 0
        for node in self.chain.itervalues():
            for hits in node.itervalues():
                count += hits
        return count


 class _MarkovChainIntersection(_MarkovChain):
    def __init__(self, mc1, mc2):
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        c1 = mc1.chain
        c2 = mc2.chain

        for word, nodes1 in c1.iteritems():
            if word in c2:
                nodes2 = c2[word]
                for node, count1 in nodes1.iteritems():
                    if node in nodes2:
                        count2 = nodes2[node]
                        self.chain[word][node] = min(count1, count2)


 class CopyrightMixin(object):
 class CopyvioMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

@@ -220,8 +179,8 @@ class CopyrightMixin(object):
        if not html:
            return 0

        source = _MarkovChain(self._copyvio_strip_html(html))
        delta = _MarkovChainIntersection(article, source)
        source = MarkovChain(self._copyvio_strip_html(html))
        delta = MarkovChainIntersection(article, source)
        return float(delta.size()) / article.size(), (source, delta)

    def copyvio_check(self, min_confidence=0.5, max_queries=-1,
@@ -255,17 +214,17 @@ class CopyrightMixin(object):
        best_confidence = 0
        best_match = None
        num_queries = 0
        empty = _MarkovChain("")
        best_chains = (empty, _MarkovChainIntersection(empty, empty))
        empty = MarkovChain("")
        best_chains = (empty, MarkovChainIntersection(empty, empty))
        content = self.get(force)
        clean = self._copyvio_strip_article(content)
        chunks = self._copyvio_chunk_article(clean, max_queries)
        article_chain = _MarkovChain(clean)
        article_chain = MarkovChain(clean)
        last_query = time()

        if article_chain.size() < 20:  # Auto-fail very small articles
            return _CopyvioCheckResult(False, best_confidence, best_match,
                                       num_queries, article_chain, best_chains)
            return CopyvioCheckResult(False, best_confidence, best_match,
                                      num_queries, article_chain, best_chains)

        while (chunks and best_confidence < min_confidence and
               (max_queries < 0 or num_queries < max_queries)):
@@ -288,8 +247,8 @@ class CopyrightMixin(object):
            v = True
        else:
            v = False
        return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
        return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                  article_chain, best_chains)

    def copyvio_compare(self, url, min_confidence=0.5, force=False):
        """Check the page like copyvio_check(), but against a specific URL.
@@ -298,7 +257,7 @@ class CopyrightMixin(object):
        comparison is made using Markov chains and the result is returned in a
        CopyvioCheckResult object - without using a search engine, as the
        suspected "violated" URL is supplied from the start.
        

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Toolserver site. After a
        search is done, the resulting URL is stored in a cache for 24 hours so
@@ -313,12 +272,12 @@ class CopyrightMixin(object):
        """
        content = self.get(force)
        clean = self._copyvio_strip_article(content)
        article_chain = _MarkovChain(clean)
        article_chain = MarkovChain(clean)
        confidence, chains = self._copyvio_compare_content(article_chain, url)

        if confidence >= min_confidence:
            is_violation = True
        else:
            is_violation = False
        return _CopyvioCheckResult(is_violation, confidence, url, 0,
                                   article_chain, chains)
        return CopyvioCheckResult(is_violation, confidence, url, 0,
                                  article_chain, chains)
--- a/earwigbot/wiki/copyvios/markov.py
+++ b/earwigbot/wiki/copyvios/markov.py
@@ -0,0 +1,63 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 from collections import defaultdict
 from re import sub, UNICODE

 class MarkovChain(object):
     """Word-level Markov chain built from a piece of text.

     ``chain`` maps each word to a mapping of the words seen directly after
     it, with a count of how often that transition occurred. ``START`` and
     ``END`` are sentinel keys marking the beginning and the end of the text.
     """

     START = -1
     END = -2

     def __init__(self, text):
         """Build the chain from *text*.

         The text is lowercased, and every character that is not a word
         character, whitespace or a hyphen is removed before the text is
         split on whitespace into words. Blank text yields an empty chain.
         """
         self.text = text
         self.chain = defaultdict(lambda: defaultdict(int))
         words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
         prev = self.START
         for word in words:
             self.chain[prev][word] += 1
             prev = word
         # Close the chain only when at least one word was seen. Reading the
         # loop variable here instead would raise UnboundLocalError for blank
         # text, because the loop body never ran and never bound it.
         if words:
             self.chain[prev][self.END] += 1

     def size(self):
         """Return the total number of transitions recorded in the chain."""
         return sum(sum(node.values()) for node in self.chain.values())


 class MarkovChainIntersection(MarkovChain):
     """Chain holding only the transitions that two other chains share.

     A transition is kept when it appears in both source chains, and its
     count is the smaller of the two source counts. ``size()`` is inherited
     from ``MarkovChain``. The base ``__init__`` is not called, so no
     ``text`` attribute is set on instances of this class.
     """

     def __init__(self, mc1, mc2):
         """Intersect the ``chain`` mappings of *mc1* and *mc2*."""
         self.chain = defaultdict(lambda: defaultdict(int))
         other_chain = mc2.chain

         # items() is used rather than iteritems() so the loops run on both
         # Python 2 and Python 3.
         for word, followers in mc1.chain.items():
             if word not in other_chain:
                 continue
             other_followers = other_chain[word]
             for node, count in followers.items():
                 if node in other_followers:
                     self.chain[word][node] = min(count,
                                                  other_followers[node])
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -1,17 +1,17 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
 # 
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is 
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 # 
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 # 
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -25,10 +25,10 @@ import re
 from time import gmtime, strftime
 from urllib import quote

 from earwigbot.wiki.copyright import CopyrightMixin
 from earwigbot.wiki.copyvios import CopyvioMixin
 from earwigbot.wiki.exceptions import *

 class Page(CopyrightMixin):
 class Page(CopyvioMixin):
    """
    EarwigBot's Wiki Toolset: Page Class

@@ -264,7 +264,7 @@ class Page(CopyrightMixin):
        If `params` is given, we'll use it as our API query parameters.
        Otherwise, we'll build params using the given kwargs via
        _build_edit_params().
        

        We'll then try to do the API query, and catch any errors the API raises
        in _handle_edit_errors(). We'll then throw these back as subclasses of
        EditError.
@@ -275,7 +275,7 @@ class Page(CopyrightMixin):
        if not self._token:
            e = "You don't have permission to edit this page."
            raise PermissionsError(e)
        

        # Weed out invalid pages before we get too far:
        self._force_validity()

@@ -336,7 +336,7 @@ class Page(CopyrightMixin):
                # Page does not exist; don't edit if it already exists:
                params["createonly"] = "true"
        else:
            params["recreate"] = "true"            
            params["recreate"] = "true"

        return params