@@ -15,7 +15,8 @@ v0.2 (unreleased):
 - Added copyvio detector functionality: specifying a max time for checks;
   improved exclusion support. URL loading and parsing is parallelized to speed
   up check times, with a multi-threaded worker model that avoids concurrent
-  requests to the same domain. Fixed assorted bugs.
+  requests to the same domain. Improvements to the comparison algorithm. Fixed
+  assorted bugs.
 - Added support for Wikimedia Labs when creating a config file.
 - Added and improved lazy importing for various dependencies.
 - Fixed a bug in job scheduling.
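
Note on the worker model mentioned above: the _CopyvioWorker hunks below have
each worker thread take exclusive ownership of one site's URL queue, which is
how concurrent requests to the same host are avoided. A minimal standalone
sketch of that idea; the names here (DomainQueues, push, acquire) are
illustrative, not the actual earwigbot API:

    from collections import deque
    from threading import Lock
    from urlparse import urlparse

    class DomainQueues(object):
        """Queue URLs per domain so one thread owns each host at a time."""

        def __init__(self):
            self.lock = Lock()
            self.queues = {}           # netloc -> deque of URLs
            self.unassigned = deque()  # netlocs not yet owned by a worker

        def push(self, url):
            netloc = urlparse(url).netloc
            with self.lock:
                if netloc not in self.queues:
                    self.queues[netloc] = deque()
                    self.unassigned.append(netloc)
                self.queues[netloc].append(url)

        def acquire(self):
            # A worker takes a whole domain's queue, guaranteeing at most
            # one in-flight request per host.
            with self.lock:
                return self.unassigned.popleft() if self.unassigned else None
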
@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot::
         +-- UnknownSearchEngineError
         +-- UnsupportedSearchEngineError
         +-- SearchQueryError
+        +-- ParserExclusionError
 """

 class EarwigBotError(Exception):
@@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError):
     :py:attr:`config.wiki["search"]["engine"]`.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """

 class UnsupportedSearchEngineError(CopyvioCheckError):
@@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError):
     couldn't be imported.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """

 class SearchQueryError(CopyvioCheckError):
     """Some error occurred while doing a search query.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """
+
+class ParserExclusionError(CopyvioCheckError):
+    """A content parser detected that the given source should be excluded.
+
+    Raised internally by :py:meth:`Page.copyvio_check
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
+    exposed in client code.
+    """
@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
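
For context, copyvio_compare() runs the same workspace against a single known
URL, so one worker suffices and no search queries are made. A hedged usage
sketch; the result attributes shown follow from CopyvioCheckResult's name but
are not spelled out in this diff:

    result = page.copyvio_compare("http://example.com/suspected-mirror.html")
    if result.violation:
        print "confidence: {0}".format(result.confidence)
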
@@ -28,7 +28,7 @@ from urlparse import urlparse

 from earwigbot import exceptions

-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]

 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki

@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }

+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
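
MIRROR_HINTS is exported so the HTML parser (below) can flag pages that link
back into a MediaWiki installation. The test is a plain substring match over
link attribute values:

    >>> MIRROR_HINTS = ["wikipedia.org/w/"]
    >>> href = "https://en.wikipedia.org/w/index.php?title=Foo&oldid=1"
    >>> any(hint in href for hint in MIRROR_HINTS)
    True
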
@@ -30,7 +30,7 @@ class MarkovChain(object):
     """Implements a basic ngram Markov chain of words."""
     START = -1
     END = -2
-    degree = 3  # 2 for bigrams, 3 for trigrams, etc.
+    degree = 5  # 2 for bigrams, 3 for trigrams, etc.

     def __init__(self, text):
         self.text = text
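
Raising `degree` from 3 to 5 means chains are built over five-word windows,
so only longer verbatim runs register as matches; this trades some recall for
fewer false positives from short stock phrases. A standalone sketch of the
windowing, assuming the chain pads with degree - 1 START/END markers, as the
class constants suggest:

    def ngrams(words, degree=5, start=-1, end=-2):
        padded = [start] * (degree - 1) + words + [end] * (degree - 1)
        return [tuple(padded[i:i + degree])
                for i in range(len(padded) - degree + 1)]

    # ngrams("copyright violations are bad".split()) yields 5-tuples from
    # (-1, -1, -1, -1, 'copyright') through ('bad', -2, -2, -2, -2).
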
@@ -27,6 +27,8 @@ from StringIO import StringIO
 import mwparserfromhell

 from earwigbot import importer
+from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -58,6 +60,21 @@ class _BaseTextParser(object):
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
+    TEMPLATE_MERGE_THRESHOLD = 35
+
+    def _merge_templates(self, code):
+        """Merge template contents into wikicode when the values are long."""
+        for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
+            chunks = []
+            for param in template.params:
+                if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
+                    self._merge_templates(param.value)
+                    chunks.append(param.value)
+            if chunks:
+                subst = u" ".join(map(unicode, chunks))
+                code.replace(template, u" " + subst + u" ")
+            else:
+                code.remove(template)

     def strip(self):
         """Clean the page's raw text by removing templates and formatting.

@@ -94,6 +111,9 @@ class ArticleTextParser(_BaseTextParser):
         for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
             remove(wikicode, tag)

+        # Merge in template contents when the values are long:
+        self._merge_templates(wikicode)
+
         clean = wikicode.strip_code(normalize=True, collapse=True)
         self.clean = re.sub("\n\n+", "\n", clean).strip()
         return self.clean
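
The effect of _merge_templates() is that long parameter values (quotes, image
captions, and the like) survive into the stripped text that gets compared
against sources, while templates with only short boilerplate parameters are
dropped entirely. A hedged illustration of the same logic as a standalone
snippet, assuming mwparserfromhell is installed:

    import mwparserfromhell

    text = u"{{Quote|text=A sentence well over thirty-five characters long.|sign=X}}"
    code = mwparserfromhell.parse(text)
    tmpl = code.filter_templates()[0]
    # Only `text` meets the 35-character threshold, so the template is
    # replaced by that parameter's value alone:
    keep = [p.value for p in tmpl.params if len(p.value) >= 35]
    code.replace(tmpl, u" " + u" ".join(map(unicode, keep)) + u" ")
    print unicode(code).strip()
    # -> A sentence well over thirty-five characters long.
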
@@ -167,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return the actual text contained within an HTML document.

         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)

-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""

+        if kwargs["detect_exclusions"]:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
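
The mirror check leans on BeautifulSoup's callable attribute filters: bs4
invokes `func` with each tag's href/src value (None when the attribute is
absent), so the `attr and any(...)` guard skips tags without it. A standalone
sketch, assuming bs4 and lxml are installed:

    import bs4

    MIRROR_HINTS = ["wikipedia.org/w/"]
    html = '<body><a href="//en.wikipedia.org/w/index.php">edit</a></body>'
    soup = bs4.BeautifulSoup(html, "lxml")
    func = lambda attr: attr and any(hint in attr for hint in MIRROR_HINTS)
    print bool(soup.find_all(href=func) or soup.find_all(src=func))  # True
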
@@ -200,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()

@@ -226,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"

-    def parse(self):
+    def parse(self, **kwargs):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
@@ -40,16 +40,21 @@ class CopyvioSource(object):
     - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
     - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
     - :py:attr:`skipped`: whether this URL was skipped during the check
+    - :py:attr:`excluded`: whether this URL was in the exclusions list
     """

-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
+        self.excluded = False

         self._event1 = Event()
         self._event2 = Event()

@@ -57,11 +62,15 @@ class CopyvioSource(object):
     def __repr__(self):
         """Return the canonical string representation of the source."""
-        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
-        return res.format(self.url, self.confidence, self.skipped)
+        res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
+               "excluded={3!r})")
+        return res.format(
+            self.url, self.confidence, self.skipped, self.excluded)

     def __str__(self):
         """Return a nice string representation of the source."""
+        if self.excluded:
+            return "<CopyvioSource ({0}, excluded)>".format(self.url)
         if self.skipped:
             return "<CopyvioSource ({0}, skipped)>".format(self.url)
         res = "<CopyvioSource ({0} with {1} conf)>"
@@ -34,6 +34,7 @@ from time import time
 from urllib2 import build_opener, URLError

 from earwigbot import importer
+from earwigbot.exceptions import ParserExclusionError
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

@@ -155,7 +156,8 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None

-        return handler(content).parse()
+        return handler(content).parse(
+            detect_exclusions=source.detect_exclusions)

     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""

@@ -218,9 +220,15 @@ class _CopyvioWorker(object):
             except StopIteration:
                 self._logger.debug("Exiting: got stop signal")
                 return
-            text = self._open_url(source)
-            chain = MarkovChain(text) if text else None
-            source.workspace.compare(source, chain)
+            try:
+                text = self._open_url(source)
+            except ParserExclusionError:
+                source.skipped = source.excluded = True
+                source.finish_work()
+            else:
+                chain = MarkovChain(text) if text else None
+                source.workspace.compare(source, chain)

     def start(self):
         """Start the copyvio worker in a new thread."""
@@ -233,7 +241,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""

     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False

@@ -247,7 +256,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}

         if _is_globalized:
             self._queues = _global_queues
@@ -311,11 +321,15 @@ class CopyvioWorkspace(object):
             if url in self._handled_urls:
                 continue
             self._handled_urls.add(url)
-            if exclude_check and exclude_check(url):
-                continue

             source = CopyvioSource(url=url, **self._source_args)
             self.sources.append(source)
+
+            if exclude_check and exclude_check(url):
+                self._logger.debug(u"enqueue(): exclude {0}".format(url))
+                source.excluded = True
+                source.skip()
+                continue

             if self._short_circuit and self.finished:
                 self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                 source.skip()
@@ -371,6 +385,8 @@ class CopyvioWorkspace(object):
         def cmpfunc(s1, s2):
             if s2.confidence != s1.confidence:
                 return 1 if s2.confidence > s1.confidence else -1
+            if s2.excluded != s1.excluded:
+                return 1 if s1.excluded else -1
             return int(s1.skipped) - int(s2.skipped)

         self.sources.sort(cmpfunc)
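
The comparator orders results best-first: descending confidence, then
non-excluded before excluded, then unskipped before skipped, so excluded
mirrors sink to the bottom at equal confidence. A quick demonstration with
stand-in objects (not the real CopyvioSource):

    from collections import namedtuple

    S = namedtuple("S", "confidence excluded skipped url")
    sources = [S(0.0, True, True, "mirror"), S(0.9, False, False, "hit"),
               S(0.0, False, True, "skip"), S(0.0, False, False, "miss")]

    def cmpfunc(s1, s2):
        if s2.confidence != s1.confidence:
            return 1 if s2.confidence > s1.confidence else -1
        if s2.excluded != s1.excluded:
            return 1 if s1.excluded else -1
        return int(s1.skipped) - int(s2.skipped)

    print [s.url for s in sorted(sources, cmp=cmpfunc)]
    # -> ['hit', 'miss', 'skip', 'mirror']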