
Add mirror detection logic to parsers; fixes.

tags/v0.2
Ben Kurtovic, 8 years ago
commit 03910b6cb5

5 changed files with 30 additions and 13 deletions:

  1. earwigbot/wiki/copyvios/__init__.py    (+2, -2)
  2. earwigbot/wiki/copyvios/exclusions.py  (+3, -1)
  3. earwigbot/wiki/copyvios/parsers.py     (+16, -6)
  4. earwigbot/wiki/copyvios/result.py      (+4, -1)
  5. earwigbot/wiki/copyvios/workers.py     (+5, -3)

earwigbot/wiki/copyvios/__init__.py (+2, -2)

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
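
In effect, mirror detection is switched on only for the full article check,
which scans arbitrary pages found via search engines; the single-URL
comparison keeps the default (off), presumably because a caller comparing
against a specific URL wants that result even if the page is a mirror. A
minimal sketch of the two call patterns after this change:

    # Full check: candidate pages come from a search engine, so scan
    # each one for mirror fingerprints.
    workspace = CopyvioWorkspace(
        article, min_confidence, max_time, self._logger, self._addheaders,
        short_circuit=short_circuit, detect_exclusions=True)

    # Single-URL comparison: detection stays off (the default) and one
    # worker suffices, now spelled num_workers=1 for clarity.
    workspace = CopyvioWorkspace(
        article, min_confidence, max_time, self._logger, self._addheaders,
        max_time, num_workers=1)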


earwigbot/wiki/copyvios/exclusions.py (+3, -1)

@@ -28,7 +28,7 @@ from urlparse import urlparse
 
 from earwigbot import exceptions
 
-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
 
 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }
 
+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
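
MIRROR_HINTS holds URL fragments that legitimate third-party pages rarely
link to but wholesale Wikipedia mirrors almost always do: anything under
wikipedia.org/w/ is the MediaWiki script path (index.php, api.php, load.php),
which copied pages drag along in their edit links and stylesheets. A
standalone sketch of the substring test the parser applies to each href/src
value (the attr_matches name here is illustrative):

    MIRROR_HINTS = ["wikipedia.org/w/"]

    def attr_matches(attr):
        # True when a tag's href/src value contains any known hint;
        # bs4 passes None for tags that lack the attribute.
        return bool(attr) and any(hint in attr for hint in MIRROR_HINTS)

    assert attr_matches("//en.wikipedia.org/w/index.php?title=X&action=edit")
    assert not attr_matches("https://example.com/essay")
    assert not attr_matches(None)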


earwigbot/wiki/copyvios/parsers.py (+16, -6)

@@ -28,6 +28,7 @@ import mwparserfromhell
 
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.copyvios.exclusions import MIRROR_HINTS
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)
 
-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""
 
+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
@@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
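
A self-contained reproduction of the new _HTMLParser branch, runnable with
bs4 installed (html.parser is used here to avoid the lxml dependency, and
MirrorDetected stands in for earwigbot's ParserExclusionError; the sample
HTML is invented):

    import bs4

    MIRROR_HINTS = ["wikipedia.org/w/"]

    class MirrorDetected(Exception):
        """Stand-in for earwigbot.exceptions.ParserExclusionError."""

    def parse_html(text, detect_exclusions=False):
        soup = bs4.BeautifulSoup(text, "html.parser")
        if not soup.body:
            return ""
        if detect_exclusions:
            # A callable passed as a keyword filter to find_all() receives
            # each tag's attribute value (None if the tag lacks it).
            func = lambda attr: attr and any(
                hint in attr for hint in MIRROR_HINTS)
            if soup.find_all(href=func) or soup.find_all(src=func):
                raise MirrorDetected()
        return soup.body.get_text()

    html = ('<html><body><p>Copied article text.</p>'
            '<a href="https://en.wikipedia.org/w/index.php?title=Foo">'
            'edit</a></body></html>')
    print(parse_html(html))  # detection off by default: text is returned
    try:
        parse_html(html, detect_exclusions=True)
    except MirrorDetected:
        print("looks like a Wikipedia mirror; excluding it")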


earwigbot/wiki/copyvios/result.py (+4, -1)

@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`: whether this URL was in the exclusions list
     """
 
-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
+
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False


earwigbot/wiki/copyvios/workers.py (+5, -3)

@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
        self.sources = []
        self.finished = False
        self.possible_miss = False
@@ -254,7 +255,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}
 
         if _is_globalized:
             self._queues = _global_queues
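
Taken together, the flag travels caller -> workspace -> per-source kwargs ->
worker -> parser. A condensed, runnable sketch of that plumbing, with the
classes stripped down to just the relevant lines (not the real API):

    class Source(object):
        # result.py: each source remembers whether detection applies to it.
        def __init__(self, url, detect_exclusions=False):
            self.url = url
            self.detect_exclusions = detect_exclusions

    class Workspace(object):
        # workers.py: the flag joins the kwargs shared by every new source.
        def __init__(self, detect_exclusions=False):
            self._source_args = {"detect_exclusions": detect_exclusions}
            self.sources = []

        def enqueue(self, urls):
            for url in urls:
                self.sources.append(Source(url, **self._source_args))

    workspace = Workspace(detect_exclusions=True)  # as in __init__.py
    workspace.enqueue(["http://example.com/page"])
    source = workspace.sources[0]
    assert source.detect_exclusions
    # ...which is what the worker forwards in workers.py:
    #     return handler(content).parse(source.detect_exclusions)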

