Refactor out mirror hinting logic in source parsers.

9 years ago · 91846ce4fb
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
        article = MarkovChain(parser.strip())
        workspace = CopyvioWorkspace(
            article, min_confidence, max_time, self._logger, self._addheaders,
            short_circuit=short_circuit, detect_exclusions=True)
            short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
        if self._exclusions_db:
            self._exclusions_db.sync(self.site.name)
            exclude = lambda u: self._exclusions_db.check(self.site.name, u)
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -28,7 +28,7 @@ from urlparse import urlparse

 from earwigbot import exceptions

 __all__ = ["ExclusionsDB", "MIRROR_HINTS"]
 __all__ = ["ExclusionsDB"]

 DEFAULT_SOURCES = {
    "all": [  # Applies to all, but located on enwiki
@@ -43,8 +43,6 @@ DEFAULT_SOURCES = {
    ]
 }

 MIRROR_HINTS = ["wikipedia.org/w/"]

 class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**
--- a/earwigbot/wiki/copyvios/markov.py
+++ b/earwigbot/wiki/copyvios/markov.py
@@ -35,7 +35,7 @@ class MarkovChain(object):
    def __init__(self, text):
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()

        padding = self.degree - 1
        words = ([self.START] * padding) + words + ([self.END] * padding)
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -28,7 +28,6 @@ import mwparserfromhell

 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
 from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -44,8 +43,9 @@ class _BaseTextParser(object):
    """Base class for a parser that handles text."""
    TYPE = None

    def __init__(self, text):
    def __init__(self, text, args=None):
        self.text = text
        self._args = args or {}

    def __repr__(self):
        """Return the canonical string representation of the text parser."""
@@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser):
        "script", "style"
    ]

    def parse(self, **kwargs):
    def parse(self):
        """Return the actual text contained within an HTML document.

        Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -203,10 +203,10 @@ class _HTMLParser(_BaseTextParser):
            # no scrapable content (possibly JS or <frame> magic):
            return ""

        if kwargs["detect_exclusions"]:
        if "mirror_hints" in self._args:
            # Look for obvious signs that this is a mirror:
            func = lambda attr: attr and any(
                hint in attr for hint in MIRROR_HINTS)
                hint in attr for hint in self._args["mirror_hints"])
            if soup.find_all(href=func) or soup.find_all(src=func):
                raise ParserExclusionError()

@@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser):
        (u"\u2022", u" "),
    ]

    def parse(self, **kwargs):
    def parse(self):
        """Return extracted text from the PDF."""
        output = StringIO()
        manager = pdfinterp.PDFResourceManager()
@@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
    """A parser that can unicode-ify and strip text from a plain text page."""
    TYPE = "Text"

    def parse(self, **kwargs):
    def parse(self):
        """Unicode-ify and strip whitespace from the plain text document."""
        converted = bs4.UnicodeDammit(self.text).unicode_markup
        return converted.strip() if converted else ""
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -44,12 +44,12 @@ class CopyvioSource(object):
    """

    def __init__(self, workspace, url, headers=None, timeout=5,
                 detect_exclusions=False):
                 parser_args=None):
        self.workspace = workspace
        self.url = url
        self.headers = headers
        self.timeout = timeout
        self.detect_exclusions = detect_exclusions
        self.parser_args = parser_args

        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -156,8 +156,7 @@ class _CopyvioWorker(object):
            except (IOError, struct_error):
                return None

        return handler(content).parse(
            detect_exclusions=source.detect_exclusions)
        return handler(content, source.parser_args).parse()

    def _acquire_new_site(self):
        """Block for a new unassigned site queue."""
@@ -242,7 +241,7 @@ class CopyvioWorkspace(object):

    def __init__(self, article, min_confidence, max_time, logger, headers,
                 url_timeout=5, num_workers=8, short_circuit=True,
                 detect_exclusions=False):
                 parser_args=None):
        self.sources = []
        self.finished = False
        self.possible_miss = False
@@ -255,9 +254,9 @@ class CopyvioWorkspace(object):
        self._handled_urls = set()
        self._finish_lock = Lock()
        self._short_circuit = short_circuit
        self._source_args = {"workspace": self, "headers": headers,
                             "timeout": url_timeout,
                             "detect_exclusions": detect_exclusions}
        self._source_args = {
            "workspace": self, "headers": headers, "timeout": url_timeout,
            "parser_args": parser_args}

        if _is_globalized:
            self._queues = _global_queues