From 03910b6cb5b5711f1dc040c0f24e92f269213bb9 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Tue, 29 Sep 2015 04:00:25 -0500
Subject: [PATCH] Add mirror detection logic to parsers; fixes.

---
 earwigbot/wiki/copyvios/__init__.py   |  4 ++--
 earwigbot/wiki/copyvios/exclusions.py |  4 +++-
 earwigbot/wiki/copyvios/parsers.py    | 22 ++++++++++++++++------
 earwigbot/wiki/copyvios/result.py     |  5 ++++-
 earwigbot/wiki/copyvios/workers.py    |  8 +++++---
 5 files changed, 30 insertions(+), 13 deletions(-)
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 46fbf96..74dc0eb 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index 3c88011..33bb5f8 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -28,7 +28,7 @@ from urlparse import urlparse
 
 from earwigbot import exceptions
 
-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
 
 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }
 
+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index dbd103e..2f9a4a1 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -28,6 +28,7 @@ import mwparserfromhell
 
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)
 
-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""
+
+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror; find() stops at
+            # the first matching tag, so it avoids building full result lists:
+            func = lambda attr: attr and any(h in attr for h in MIRROR_HINTS)
+            if soup.find(href=func) or soup.find(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
@@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index f044c03..5a221ca 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`:   whether this URL was in the exclusions list
     """
 
-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
+
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index 4ba25bf..f35f484 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
             except (IOError, struct_error):
                 return None
 
-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -254,7 +255,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}
 
         if _is_globalized:
             self._queues = _global_queues