From d741667c4c1af7ad758ce98607dcbb3d08125939 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Sep 2015 21:43:43 -0500 Subject: [PATCH 1/7] Try using pentagrams rather than trigrams for copyvio Markov chains. --- CHANGELOG | 3 ++- earwigbot/wiki/copyvios/markov.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 68725f3..fec7e5e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,7 +15,8 @@ v0.2 (unreleased): - Added copyvio detector functionality: specifying a max time for checks; improved exclusion support. URL loading and parsing is parallelized to speed up check times, with a multi-threaded worker model that avoids concurrent - requests to the same domain. Fixed assorted bugs. + requests to the same domain. Improvements to the comparison algorithm. Fixed + assorted bugs. - Added support for Wikimedia Labs when creating a config file. - Added and improved lazy importing for various dependencies. - Fixed a bug in job scheduling. diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 491c875..057fcc1 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -30,7 +30,7 @@ class MarkovChain(object): """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 - degree = 3 # 2 for bigrams, 3 for trigrams, etc. + degree = 5 # 2 for bigrams, 3 for trigrams, etc. 
def __init__(self, text): self.text = text From 509598d7fcf684cffd5693e8f1a2f1e413ceaf02 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Sep 2015 23:57:31 -0500 Subject: [PATCH 2/7] Try merging in templates with parameter values of a certain size (fixes #42) --- earwigbot/wiki/copyvios/parsers.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index a676413..49bc4af 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -58,6 +58,21 @@ class _BaseTextParser(object): class ArticleTextParser(_BaseTextParser): """A parser that can strip and chunk wikicode article text.""" TYPE = "Article" + TEMPLATE_MERGE_THRESHOLD = 35 + + def _merge_templates(self, code): + """Merge template contents in to wikicode when the values are long.""" + for template in code.filter_templates(recursive=code.RECURSE_OTHERS): + chunks = [] + for param in template.params: + if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD: + self._merge_templates(param.value) + chunks.append(param.value) + if chunks: + subst = u" ".join(map(unicode, chunks)) + code.replace(template, u" " + subst + u" ") + else: + code.remove(template) def strip(self): """Clean the page's raw text by removing templates and formatting. @@ -94,6 +109,9 @@ class ArticleTextParser(_BaseTextParser): for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"): remove(wikicode, tag) + # Merge in template contents when the values are long: + self._merge_templates(code) + clean = wikicode.strip_code(normalize=True, collapse=True) self.clean = re.sub("\n\n+", "\n", clean).strip() return self.clean From e99e1c1ef171ff62cd64006e7d6034901627f04b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 00:03:40 -0500 Subject: [PATCH 3/7] Typo fix. 
--- earwigbot/wiki/copyvios/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 49bc4af..cafc746 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -110,7 +110,7 @@ class ArticleTextParser(_BaseTextParser): remove(wikicode, tag) # Merge in template contents when the values are long: - self._merge_templates(code) + self._merge_templates(wikicode) clean = wikicode.strip_code(normalize=True, collapse=True) self.clean = re.sub("\n\n+", "\n", clean).strip() From bb819c93065b77467e94c2da83cbb43ce92bcb6c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 02:26:32 -0500 Subject: [PATCH 4/7] Explicitly include excluded URLs in the result set; mark as excluded. --- earwigbot/wiki/copyvios/result.py | 10 ++++++++-- earwigbot/wiki/copyvios/workers.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index 85b5cc4..f044c03 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -40,6 +40,7 @@ class CopyvioSource(object): - :py:attr:`confidence`: the confidence of a violation, between 0 and 1 - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain - :py:attr:`skipped`: whether this URL was skipped during the check + - :py:attr:`excluded`: whether this URL was in the exclusions list """ def __init__(self, workspace, url, headers=None, timeout=5): @@ -50,6 +51,7 @@ class CopyvioSource(object): self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) self.skipped = False + self.excluded = False self._event1 = Event() self._event2 = Event() @@ -57,11 +59,15 @@ class CopyvioSource(object): def __repr__(self): """Return the canonical string representation of the source.""" - res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})" - return res.format(self.url, 
self.confidence, self.skipped) + res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, " + "excluded={3!r})") + return res.format( + self.url, self.confidence, self.skipped, self.excluded) def __str__(self): """Return a nice string representation of the source.""" + if self.excluded: + return "<CopyvioSource ({0}, excluded)>".format(self.url) if self.skipped: return "<CopyvioSource ({0}, skipped)>".format(self.url) res = "<CopyvioSource ({0} with {1} conf)>" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index e471651..5230a44 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -311,11 +311,15 @@ class CopyvioWorkspace(object): if url in self._handled_urls: continue self._handled_urls.add(url) - if exclude_check and exclude_check(url): - continue source = CopyvioSource(url=url, **self._source_args) self.sources.append(source) + + if exclude_check and exclude_check(url): + self._logger.debug(u"enqueue(): exclude {0}".format(url)) + source.excluded = True + source.skip() + continue if self._short_circuit and self.finished: self._logger.debug(u"enqueue(): auto-skip {0}".format(url)) source.skip() @@ -371,6 +375,8 @@ class CopyvioWorkspace(object): def cmpfunc(s1, s2): if s2.confidence != s1.confidence: return 1 if s2.confidence > s1.confidence else -1 + if s2.excluded != s1.excluded: + return 1 if s1.excluded else -1 return int(s1.skipped) - int(s2.skipped) self.sources.sort(cmpfunc) From 81a090c923db15ae76a11e6f2f8b759709a7cedc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 03:26:59 -0500 Subject: [PATCH 5/7] Allow content parsers to signal that a source should be excluded. 
--- earwigbot/exceptions.py | 21 ++++++++++++--------- earwigbot/wiki/copyvios/parsers.py | 1 + earwigbot/wiki/copyvios/workers.py | 13 ++++++++++--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py index 56bdfaa..ad34ae0 100644 --- a/earwigbot/exceptions.py +++ b/earwigbot/exceptions.py @@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot:: +-- UnknownSearchEngineError +-- UnsupportedSearchEngineError +-- SearchQueryError + +-- ParserExclusionError """ class EarwigBotError(Exception): @@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError): :py:attr:`config.wiki["search"]["engine"]`. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. """ class UnsupportedSearchEngineError(CopyvioCheckError): @@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError): couldn't be imported. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. """ class SearchQueryError(CopyvioCheckError): """Some error ocurred while doing a search query. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. + """ + +class ParserExclusionError(CopyvioCheckError): + """A content parser detected that the given source should be excluded. + + Raised internally by :py:meth:`Page.copyvio_check + `; should not be + exposed in client code. 
""" diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index cafc746..dbd103e 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -27,6 +27,7 @@ from StringIO import StringIO import mwparserfromhell from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError bs4 = importer.new("bs4") nltk = importer.new("nltk") diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 5230a44..4ba25bf 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,6 +34,7 @@ from time import time from urllib2 import build_opener, URLError from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -218,9 +219,15 @@ class _CopyvioWorker(object): except StopIteration: self._logger.debug("Exiting: got stop signal") return - text = self._open_url(source) - chain = MarkovChain(text) if text else None - source.workspace.compare(source, chain) + + try: + text = self._open_url(source) + except ParserExclusionError: + source.skipped = source.excluded = True + source.finish_work() + else: + chain = MarkovChain(text) if text else None + source.workspace.compare(source, chain) def start(self): """Start the copyvio worker in a new thread.""" From 03910b6cb5b5711f1dc040c0f24e92f269213bb9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:00:25 -0500 Subject: [PATCH 6/7] Add mirror detection logic to parsers; fixes. 
--- earwigbot/wiki/copyvios/__init__.py | 4 ++-- earwigbot/wiki/copyvios/exclusions.py | 4 +++- earwigbot/wiki/copyvios/parsers.py | 22 ++++++++++++++++------ earwigbot/wiki/copyvios/result.py | 5 ++++- earwigbot/wiki/copyvios/workers.py | 8 +++++--- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46fbf96..74dc0eb 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -118,7 +118,7 @@ class CopyvioMixIn(object): article = MarkovChain(parser.strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit) + short_circuit=short_circuit, detect_exclusions=True) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) @@ -176,7 +176,7 @@ class CopyvioMixIn(object): article = MarkovChain(ArticleTextParser(self.get()).strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, 1) + max_time, num_workers=1) workspace.enqueue([url]) workspace.wait() result = workspace.get_result() diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3c88011..33bb5f8 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -28,7 +28,7 @@ from urlparse import urlparse from earwigbot import exceptions -__all__ = ["ExclusionsDB"] +__all__ = ["ExclusionsDB", "MIRROR_HINTS"] DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki @@ -43,6 +43,8 @@ DEFAULT_SOURCES = { ] } +MIRROR_HINTS = ["wikipedia.org/w/"] + class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions Database Manager** diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index dbd103e..2f9a4a1 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ 
b/earwigbot/wiki/copyvios/parsers.py @@ -28,6 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError +from earwigbot.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self): + def parse(self, detect_exclusions=False): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup <bs4>` (http://www.crummy.com/software/BeautifulSoup/). """ try: - soup = bs4.BeautifulSoup(self.text, "lxml").body + soup = bs4.BeautifulSoup(self.text, "lxml") except ValueError: - soup = bs4.BeautifulSoup(self.text).body + soup = bs4.BeautifulSoup(self.text) - if not soup: + if not soup.body: # No <body> tag present in HTML -> # no scrapable content (possibly JS or magic): return "" + + if detect_exclusions: + # Look for obvious signs that this is a mirror: + func = lambda attr: attr and any( + hint in attr for hint in MIRROR_HINTS) + if soup.find_all(href=func) or soup.find_all(src=func): + raise ParserExclusionError() + + soup = soup.body is_comment = lambda text: isinstance(text, bs4.element.Comment) for comment in soup.find_all(text=is_comment): comment.extract() @@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self): + def parse(self, detect_exclusions=False): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self): + def parse(self, detect_exclusions=False): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py 
index f044c03..5a221ca 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -43,11 +43,14 @@ class CopyvioSource(object): - :py:attr:`excluded`: whether this URL was in the exclusions list """ - def __init__(self, workspace, url, headers=None, timeout=5): + def __init__(self, workspace, url, headers=None, timeout=5, + detect_exclusions=False): self.workspace = workspace self.url = url self.headers = headers self.timeout = timeout + self.detect_exclusions = detect_exclusions + self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) self.skipped = False diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 4ba25bf..f35f484 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,7 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse() + return handler(content).parse(source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue.""" @@ -240,7 +240,8 @@ class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" def __init__(self, article, min_confidence, max_time, logger, headers, - url_timeout=5, num_workers=8, short_circuit=True): + url_timeout=5, num_workers=8, short_circuit=True, + detect_exclusions=False): self.sources = [] self.finished = False self.possible_miss = False @@ -254,7 +255,8 @@ class CopyvioWorkspace(object): self._finish_lock = Lock() self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, - "timeout": url_timeout} + "timeout": url_timeout, + "detect_exclusions": detect_exclusions} if _is_globalized: self._queues = _global_queues From 147b46f572bef94547ba8f33954026de69592495 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:03:39 -0500 Subject: [PATCH 7/7] A couple more fixes and cleanup. 
--- earwigbot/wiki/copyvios/parsers.py | 10 +++++----- earwigbot/wiki/copyvios/workers.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2f9a4a1..502bd4d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -28,7 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError -from earwigbot.copyvios.exclusions import MIRROR_HINTS +from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup ` @@ -203,7 +203,7 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if detect_exclusions: + if kwargs["detect_exclusions"]: # Look for obvious signs that this is a mirror: func = lambda attr: attr and any( hint in attr for hint in MIRROR_HINTS) @@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index f35f484..e03765e 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ 
b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,8 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse(source.detect_exclusions) + return handler(content).parse( + detect_exclusions=source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue."""