From d741667c4c1af7ad758ce98607dcbb3d08125939 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Sep 2015 21:43:43 -0500 Subject: [PATCH 1/7] Try using pentagrams rather than trigrams for copyvio Markov chains. --- CHANGELOG | 3 ++- earwigbot/wiki/copyvios/markov.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 68725f3..fec7e5e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,7 +15,8 @@ v0.2 (unreleased): - Added copyvio detector functionality: specifying a max time for checks; improved exclusion support. URL loading and parsing is parallelized to speed up check times, with a multi-threaded worker model that avoids concurrent - requests to the same domain. Fixed assorted bugs. + requests to the same domain. Improvements to the comparison algorithm. Fixed + assorted bugs. - Added support for Wikimedia Labs when creating a config file. - Added and improved lazy importing for various dependencies. - Fixed a bug in job scheduling. diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 491c875..057fcc1 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -30,7 +30,7 @@ class MarkovChain(object): """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 - degree = 3 # 2 for bigrams, 3 for trigrams, etc. + degree = 5 # 2 for bigrams, 3 for trigrams, etc. 
def __init__(self, text): self.text = text From 509598d7fcf684cffd5693e8f1a2f1e413ceaf02 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Sep 2015 23:57:31 -0500 Subject: [PATCH 2/7] Try merging in templates with parameter values of a certain size (fixes #42) --- earwigbot/wiki/copyvios/parsers.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index a676413..49bc4af 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -58,6 +58,21 @@ class _BaseTextParser(object): class ArticleTextParser(_BaseTextParser): """A parser that can strip and chunk wikicode article text.""" TYPE = "Article" + TEMPLATE_MERGE_THRESHOLD = 35 + + def _merge_templates(self, code): + """Merge template contents in to wikicode when the values are long.""" + for template in code.filter_templates(recursive=code.RECURSE_OTHERS): + chunks = [] + for param in template.params: + if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD: + self._merge_templates(param.value) + chunks.append(param.value) + if chunks: + subst = u" ".join(map(unicode, chunks)) + code.replace(template, u" " + subst + u" ") + else: + code.remove(template) def strip(self): """Clean the page's raw text by removing templates and formatting. @@ -94,6 +109,9 @@ class ArticleTextParser(_BaseTextParser): for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"): remove(wikicode, tag) + # Merge in template contents when the values are long: + self._merge_templates(code) + clean = wikicode.strip_code(normalize=True, collapse=True) self.clean = re.sub("\n\n+", "\n", clean).strip() return self.clean From e99e1c1ef171ff62cd64006e7d6034901627f04b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 00:03:40 -0500 Subject: [PATCH 3/7] Typo fix. 
--- earwigbot/wiki/copyvios/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 49bc4af..cafc746 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -110,7 +110,7 @@ class ArticleTextParser(_BaseTextParser): remove(wikicode, tag) # Merge in template contents when the values are long: - self._merge_templates(code) + self._merge_templates(wikicode) clean = wikicode.strip_code(normalize=True, collapse=True) self.clean = re.sub("\n\n+", "\n", clean).strip() From bb819c93065b77467e94c2da83cbb43ce92bcb6c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 02:26:32 -0500 Subject: [PATCH 4/7] Explicitly include excluded URLs in the result set; mark as excluded. --- earwigbot/wiki/copyvios/result.py | 10 ++++++++-- earwigbot/wiki/copyvios/workers.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index 85b5cc4..f044c03 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -40,6 +40,7 @@ class CopyvioSource(object): - :py:attr:`confidence`: the confidence of a violation, between 0 and 1 - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain - :py:attr:`skipped`: whether this URL was skipped during the check + - :py:attr:`excluded`: whether this URL was in the exclusions list """ def __init__(self, workspace, url, headers=None, timeout=5): @@ -50,6 +51,7 @@ class CopyvioSource(object): self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) self.skipped = False + self.excluded = False self._event1 = Event() self._event2 = Event() @@ -57,11 +59,15 @@ class CopyvioSource(object): def __repr__(self): """Return the canonical string representation of the source.""" - res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})" - return res.format(self.url, 
self.confidence, self.skipped) + res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, " + "excluded={3!r})") + return res.format( + self.url, self.confidence, self.skipped, self.excluded) def __str__(self): """Return a nice string representation of the source.""" + if self.excluded: + return "<CopyvioSource ({0}, excluded)>".format(self.url) if self.skipped: return "<CopyvioSource ({0}, skipped)>".format(self.url) res = "<CopyvioSource ({0} with {1} conf)>" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index e471651..5230a44 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -311,11 +311,15 @@ class CopyvioWorkspace(object): if url in self._handled_urls: continue self._handled_urls.add(url) - if exclude_check and exclude_check(url): - continue source = CopyvioSource(url=url, **self._source_args) self.sources.append(source) + + if exclude_check and exclude_check(url): + self._logger.debug(u"enqueue(): exclude {0}".format(url)) + source.excluded = True + source.skip() + continue if self._short_circuit and self.finished: self._logger.debug(u"enqueue(): auto-skip {0}".format(url)) source.skip() @@ -371,6 +375,8 @@ class CopyvioWorkspace(object): def cmpfunc(s1, s2): if s2.confidence != s1.confidence: return 1 if s2.confidence > s1.confidence else -1 + if s2.excluded != s1.excluded: + return 1 if s1.excluded else -1 return int(s1.skipped) - int(s2.skipped) self.sources.sort(cmpfunc) From 81a090c923db15ae76a11e6f2f8b759709a7cedc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 03:26:59 -0500 Subject: [PATCH 5/7] Allow content parsers to signal that a source should be excluded. 
--- earwigbot/exceptions.py | 21 ++++++++++++--------- earwigbot/wiki/copyvios/parsers.py | 1 + earwigbot/wiki/copyvios/workers.py | 13 ++++++++++--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py index 56bdfaa..ad34ae0 100644 --- a/earwigbot/exceptions.py +++ b/earwigbot/exceptions.py @@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot:: +-- UnknownSearchEngineError +-- UnsupportedSearchEngineError +-- SearchQueryError + +-- ParserExclusionError """ class EarwigBotError(Exception): @@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError): :py:attr:`config.wiki["search"]["engine"]`. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. """ class UnsupportedSearchEngineError(CopyvioCheckError): @@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError): couldn't be imported. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. """ class SearchQueryError(CopyvioCheckError): """Some error ocurred while doing a search query. Raised by :py:meth:`Page.copyvio_check - ` and - :py:meth:`Page.copyvio_compare - `. + `. + """ + +class ParserExclusionError(CopyvioCheckError): + """A content parser detected that the given source should be excluded. + + Raised internally by :py:meth:`Page.copyvio_check + `; should not be + exposed in client code. 
""" diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index cafc746..dbd103e 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -27,6 +27,7 @@ from StringIO import StringIO import mwparserfromhell from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError bs4 = importer.new("bs4") nltk = importer.new("nltk") diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 5230a44..4ba25bf 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,6 +34,7 @@ from time import time from urllib2 import build_opener, URLError from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -218,9 +219,15 @@ class _CopyvioWorker(object): except StopIteration: self._logger.debug("Exiting: got stop signal") return - text = self._open_url(source) - chain = MarkovChain(text) if text else None - source.workspace.compare(source, chain) + + try: + text = self._open_url(source) + except ParserExclusionError: + source.skipped = source.excluded = True + source.finish_work() + else: + chain = MarkovChain(text) if text else None + source.workspace.compare(source, chain) def start(self): """Start the copyvio worker in a new thread.""" From 03910b6cb5b5711f1dc040c0f24e92f269213bb9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:00:25 -0500 Subject: [PATCH 6/7] Add mirror detection logic to parsers; fixes. 
--- earwigbot/wiki/copyvios/__init__.py | 4 ++-- earwigbot/wiki/copyvios/exclusions.py | 4 +++- earwigbot/wiki/copyvios/parsers.py | 22 ++++++++++++++++------ earwigbot/wiki/copyvios/result.py | 5 ++++- earwigbot/wiki/copyvios/workers.py | 8 +++++--- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46fbf96..74dc0eb 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -118,7 +118,7 @@ class CopyvioMixIn(object): article = MarkovChain(parser.strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit) + short_circuit=short_circuit, detect_exclusions=True) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) @@ -176,7 +176,7 @@ class CopyvioMixIn(object): article = MarkovChain(ArticleTextParser(self.get()).strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, 1) + max_time, num_workers=1) workspace.enqueue([url]) workspace.wait() result = workspace.get_result() diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3c88011..33bb5f8 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -28,7 +28,7 @@ from urlparse import urlparse from earwigbot import exceptions -__all__ = ["ExclusionsDB"] +__all__ = ["ExclusionsDB", "MIRROR_HINTS"] DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki @@ -43,6 +43,8 @@ DEFAULT_SOURCES = { ] } +MIRROR_HINTS = ["wikipedia.org/w/"] + class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions Database Manager** diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index dbd103e..2f9a4a1 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ 
b/earwigbot/wiki/copyvios/parsers.py @@ -28,6 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError +from earwigbot.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self): + def parse(self, detect_exclusions=False): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup <bs4>` (http://www.crummy.com/software/BeautifulSoup/). """ try: - soup = bs4.BeautifulSoup(self.text, "lxml").body + soup = bs4.BeautifulSoup(self.text, "lxml") except ValueError: - soup = bs4.BeautifulSoup(self.text).body + soup = bs4.BeautifulSoup(self.text) - if not soup: + if not soup.body: # No <body> tag present in HTML -> # no scrapable content (possibly JS or magic): return "" + + if detect_exclusions: + # Look for obvious signs that this is a mirror: + func = lambda attr: attr and any( + hint in attr for hint in MIRROR_HINTS) + if soup.find_all(href=func) or soup.find_all(src=func): + raise ParserExclusionError() + + soup = soup.body is_comment = lambda text: isinstance(text, bs4.element.Comment) for comment in soup.find_all(text=is_comment): comment.extract() @@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self): + def parse(self, detect_exclusions=False): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self): + def parse(self, detect_exclusions=False): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py 
index f044c03..5a221ca 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -43,11 +43,14 @@ class CopyvioSource(object): - :py:attr:`excluded`: whether this URL was in the exclusions list """ - def __init__(self, workspace, url, headers=None, timeout=5): + def __init__(self, workspace, url, headers=None, timeout=5, + detect_exclusions=False): self.workspace = workspace self.url = url self.headers = headers self.timeout = timeout + self.detect_exclusions = detect_exclusions + self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) self.skipped = False diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 4ba25bf..f35f484 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,7 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse() + return handler(content).parse(source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue.""" @@ -240,7 +240,8 @@ class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" def __init__(self, article, min_confidence, max_time, logger, headers, - url_timeout=5, num_workers=8, short_circuit=True): + url_timeout=5, num_workers=8, short_circuit=True, + detect_exclusions=False): self.sources = [] self.finished = False self.possible_miss = False @@ -254,7 +255,8 @@ class CopyvioWorkspace(object): self._finish_lock = Lock() self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, - "timeout": url_timeout} + "timeout": url_timeout, + "detect_exclusions": detect_exclusions} if _is_globalized: self._queues = _global_queues From 147b46f572bef94547ba8f33954026de69592495 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:03:39 -0500 Subject: [PATCH 7/7] A couple more fixes and cleanup. 
--- earwigbot/wiki/copyvios/parsers.py | 10 +++++----- earwigbot/wiki/copyvios/workers.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2f9a4a1..502bd4d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -28,7 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError -from earwigbot.copyvios.exclusions import MIRROR_HINTS +from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup ` @@ -203,7 +203,7 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if detect_exclusions: + if kwargs["detect_exclusions"]: # Look for obvious signs that this is a mirror: func = lambda attr: attr and any( hint in attr for hint in MIRROR_HINTS) @@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index f35f484..e03765e 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ 
b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,8 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse(source.detect_exclusions) + return handler(content).parse( + detect_exclusions=source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue."""