diff --git a/CHANGELOG b/CHANGELOG
index 68725f3..fec7e5e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -15,7 +15,8 @@ v0.2 (unreleased):
 - Added copyvio detector functionality: specifying a max time for checks;
   improved exclusion support. URL loading and parsing is parallelized to speed
   up check times, with a multi-threaded worker model that avoids concurrent
-  requests to the same domain. Fixed assorted bugs.
+  requests to the same domain. Improvements to the comparison algorithm. Fixed
+  assorted bugs.
 - Added support for Wikimedia Labs when creating a config file.
 - Added and improved lazy importing for various dependencies.
 - Fixed a bug in job scheduling.
diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py
index 56bdfaa..ad34ae0 100644
--- a/earwigbot/exceptions.py
+++ b/earwigbot/exceptions.py
@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot::
               +-- UnknownSearchEngineError
               +-- UnsupportedSearchEngineError
               +-- SearchQueryError
+              +-- ParserExclusionError
 """

 class EarwigBotError(Exception):
@@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError):
     :py:attr:`config.wiki["search"]["engine"]`.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.page.Page.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.page.Page.copyvio_compare>`.
+    <earwigbot.wiki.page.Page.copyvio_check>`.
     """

 class UnsupportedSearchEngineError(CopyvioCheckError):
@@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError):
     """Attempting to use an unsupported engine to perform a search query,
     e.g. if the engine package couldn't be imported.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.page.Page.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.page.Page.copyvio_compare>`.
+    <earwigbot.wiki.page.Page.copyvio_check>`.
     """

 class SearchQueryError(CopyvioCheckError):
     """Some error occurred while doing a search query.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.page.Page.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.page.Page.copyvio_compare>`.
+    <earwigbot.wiki.page.Page.copyvio_check>`.
+    """
+
+class ParserExclusionError(CopyvioCheckError):
+    """A content parser detected that the given source should be excluded.
+
+    Raised internally by :py:meth:`Page.copyvio_check
+    <earwigbot.wiki.page.Page.copyvio_check>`; should not be
+    exposed in client code.
     """
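The new exception is purely internal: the worker thread catches ParserExclusionError and flags the offending source instead of letting it propagate (see the workers.py hunk further below), so callers of copyvio_check only ever observe it as an excluded source. A hypothetical client-side sketch; the config directory and page title here are placeholders, not part of this change:

    # Hypothetical client code; assumes a configured bot in ".earwigbot".
    from earwigbot import bot

    b = bot.Bot(".earwigbot")
    site = b.wiki.get_site()
    page = site.get_page("Example")

    result = page.copyvio_check(max_time=30)
    for source in result.sources:
        # Auto-detected mirrors come back as excluded (and skipped) sources;
        # ParserExclusionError itself never reaches this code:
        print source.url, source.confidence, source.excluded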
""" diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46fbf96..74dc0eb 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -118,7 +118,7 @@ class CopyvioMixIn(object): article = MarkovChain(parser.strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit) + short_circuit=short_circuit, detect_exclusions=True) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) @@ -176,7 +176,7 @@ class CopyvioMixIn(object): article = MarkovChain(ArticleTextParser(self.get()).strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, 1) + max_time, num_workers=1) workspace.enqueue([url]) workspace.wait() result = workspace.get_result() diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3c88011..33bb5f8 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -28,7 +28,7 @@ from urlparse import urlparse from earwigbot import exceptions -__all__ = ["ExclusionsDB"] +__all__ = ["ExclusionsDB", "MIRROR_HINTS"] DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki @@ -43,6 +43,8 @@ DEFAULT_SOURCES = { ] } +MIRROR_HINTS = ["wikipedia.org/w/"] + class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions Database Manager** diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 491c875..057fcc1 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -30,7 +30,7 @@ class MarkovChain(object): """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 - degree = 3 # 2 for bigrams, 3 for trigrams, etc. + degree = 5 # 2 for bigrams, 3 for trigrams, etc. def __init__(self, text): self.text = text diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index a676413..502bd4d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -27,6 +27,8 @@ from StringIO import StringIO import mwparserfromhell from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError +from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -58,6 +60,21 @@ class _BaseTextParser(object): class ArticleTextParser(_BaseTextParser): """A parser that can strip and chunk wikicode article text.""" TYPE = "Article" + TEMPLATE_MERGE_THRESHOLD = 35 + + def _merge_templates(self, code): + """Merge template contents in to wikicode when the values are long.""" + for template in code.filter_templates(recursive=code.RECURSE_OTHERS): + chunks = [] + for param in template.params: + if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD: + self._merge_templates(param.value) + chunks.append(param.value) + if chunks: + subst = u" ".join(map(unicode, chunks)) + code.replace(template, u" " + subst + u" ") + else: + code.remove(template) def strip(self): """Clean the page's raw text by removing templates and formatting. 
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index a676413..502bd4d 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -27,6 +27,8 @@ from StringIO import StringIO
 import mwparserfromhell

 from earwigbot import importer
+from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -58,6 +60,21 @@ class _BaseTextParser(object):
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
+    TEMPLATE_MERGE_THRESHOLD = 35
+
+    def _merge_templates(self, code):
+        """Merge template contents into wikicode when the values are long."""
+        for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
+            chunks = []
+            for param in template.params:
+                if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
+                    self._merge_templates(param.value)
+                    chunks.append(param.value)
+            if chunks:
+                subst = u" ".join(map(unicode, chunks))
+                code.replace(template, u" " + subst + u" ")
+            else:
+                code.remove(template)

     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
@@ -94,6 +111,9 @@ class ArticleTextParser(_BaseTextParser):
         for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
             remove(wikicode, tag)

+        # Merge in template contents when the values are long:
+        self._merge_templates(wikicode)
+
         clean = wikicode.strip_code(normalize=True, collapse=True)
         self.clean = re.sub("\n\n+", "\n", clean).strip()
         return self.clean
@@ -167,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return the actual text contained within an HTML document.

         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)

-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""
+
+        if kwargs["detect_exclusions"]:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
@@ -200,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -226,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"

-    def parse(self):
+    def parse(self, **kwargs):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index 85b5cc4..5a221ca 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -40,16 +40,21 @@ class CopyvioSource(object):
     - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
     - :py:attr:`chains`:     a 2-tuple of the source chain and the delta chain
     - :py:attr:`skipped`:    whether this URL was skipped during the check
+    - :py:attr:`excluded`:   whether this URL was in the exclusions list
     """

-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
+
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
+        self.excluded = False

         self._event1 = Event()
         self._event2 = Event()
@@ -57,11 +62,15 @@ class CopyvioSource(object):

     def __repr__(self):
         """Return the canonical string representation of the source."""
-        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
-        return res.format(self.url, self.confidence, self.skipped)
+        res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
+               "excluded={3!r})")
+        return res.format(
+            self.url, self.confidence, self.skipped, self.excluded)

     def __str__(self):
         """Return a nice string representation of the source."""
+        if self.excluded:
+            return "<CopyvioSource ({0}, excluded)>".format(self.url)
         if self.skipped:
             return "<CopyvioSource ({0}, skipped)>".format(self.url)
         res = "<CopyvioSource ({0} with {1} conf)>"
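Note that excluded does not replace skipped: every excluded source is also marked skipped so that no comparison runs against it, but a source can be skipped without being excluded (e.g. under short-circuiting). A small illustration, relying on the string forms shown above; real sources come from a CopyvioWorkspace, so workspace=None is a stand-in:

    # Illustration only; CopyvioSource is normally built by the workspace.
    from earwigbot.wiki.copyvios.result import CopyvioSource

    source = CopyvioSource(workspace=None, url="http://example.com/")
    source.skipped = source.excluded = True  # what the worker does for mirrors
    print str(source)  # <CopyvioSource (http://example.com/, excluded)>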
"" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index e471651..e03765e 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,6 +34,7 @@ from time import time from urllib2 import build_opener, URLError from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -155,7 +156,8 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse() + return handler(content).parse( + detect_exclusions=source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue.""" @@ -218,9 +220,15 @@ class _CopyvioWorker(object): except StopIteration: self._logger.debug("Exiting: got stop signal") return - text = self._open_url(source) - chain = MarkovChain(text) if text else None - source.workspace.compare(source, chain) + + try: + text = self._open_url(source) + except ParserExclusionError: + source.skipped = source.excluded = True + source.finish_work() + else: + chain = MarkovChain(text) if text else None + source.workspace.compare(source, chain) def start(self): """Start the copyvio worker in a new thread.""" @@ -233,7 +241,8 @@ class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" def __init__(self, article, min_confidence, max_time, logger, headers, - url_timeout=5, num_workers=8, short_circuit=True): + url_timeout=5, num_workers=8, short_circuit=True, + detect_exclusions=False): self.sources = [] self.finished = False self.possible_miss = False @@ -247,7 +256,8 @@ class CopyvioWorkspace(object): self._finish_lock = Lock() self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, - "timeout": url_timeout} + "timeout": url_timeout, + "detect_exclusions": detect_exclusions} if _is_globalized: self._queues = _global_queues @@ -311,11 +321,15 @@ class CopyvioWorkspace(object): if url in self._handled_urls: continue self._handled_urls.add(url) - if exclude_check and exclude_check(url): - continue source = CopyvioSource(url=url, **self._source_args) self.sources.append(source) + + if exclude_check and exclude_check(url): + self._logger.debug(u"enqueue(): exclude {0}".format(url)) + source.excluded = True + source.skip() + continue if self._short_circuit and self.finished: self._logger.debug(u"enqueue(): auto-skip {0}".format(url)) source.skip() @@ -371,6 +385,8 @@ class CopyvioWorkspace(object): def cmpfunc(s1, s2): if s2.confidence != s1.confidence: return 1 if s2.confidence > s1.confidence else -1 + if s2.excluded != s1.excluded: + return 1 if s1.excluded else -1 return int(s1.skipped) - int(s2.skipped) self.sources.sort(cmpfunc)