@@ -15,7 +15,8 @@ v0.2 (unreleased):
- Added copyvio detector functionality: specifying a max time for checks;
improved exclusion support. URL loading and parsing is parallelized to speed
up check times, with a multi-threaded worker model that avoids concurrent
requests to the same domain. Fixed assorted bugs.
requests to the same domain. Improvements to the comparison algorithm. Fixed
assorted bugs.
- Added support for Wikimedia Labs when creating a config file.
- Added and improved lazy importing for various dependencies.
- Fixed a bug in job scheduling.
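
The per-domain worker model mentioned in the copyvio entry above can be pictured with a small, purely illustrative sketch; group_by_domain is hypothetical and not part of the bot, whose real queueing lives in the workers.py hunks further down::

    from collections import defaultdict
    from urlparse import urlparse  # Python 2, matching the codebase

    def group_by_domain(urls):
        """Bucket URLs by host so each bucket is drained serially by one worker."""
        queues = defaultdict(list)
        for url in urls:
            queues[urlparse(url).netloc].append(url)
        return queues

Draining each bucket from a single thread is what keeps two requests from hitting the same domain at once.
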
@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot::
+-- UnknownSearchEngineError
+-- UnsupportedSearchEngineError
+-- SearchQueryError
+-- ParserExclusionError
"""
class EarwigBotError(Exception):
@@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError):
:py:attr:`config.wiki["search"]["engine"]`.
Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""
class UnsupportedSearchEngineError(CopyvioCheckError):
@@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError):
couldn't be imported.
Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""
class SearchQueryError(CopyvioCheckError):
"""Some error ocurred while doing a search query. | |||
Raised by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
:py:meth:`Page.copyvio_compare
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
"""
class ParserExclusionError(CopyvioCheckError):
"""A content parser detected that the given source should be excluded.
Raised internally by :py:meth:`Page.copyvio_check
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
exposed in client code.
"""
@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit)
short_circuit=short_circuit, detect_exclusions=True)
if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
article = MarkovChain(ArticleTextParser(self.get()).strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, 1)
max_time, num_workers=1)
workspace.enqueue([url])
workspace.wait()
result = workspace.get_result()
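
For the single-URL path changed above, a hedged sketch of client-side use, reusing the page object from the earlier example (the URL and keyword values are illustrative)::

    result = page.copyvio_compare("http://example.com/suspected-mirror",
                                  min_confidence=0.75, max_time=30)
    if result.violation:
        print("%.0f%% confidence that %s was copied" % (result.confidence * 100,
                                                        result.url))
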
@@ -28,7 +28,7 @@ from urlparse import urlparse
from earwigbot import exceptions
__all__ = ["ExclusionsDB"]
__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
]
}
MIRROR_HINTS = ["wikipedia.org/w/"]
class ExclusionsDB(object):
"""
**EarwigBot: Wiki Toolset: Exclusions Database Manager**
@@ -30,7 +30,7 @@ class MarkovChain(object):
"""Implements a basic ngram Markov chain of words."""
START = -1
END = -2
degree = 3 # 2 for bigrams, 3 for trigrams, etc.
degree = 5 # 2 for bigrams, 3 for trigrams, etc.
def __init__(self, text):
self.text = text
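
To see what the degree bump from 3 to 5 means in practice, here is an illustrative helper (not the MarkovChain API) that splits a token list into overlapping n-grams::

    def ngrams(words, degree):
        return [tuple(words[i:i + degree])
                for i in range(len(words) - degree + 1)]

    words = "the quick brown fox jumps over the lazy dog".split()
    len(ngrams(words, 3))  # 7 trigrams: short, easy to match by coincidence
    len(ngrams(words, 5))  # 5 five-grams: a match is much stronger evidence

Longer chains trade a little recall for noticeably fewer accidental matches between unrelated texts.
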
@@ -27,6 +27,8 @@ from StringIO import StringIO
import mwparserfromhell
from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError
from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS
bs4 = importer.new("bs4")
nltk = importer.new("nltk")
@@ -58,6 +60,21 @@ class _BaseTextParser(object):
class ArticleTextParser(_BaseTextParser):
"""A parser that can strip and chunk wikicode article text."""
TYPE = "Article"
TEMPLATE_MERGE_THRESHOLD = 35
def _merge_templates(self, code):
"""Merge template contents in to wikicode when the values are long.""" | |||
for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
chunks = []
for param in template.params:
if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
self._merge_templates(param.value)
chunks.append(param.value)
if chunks:
subst = u" ".join(map(unicode, chunks))
code.replace(template, u" " + subst + u" ")
else:
code.remove(template)
def strip(self):
"""Clean the page's raw text by removing templates and formatting.
@@ -94,6 +111,9 @@ class ArticleTextParser(_BaseTextParser):
for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
remove(wikicode, tag)
# Merge in template contents when the values are long:
self._merge_templates(wikicode)
clean = wikicode.strip_code(normalize=True, collapse=True)
self.clean = re.sub("\n\n+", "\n", clean).strip()
return self.clean
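
A condensed sketch of what the merge step is for (the wikitext is invented, and unlike the real method this version does not recurse into nested templates): without it, strip_code() would discard the whole template, long prose values included::

    import mwparserfromhell

    THRESHOLD = 35  # mirrors TEMPLATE_MERGE_THRESHOLD above

    text = ("{{Infobox park|name=Foo Park|description=A long stretch of "
            "riverside parkland noted for its historic carousel.}}")
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        keep = [unicode(p.value) for p in template.params
                if len(unicode(p.value)) >= THRESHOLD]
        if keep:
            code.replace(template, u" " + u" ".join(keep) + u" ")
        else:
            code.remove(template)
    print(code.strip_code())  # the long description survives stripping
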
@@ -167,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
"script", "style"
]
def parse(self):
def parse(self, **kwargs):
"""Return the actual text contained within an HTML document.
Implemented using :py:mod:`BeautifulSoup <bs4>`
(http://www.crummy.com/software/BeautifulSoup/).
"""
try:
soup = bs4.BeautifulSoup(self.text, "lxml").body
soup = bs4.BeautifulSoup(self.text, "lxml")
except ValueError:
soup = bs4.BeautifulSoup(self.text).body
soup = bs4.BeautifulSoup(self.text)
if not soup:
if not soup.body:
# No <body> tag present in HTML ->
# no scrapable content (possibly JS or <frame> magic):
return ""
if kwargs["detect_exclusions"]:
# Look for obvious signs that this is a mirror:
func = lambda attr: attr and any(
hint in attr for hint in MIRROR_HINTS)
if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError()
soup = soup.body
is_comment = lambda text: isinstance(text, bs4.element.Comment)
for comment in soup.find_all(text=is_comment):
comment.extract()
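
The exclusion check added above relies on a simple tell: sites that mirror Wikipedia's rendered pages usually keep links into the /w/ script path. A self-contained illustration of the same heuristic (the HTML snippet is invented)::

    import bs4

    html = ('<html><body><p>Copied article text...</p>'
            '<a href="//en.wikipedia.org/w/index.php?title=Foo&action=edit">'
            'edit</a></body></html>')
    soup = bs4.BeautifulSoup(html, "lxml")
    hint = lambda attr: attr and "wikipedia.org/w/" in attr
    if soup.find_all(href=hint) or soup.find_all(src=hint):
        print("looks like a mirror; the parser would raise ParserExclusionError")
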
@@ -200,7 +229,7 @@ class _PDFParser(_BaseTextParser):
(u"\u2022", u" "),
]
def parse(self):
def parse(self, **kwargs):
"""Return extracted text from the PDF."""
output = StringIO()
manager = pdfinterp.PDFResourceManager()
@@ -226,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text"
def parse(self):
def parse(self, **kwargs):
"""Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else ""
@@ -40,16 +40,21 @@ class CopyvioSource(object):
- :py:attr:`confidence`: the confidence of a violation, between 0 and 1
- :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
- :py:attr:`skipped`: whether this URL was skipped during the check
- :py:attr:`excluded`: whether this URL was in the exclusions list
"""
def __init__(self, workspace, url, headers=None, timeout=5):
def __init__(self, workspace, url, headers=None, timeout=5,
detect_exclusions=False):
self.workspace = workspace
self.url = url
self.headers = headers
self.timeout = timeout
self.detect_exclusions = detect_exclusions
self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION)
self.skipped = False
self.excluded = False
self._event1 = Event()
self._event2 = Event()
@@ -57,11 +62,15 @@
def __repr__(self):
"""Return the canonical string representation of the source."""
res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
return res.format(self.url, self.confidence, self.skipped)
res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
"excluded={3!r})")
return res.format(
self.url, self.confidence, self.skipped, self.excluded)
def __str__(self):
"""Return a nice string representation of the source."""
if self.excluded:
return "<CopyvioSource ({0}, excluded)>".format(self.url)
if self.skipped:
return "<CopyvioSource ({0}, skipped)>".format(self.url)
res = "<CopyvioSource ({0} with {1} conf)>"
@@ -34,6 +34,7 @@ from time import time
from urllib2 import build_opener, URLError
from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
@@ -155,7 +156,8 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None
return handler(content).parse()
return handler(content).parse(
detect_exclusions=source.detect_exclusions)
def _acquire_new_site(self):
"""Block for a new unassigned site queue."""
@@ -218,9 +220,15 @@
except StopIteration:
self._logger.debug("Exiting: got stop signal")
return
text = self._open_url(source)
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)
try:
text = self._open_url(source)
except ParserExclusionError:
source.skipped = source.excluded = True
source.finish_work()
else:
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)
def start(self):
"""Start the copyvio worker in a new thread."""
@@ -233,7 +241,8 @@ class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads."""
def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True):
url_timeout=5, num_workers=8, short_circuit=True,
detect_exclusions=False):
self.sources = []
self.finished = False
self.possible_miss = False
@@ -247,7 +256,8 @@
self._finish_lock = Lock()
self._short_circuit = short_circuit
self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout}
"timeout": url_timeout,
"detect_exclusions": detect_exclusions}
if _is_globalized:
self._queues = _global_queues
@@ -311,11 +321,15 @@ class CopyvioWorkspace(object):
if url in self._handled_urls:
continue
self._handled_urls.add(url)
if exclude_check and exclude_check(url):
continue
source = CopyvioSource(url=url, **self._source_args)
self.sources.append(source)
if exclude_check and exclude_check(url):
self._logger.debug(u"enqueue(): exclude {0}".format(url))
source.excluded = True
source.skip()
continue
if self._short_circuit and self.finished:
self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
source.skip()
@@ -371,6 +385,8 @@
def cmpfunc(s1, s2):
if s2.confidence != s1.confidence:
return 1 if s2.confidence > s1.confidence else -1
if s2.excluded != s1.excluded:
return 1 if s1.excluded else -1
return int(s1.skipped) - int(s2.skipped)
self.sources.sort(cmpfunc)
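
The cmp-style function above keeps the highest-confidence sources first, then pushes excluded sources behind ordinary ones, with skipped sources last among otherwise equal entries. An equivalent key-based sort, shown only as a sketch::

    sources.sort(key=lambda s: (-s.confidence, s.excluded, s.skipped))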