@@ -15,7 +15,8 @@ v0.2 (unreleased):
 - Added copyvio detector functionality: specifying a max time for checks;
   improved exclusion support. URL loading and parsing is parallelized to speed
   up check times, with a multi-threaded worker model that avoids concurrent
-  requests to the same domain. Fixed assorted bugs.
+  requests to the same domain. Improvements to the comparison algorithm. Fixed
+  assorted bugs.
 - Added support for Wikimedia Labs when creating a config file.
 - Added and improved lazy importing for various dependencies.
 - Fixed a bug in job scheduling.
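
Note on the worker model mentioned above: the _CopyvioWorker hunks below have
each worker thread take exclusive ownership of one site's URL queue, which is
how concurrent requests to the same host are avoided. A minimal standalone
sketch of that idea; the names here (DomainQueues, push, acquire) are
illustrative, not the actual earwigbot API:

    from collections import deque
    from threading import Lock
    from urlparse import urlparse

    class DomainQueues(object):
        """Queue URLs per domain so one thread owns each host at a time."""

        def __init__(self):
            self.lock = Lock()
            self.queues = {}           # netloc -> deque of URLs
            self.unassigned = deque()  # netlocs not yet owned by a worker

        def push(self, url):
            netloc = urlparse(url).netloc
            with self.lock:
                if netloc not in self.queues:
                    self.queues[netloc] = deque()
                    self.unassigned.append(netloc)
                self.queues[netloc].append(url)

        def acquire(self):
            # A worker takes a whole domain's queue, guaranteeing at most
            # one in-flight request per host.
            with self.lock:
                return self.unassigned.popleft() if self.unassigned else None
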
@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot::
         +-- UnknownSearchEngineError
         +-- UnsupportedSearchEngineError
         +-- SearchQueryError
+        +-- ParserExclusionError
 """

 class EarwigBotError(Exception):
@@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError):
     :py:attr:`config.wiki["search"]["engine"]`.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """

 class UnsupportedSearchEngineError(CopyvioCheckError):
@@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError):
     couldn't be imported.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """

 class SearchQueryError(CopyvioCheckError):
     """Some error occurred while doing a search query.

     Raised by :py:meth:`Page.copyvio_check
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and
-    :py:meth:`Page.copyvio_compare
-    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`.
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`.
     """
+
+class ParserExclusionError(CopyvioCheckError):
+    """A content parser detected that the given source should be excluded.
+
+    Raised internally by :py:meth:`Page.copyvio_check
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
+    exposed in client code.
+    """
@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
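
For context, copyvio_compare() runs the same workspace against a single known
URL, so one worker suffices and no search queries are made. A hedged usage
sketch; the result attributes shown follow from CopyvioCheckResult's name but
are not spelled out in this diff:

    result = page.copyvio_compare("http://example.com/suspected-mirror.html")
    if result.violation:
        print "confidence: {0}".format(result.confidence)
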
@@ -28,7 +28,7 @@ from urlparse import urlparse

 from earwigbot import exceptions

-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]

 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki

@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }

+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
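
MIRROR_HINTS is exported so the HTML parser (below) can flag pages that link
back into a MediaWiki installation. The test is a plain substring match over
link attribute values:

    >>> MIRROR_HINTS = ["wikipedia.org/w/"]
    >>> href = "https://en.wikipedia.org/w/index.php?title=Foo&oldid=1"
    >>> any(hint in href for hint in MIRROR_HINTS)
    True
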
@@ -30,7 +30,7 @@ class MarkovChain(object):
     """Implements a basic ngram Markov chain of words."""
     START = -1
     END = -2
-    degree = 3  # 2 for bigrams, 3 for trigrams, etc.
+    degree = 5  # 2 for bigrams, 3 for trigrams, etc.

     def __init__(self, text):
         self.text = text
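
Raising `degree` from 3 to 5 means chains are built over five-word windows,
so only longer verbatim runs register as matches; this trades some recall for
fewer false positives from short stock phrases. A standalone sketch of the
windowing, assuming the chain pads with degree - 1 START/END markers, as the
class constants suggest:

    def ngrams(words, degree=5, start=-1, end=-2):
        padded = [start] * (degree - 1) + words + [end] * (degree - 1)
        return [tuple(padded[i:i + degree])
                for i in range(len(padded) - degree + 1)]

    # ngrams("copyright violations are bad".split()) yields 5-tuples from
    # (-1, -1, -1, -1, 'copyright') through ('bad', -2, -2, -2, -2).
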
@@ -27,6 +27,8 @@ from StringIO import StringIO
 import mwparserfromhell

 from earwigbot import importer
+from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -58,6 +60,21 @@ class _BaseTextParser(object):
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
+    TEMPLATE_MERGE_THRESHOLD = 35
+
+    def _merge_templates(self, code):
+        """Merge template contents into wikicode when the values are long."""
+        for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
+            chunks = []
+            for param in template.params:
+                if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
+                    self._merge_templates(param.value)
+                    chunks.append(param.value)
+            if chunks:
+                subst = u" ".join(map(unicode, chunks))
+                code.replace(template, u" " + subst + u" ")
+            else:
+                code.remove(template)

     def strip(self):
         """Clean the page's raw text by removing templates and formatting.

@@ -94,6 +111,9 @@ class ArticleTextParser(_BaseTextParser):
         for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
             remove(wikicode, tag)

+        # Merge in template contents when the values are long:
+        self._merge_templates(wikicode)
+
         clean = wikicode.strip_code(normalize=True, collapse=True)
         self.clean = re.sub("\n\n+", "\n", clean).strip()
         return self.clean
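
The effect of _merge_templates() is that long parameter values (quotes, image
captions, and the like) survive into the stripped text that gets compared
against sources, while templates with only short boilerplate parameters are
dropped entirely. A hedged illustration of the same logic as a standalone
snippet, assuming mwparserfromhell is installed:

    import mwparserfromhell

    text = u"{{Quote|text=A sentence well over thirty-five characters long.|sign=X}}"
    code = mwparserfromhell.parse(text)
    tmpl = code.filter_templates()[0]
    # Only `text` meets the 35-character threshold, so the template is
    # replaced by that parameter's value alone:
    keep = [p.value for p in tmpl.params if len(p.value) >= 35]
    code.replace(tmpl, u" " + u" ".join(map(unicode, keep)) + u" ")
    print unicode(code).strip()
    # -> A sentence well over thirty-five characters long.
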
@@ -167,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return the actual text contained within an HTML document.

         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)

-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""

+        if kwargs["detect_exclusions"]:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
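
The mirror check leans on BeautifulSoup's callable attribute filters: bs4
invokes `func` with each tag's href/src value (None when the attribute is
absent), so the `attr and any(...)` guard skips tags without it. A standalone
sketch, assuming bs4 and lxml are installed:

    import bs4

    MIRROR_HINTS = ["wikipedia.org/w/"]
    html = '<body><a href="//en.wikipedia.org/w/index.php">edit</a></body>'
    soup = bs4.BeautifulSoup(html, "lxml")
    func = lambda attr: attr and any(hint in attr for hint in MIRROR_HINTS)
    print bool(soup.find_all(href=func) or soup.find_all(src=func))  # True
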
@@ -200,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]

-    def parse(self):
+    def parse(self, **kwargs):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()

@@ -226,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"

-    def parse(self):
+    def parse(self, **kwargs):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
@@ -40,16 +40,21 @@ class CopyvioSource(object):
     - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
     - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
     - :py:attr:`skipped`: whether this URL was skipped during the check
+    - :py:attr:`excluded`: whether this URL was in the exclusions list
     """

-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
+        self.excluded = False

         self._event1 = Event()
         self._event2 = Event()

@@ -57,11 +62,15 @@ class CopyvioSource(object):
     def __repr__(self):
         """Return the canonical string representation of the source."""
-        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
-        return res.format(self.url, self.confidence, self.skipped)
+        res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
+               "excluded={3!r})")
+        return res.format(
+            self.url, self.confidence, self.skipped, self.excluded)

     def __str__(self):
         """Return a nice string representation of the source."""
+        if self.excluded:
+            return "<CopyvioSource ({0}, excluded)>".format(self.url)
         if self.skipped:
             return "<CopyvioSource ({0}, skipped)>".format(self.url)
         res = "<CopyvioSource ({0} with {1} conf)>"
@@ -34,6 +34,7 @@ from time import time
 from urllib2 import build_opener, URLError

 from earwigbot import importer
+from earwigbot.exceptions import ParserExclusionError
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

@@ -155,7 +156,8 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None

-        return handler(content).parse()
+        return handler(content).parse(
+            detect_exclusions=source.detect_exclusions)

     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""

@@ -218,9 +220,15 @@ class _CopyvioWorker(object):
             except StopIteration:
                 self._logger.debug("Exiting: got stop signal")
                 return
-            text = self._open_url(source)
-            chain = MarkovChain(text) if text else None
-            source.workspace.compare(source, chain)
+            try:
+                text = self._open_url(source)
+            except ParserExclusionError:
+                source.skipped = source.excluded = True
+                source.finish_work()
+            else:
+                chain = MarkovChain(text) if text else None
+                source.workspace.compare(source, chain)

     def start(self):
         """Start the copyvio worker in a new thread."""
@@ -233,7 +241,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""

     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False

@@ -247,7 +256,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}

         if _is_globalized:
             self._queues = _global_queues
@@ -311,11 +321,15 @@ class CopyvioWorkspace(object):
             if url in self._handled_urls:
                 continue
             self._handled_urls.add(url)
-            if exclude_check and exclude_check(url):
-                continue

             source = CopyvioSource(url=url, **self._source_args)
             self.sources.append(source)
+
+            if exclude_check and exclude_check(url):
+                self._logger.debug(u"enqueue(): exclude {0}".format(url))
+                source.excluded = True
+                source.skip()
+                continue

             if self._short_circuit and self.finished:
                 self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                 source.skip()
@@ -371,6 +385,8 @@ class CopyvioWorkspace(object):
         def cmpfunc(s1, s2):
             if s2.confidence != s1.confidence:
                 return 1 if s2.confidence > s1.confidence else -1
+            if s2.excluded != s1.excluded:
+                return 1 if s1.excluded else -1
             return int(s1.skipped) - int(s2.skipped)

         self.sources.sort(cmpfunc)
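
The comparator orders results best-first: descending confidence, then
non-excluded before excluded, then unskipped before skipped, so excluded
mirrors sink to the bottom at equal confidence. A quick demonstration with
stand-in objects (not the real CopyvioSource):

    from collections import namedtuple

    S = namedtuple("S", "confidence excluded skipped url")
    sources = [S(0.0, True, True, "mirror"), S(0.9, False, False, "hit"),
               S(0.0, False, True, "skip"), S(0.0, False, False, "miss")]

    def cmpfunc(s1, s2):
        if s2.confidence != s1.confidence:
            return 1 if s2.confidence > s1.confidence else -1
        if s2.excluded != s1.excluded:
            return 1 if s1.excluded else -1
        return int(s1.skipped) - int(s2.skipped)

    print [s.url for s in sorted(sources, cmp=cmpfunc)]
    # -> ['hit', 'miss', 'skip', 'mirror']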