@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
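This hunk makes mirror detection the default for full checks: `copyvio_check()` now always builds its workspace with `detect_exclusions=True`. A hedged sketch of what a caller sees, assuming the usual `Page.copyvio_check()` keywords and the `CopyvioCheckResult` attributes from the rest of this module (neither appears in this diff):

```python
# Hypothetical caller; "bot", the site/page names, and the keyword
# values are placeholders, not part of this change.
site = bot.wiki.get_site("enwiki")
page = site.get_page("Example article")
result = page.copyvio_check(min_confidence=0.75, max_time=30)
if result.violation:
    print "Likely copyvio of %s (%.2f)" % (result.url, result.confidence)
```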
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
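In `copyvio_compare()` the bare `1` was already binding to `num_workers` by position; spelling it out pins the binding. Since this same patch appends `detect_exclusions` to the constructor's parameter list, the keyword form protects the call site from any future reshuffling:

```python
# Illustration only; the positional values are placeholders. Both calls
# bind 1 to num_workers today, but the keyword form cannot silently
# change meaning if a parameter is later inserted before num_workers.
CopyvioWorkspace(article, 0.75, 30, logger, headers, 30, 1)
CopyvioWorkspace(article, 0.75, 30, logger, headers, 30, num_workers=1)
```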
@@ -28,7 +28,7 @@ from urlparse import urlparse
 
 from earwigbot import exceptions
 
-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
 
 DEFAULT_SOURCES = {
     "all": [ # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }
 
+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
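`MIRROR_HINTS` is deliberately added to `__all__`: the parser module imports it below. Each hint is a bare substring tested against link attributes, so `"wikipedia.org/w/"` flags pages whose links still point into Wikipedia's `/w/` script directory (`index.php`, `load.php`, and friends), a telltale leftover on mirrors. A quick illustration with made-up URLs:

```python
hint = "wikipedia.org/w/"
print hint in "https://en.wikipedia.org/w/index.php?title=Foo"  # True
print hint in "https://example.com/wiki/Foo"                    # False
```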
@@ -28,6 +28,7 @@ import mwparserfromhell
 
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.copyvios.exclusions import MIRROR_HINTS
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
-        if not soup:
+            soup = bs4.BeautifulSoup(self.text)
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""
 
+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
+
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
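Two things happen above: `.body` is no longer taken at parse time, and `soup = soup.body` only runs after the hint scan, so mirror fingerprints in the `<head>` (stylesheet `href`s, script `src`s) are still visible to `find_all`. A self-contained sketch of the same matching idea, with an invented HTML sample:

```python
# Standalone approximation of the detection logic; the HTML is made up.
import bs4

MIRROR_HINTS = ["wikipedia.org/w/"]
html = u"""<html><head>
<link rel="stylesheet" href="https://en.wikipedia.org/w/load.php?only=styles">
</head><body><p>Mirrored article text.</p></body></html>"""

soup = bs4.BeautifulSoup(html, "lxml")
# find_all passes each tag's attribute value (or None) to the filter:
func = lambda attr: attr and any(hint in attr for hint in MIRROR_HINTS)
if soup.find_all(href=func) or soup.find_all(src=func):
    print "mirror hint found"  # the real parser raises ParserExclusionError
```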
@@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`: whether this URL was in the exclusions list
     """
 
-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
+
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -254,7 +255,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}
 
         if _is_globalized:
             self._queues = _global_queues
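Because `_source_args` now carries the flag alongside the other shared keywords, every source the workspace spawns inherits it for free; the dict keys line up exactly with `CopyvioSource.__init__`'s parameters, which suggests construction by keyword expansion. A minimal sketch, assuming an `enqueue()` shaped like the calls above (its body is not part of this diff):

```python
# Hypothetical enqueue(); only _source_args, sources, and CopyvioSource
# come from the diff itself.
def enqueue(self, urls):
    for url in urls:
        source = CopyvioSource(url=url, **self._source_args)
        self.sources.append(source)
```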