@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
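With `detect_exclusions=True`, full checks now ask the parsers to flag likely mirrors while fetching each candidate page. A minimal sketch of the public entry point, assuming a configured bot; the config path and result fields used here are examples:

```python
# Hypothetical usage; assumes a valid config directory and the existing
# copyvio_check() keyword arguments.
from earwigbot.bot import Bot

bot = Bot(".earwigbot")  # example config path
page = bot.wiki.get_site("enwiki").get_page("Example")
result = page.copyvio_check(min_confidence=0.75, max_time=30)
print(result.url, result.confidence)
```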
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
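Passing `num_workers=1` by keyword (rather than the bare `1`) makes the call self-documenting: the trailing `max_time` fills `url_timeout`, and a single-URL comparison only needs one worker thread. Hypothetical usage of this path:

```python
# Hypothetical usage; reuses the page object from the sketch above and
# assumes copyvio_compare() keeps its current signature.
result = page.copyvio_compare("http://example.com/suspected-copy",
                              min_confidence=0.75, max_time=30)
print(result.violation, result.confidence)
```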
@@ -28,7 +28,7 @@ from urlparse import urlparse
 from earwigbot import exceptions

-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]

 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }

+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
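`MIRROR_HINTS` entries are plain substrings matched against link attributes, so `wikipedia.org/w/` catches pages loading skins, scripts, or images straight out of a MediaWiki install while leaving ordinary `wikipedia.org/wiki/...` article links alone. A quick illustration with made-up URLs:

```python
# Substring semantics of MIRROR_HINTS, shown on hypothetical URLs:
MIRROR_HINTS = ["wikipedia.org/w/"]

urls = [
    "https://en.wikipedia.org/w/skins/common/shared.css",  # hint found
    "https://en.wikipedia.org/wiki/Example",               # no hint
]
for url in urls:
    print(url, "->", any(hint in url for hint in MIRROR_HINTS))
```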
@@ -28,6 +28,7 @@ import mwparserfromhell
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.

         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)

-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""

+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
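Since the hint check runs before `soup = soup.body`, it sees the whole document, including stylesheet and script references in `<head>`. A standalone sketch of the same logic (using the stdlib `html.parser` backend so it runs without lxml; the sample HTML is invented):

```python
import bs4

MIRROR_HINTS = ["wikipedia.org/w/"]
HTML = """<html><head>
<link rel="stylesheet" href="https://en.wikipedia.org/w/skins/shared.css">
</head><body><p>Mirrored article text.</p></body></html>"""

soup = bs4.BeautifulSoup(HTML, "html.parser")
# find_all() accepts a callable attribute filter; the callable receives
# None when the attribute is missing, hence the "attr and" guard.
func = lambda attr: attr and any(hint in attr for hint in MIRROR_HINTS)
if soup.find_all(href=func) or soup.find_all(src=func):
    print("Looks like a MediaWiki mirror")
```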
@@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`: whether this URL was in the exclusions list
     """

-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False
@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None

-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)

     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""

     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -254,7 +255,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}

         if _is_globalized:
             self._queues = _global_queues
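Each `CopyvioSource` is built from `_source_args`, so the flag set on the workspace reaches every page the workers fetch. A sketch of driving the workspace directly (normally `copyvio_check` does this; the module paths and argument values below are assumptions):

```python
# A minimal sketch, assuming MarkovChain and CopyvioWorkspace live at
# these paths; header and URL values are examples only.
import logging
from earwigbot.copyvios.markov import MarkovChain
from earwigbot.copyvios.workers import CopyvioWorkspace

article = MarkovChain(u"some article text to compare against")
workspace = CopyvioWorkspace(
    article, min_confidence=0.75, max_time=30,
    logger=logging.getLogger("copyvios"),
    headers=[("User-Agent", "ExampleBot/1.0")],
    num_workers=2, detect_exclusions=True)
workspace.enqueue(["http://example.com/suspected-mirror"])
workspace.wait()
print(workspace.get_result().confidence)
```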