@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit, detect_exclusions=True)
+            short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
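Here the single-purpose `detect_exclusions=True` flag becomes an open-ended `parser_args` dict, with the mirror-hint list written out at the call site (it had been the module-level `MIRROR_HINTS` constant removed below). A minimal standalone sketch of the options-dict pattern, using hypothetical stand-in classes rather than the real earwigbot ones:

```python
# Hypothetical stand-ins; not the real earwigbot classes. The point of
# the refactor: the workspace never interprets parser_args, it only
# forwards it, so new parser options need no new keyword arguments here.
class Workspace(object):
    def __init__(self, short_circuit=True, parser_args=None):
        self.short_circuit = short_circuit
        self.parser_args = parser_args  # opaque to the workspace

ws = Workspace(parser_args={"mirror_hints": ["wikipedia.org/w/"]})
print(ws.parser_args)  # {'mirror_hints': ['wikipedia.org/w/']}
```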
@@ -28,7 +28,7 @@ from urlparse import urlparse
 
 from earwigbot import exceptions
 
-__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
+__all__ = ["ExclusionsDB"]
 
 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
@@ -43,8 +43,6 @@ DEFAULT_SOURCES = {
     ]
 }
 
-MIRROR_HINTS = ["wikipedia.org/w/"]
-
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
@@ -35,7 +35,7 @@ class MarkovChain(object):
     def __init__(self, text):
         self.text = text
         self.chain = defaultdict(lambda: defaultdict(lambda: 0))
-        words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
+        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
 
         padding = self.degree - 1
         words = ([self.START] * padding) + words + ([self.END] * padding)
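The only change to `MarkovChain` is cosmetic: `\w` and `\s` are not recognized Python string escapes, so the raw-string prefix leaves the pattern byte-for-byte identical; it simply marks the intent and avoids invalid-escape warnings in later Python versions. What the line actually does to article text:

```python
# -*- coding: utf-8 -*-
# Standalone demo of the tokenizer line above.
from re import sub, UNICODE

text = u"Hello, world! It's a well-known example."
words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
# Punctuation is dropped, but hyphenated words survive intact:
print(u" ".join(words))  # hello world its a well-known example
```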
@@ -28,7 +28,6 @@ import mwparserfromhell
 
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
-from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -44,8 +43,9 @@ class _BaseTextParser(object):
     """Base class for a parser that handles text."""
     TYPE = None
 
-    def __init__(self, text):
+    def __init__(self, text, args=None):
         self.text = text
+        self._args = args or {}
 
     def __repr__(self):
         """Return the canonical string representation of the text parser."""
@@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]
 
-    def parse(self, **kwargs):
+    def parse(self):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -203,10 +203,10 @@ class _HTMLParser(_BaseTextParser):
             # no scrapable content (possibly JS or <frame> magic):
             return ""
 
-        if kwargs["detect_exclusions"]:
+        if "mirror_hints" in self._args:
             # Look for obvious signs that this is a mirror:
             func = lambda attr: attr and any(
-                hint in attr for hint in MIRROR_HINTS)
+                hint in attr for hint in self._args["mirror_hints"])
             if soup.find_all(href=func) or soup.find_all(src=func):
                 raise ParserExclusionError()
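With the module-level constant gone, the hints arrive through `self._args`, and the presence test `"mirror_hints" in self._args` doubles as the on/off switch that `detect_exclusions` used to be. The BeautifulSoup idiom is worth spelling out: passing a callable as an attribute filter makes `find_all` keep only tags whose `href` (or `src`) satisfies it. A standalone demonstration (requires `bs4`):

```python
import bs4

html = u'<a href="https://en.wikipedia.org/w/index.php?title=Foo">source</a>'
soup = bs4.BeautifulSoup(html, "html.parser")

hints = ["wikipedia.org/w/"]
# attr is the attribute's value, or None on tags lacking the attribute:
func = lambda attr: attr and any(hint in attr for hint in hints)

# A link back into Wikipedia's /w/ path marks the page as a likely mirror:
print(bool(soup.find_all(href=func) or soup.find_all(src=func)))  # True
```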
@@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]
 
-    def parse(self, **kwargs):
+    def parse(self):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
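The body of this method (mostly outside the hunk) follows pdfminer's usual extraction recipe; a hedged standalone sketch of that recipe, under the assumption that the surrounding code resembles it:

```python
# Assumes Python 2 and pdfminer, matching the codebase's era.
from StringIO import StringIO

from pdfminer import converter, pdfinterp, pdfpage

def extract_pdf_text(data):
    """Return text extracted from PDF bytes (illustrative sketch only)."""
    output = StringIO()
    manager = pdfinterp.PDFResourceManager()
    conv = converter.TextConverter(manager, output)
    interp = pdfinterp.PDFPageInterpreter(manager, conv)
    for page in pdfpage.PDFPage.get_pages(StringIO(data)):
        interp.process_page(page)
    conv.close()
    return output.getvalue().decode("utf-8", "replace")
```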
@@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"
 
-    def parse(self, **kwargs):
+    def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
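`UnicodeDammit` is bs4's encoding sniffer: it guesses the byte encoding and hands back a unicode string, or `None` when every guess fails, hence the guard on the return line. For example:

```python
import bs4

raw = b"caf\xc3\xa9\n"  # UTF-8 bytes with no declared encoding
converted = bs4.UnicodeDammit(raw).unicode_markup
print(repr(converted.strip() if converted else ""))  # u'caf\xe9' on Python 2
```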
@@ -44,12 +44,12 @@ class CopyvioSource(object):
     """
 
     def __init__(self, workspace, url, headers=None, timeout=5,
-                 detect_exclusions=False):
+                 parser_args=None):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
-        self.detect_exclusions = detect_exclusions
+        self.parser_args = parser_args
 
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
@@ -156,8 +156,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content).parse(
-            detect_exclusions=source.detect_exclusions)
+        return handler(content, source.parser_args).parse()
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -242,7 +241,7 @@ class CopyvioWorkspace(object):
     def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8, short_circuit=True,
-                 detect_exclusions=False):
+                 parser_args=None):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -255,9 +254,9 @@ class CopyvioWorkspace(object):
         self._handled_urls = set()
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
-        self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout,
-                             "detect_exclusions": detect_exclusions}
+        self._source_args = {
+            "workspace": self, "headers": headers, "timeout": url_timeout,
+            "parser_args": parser_args}
 
         if _is_globalized:
             self._queues = _global_queues
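`_source_args` is a kwargs template the workspace reuses for every source it spawns; swapping `detect_exclusions` for `parser_args` inside it completes the thread from `CopyvioMixIn` down to the parsers. The construction of sources isn't part of this diff, so the following forwarding sketch is an assumption about how such a template is typically consumed:

```python
# Hypothetical consumption of the template (not shown in this diff):
source_args = {"headers": [], "timeout": 5,
               "parser_args": {"mirror_hints": ["wikipedia.org/w/"]}}

def make_source(url, **kwargs):
    # Each new source receives the same shared settings plus its own URL:
    return dict(kwargs, url=url)

print(make_source("http://example.com/", **source_args)["parser_args"])
```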