Browse Source

Refactor out mirror hinting logic in source parsers.

tags/v0.2
Ben Kurtovic 9 years ago
parent
commit
91846ce4fb
6 changed files with 17 additions and 20 deletions
  1. +1
    -1
      earwigbot/wiki/copyvios/__init__.py
  2. +1
    -3
      earwigbot/wiki/copyvios/exclusions.py
  3. +1
    -1
      earwigbot/wiki/copyvios/markov.py
  4. +7
    -7
      earwigbot/wiki/copyvios/parsers.py
  5. +2
    -2
      earwigbot/wiki/copyvios/result.py
  6. +5
    -6
      earwigbot/wiki/copyvios/workers.py

+ 1
- 1
earwigbot/wiki/copyvios/__init__.py View File

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
article = MarkovChain(parser.strip()) article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders, article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit, detect_exclusions=True)
short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
if self._exclusions_db: if self._exclusions_db:
self._exclusions_db.sync(self.site.name) self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u) exclude = lambda u: self._exclusions_db.check(self.site.name, u)


+ 1
- 3
earwigbot/wiki/copyvios/exclusions.py View File

@@ -28,7 +28,7 @@ from urlparse import urlparse


from earwigbot import exceptions from earwigbot import exceptions


__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
__all__ = ["ExclusionsDB"]


DEFAULT_SOURCES = { DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki "all": [ # Applies to all, but located on enwiki
@@ -43,8 +43,6 @@ DEFAULT_SOURCES = {
] ]
} }


MIRROR_HINTS = ["wikipedia.org/w/"]

class ExclusionsDB(object): class ExclusionsDB(object):
""" """
**EarwigBot: Wiki Toolset: Exclusions Database Manager** **EarwigBot: Wiki Toolset: Exclusions Database Manager**


+ 1
- 1
earwigbot/wiki/copyvios/markov.py View File

@@ -35,7 +35,7 @@ class MarkovChain(object):
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()


padding = self.degree - 1 padding = self.degree - 1
words = ([self.START] * padding) + words + ([self.END] * padding) words = ([self.START] * padding) + words + ([self.END] * padding)


+ 7
- 7
earwigbot/wiki/copyvios/parsers.py View File

@@ -28,7 +28,6 @@ import mwparserfromhell


from earwigbot import importer from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError from earwigbot.exceptions import ParserExclusionError
from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS


bs4 = importer.new("bs4") bs4 = importer.new("bs4")
nltk = importer.new("nltk") nltk = importer.new("nltk")
@@ -44,8 +43,9 @@ class _BaseTextParser(object):
"""Base class for a parser that handles text.""" """Base class for a parser that handles text."""
TYPE = None TYPE = None


def __init__(self, text):
def __init__(self, text, args=None):
self.text = text self.text = text
self._args = args or {}


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the text parser.""" """Return the canonical string representation of the text parser."""
@@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser):
"script", "style" "script", "style"
] ]


def parse(self, **kwargs):
def parse(self):
"""Return the actual text contained within an HTML document. """Return the actual text contained within an HTML document.


Implemented using :py:mod:`BeautifulSoup <bs4>` Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -203,10 +203,10 @@ class _HTMLParser(_BaseTextParser):
# no scrapable content (possibly JS or <frame> magic): # no scrapable content (possibly JS or <frame> magic):
return "" return ""


if kwargs["detect_exclusions"]:
if "mirror_hints" in self._args:
# Look for obvious signs that this is a mirror: # Look for obvious signs that this is a mirror:
func = lambda attr: attr and any( func = lambda attr: attr and any(
hint in attr for hint in MIRROR_HINTS)
hint in attr for hint in self._args["mirror_hints"])
if soup.find_all(href=func) or soup.find_all(src=func): if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError() raise ParserExclusionError()


@@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser):
(u"\u2022", u" "), (u"\u2022", u" "),
] ]


def parse(self, **kwargs):
def parse(self):
"""Return extracted text from the PDF.""" """Return extracted text from the PDF."""
output = StringIO() output = StringIO()
manager = pdfinterp.PDFResourceManager() manager = pdfinterp.PDFResourceManager()
@@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page.""" """A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text" TYPE = "Text"


def parse(self, **kwargs):
def parse(self):
"""Unicode-ify and strip whitespace from the plain text document.""" """Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup converted = bs4.UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else "" return converted.strip() if converted else ""


+ 2
- 2
earwigbot/wiki/copyvios/result.py View File

@@ -44,12 +44,12 @@ class CopyvioSource(object):
""" """


def __init__(self, workspace, url, headers=None, timeout=5, def __init__(self, workspace, url, headers=None, timeout=5,
detect_exclusions=False):
parser_args=None):
self.workspace = workspace self.workspace = workspace
self.url = url self.url = url
self.headers = headers self.headers = headers
self.timeout = timeout self.timeout = timeout
self.detect_exclusions = detect_exclusions
self.parser_args = parser_args


self.confidence = 0.0 self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION) self.chains = (EMPTY, EMPTY_INTERSECTION)


+ 5
- 6
earwigbot/wiki/copyvios/workers.py View File

@@ -156,8 +156,7 @@ class _CopyvioWorker(object):
except (IOError, struct_error): except (IOError, struct_error):
return None return None


return handler(content).parse(
detect_exclusions=source.detect_exclusions)
return handler(content, source.parser_args).parse()


def _acquire_new_site(self): def _acquire_new_site(self):
"""Block for a new unassigned site queue.""" """Block for a new unassigned site queue."""
@@ -242,7 +241,7 @@ class CopyvioWorkspace(object):


def __init__(self, article, min_confidence, max_time, logger, headers, def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True, url_timeout=5, num_workers=8, short_circuit=True,
detect_exclusions=False):
parser_args=None):
self.sources = [] self.sources = []
self.finished = False self.finished = False
self.possible_miss = False self.possible_miss = False
@@ -255,9 +254,9 @@ class CopyvioWorkspace(object):
self._handled_urls = set() self._handled_urls = set()
self._finish_lock = Lock() self._finish_lock = Lock()
self._short_circuit = short_circuit self._short_circuit = short_circuit
self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout,
"detect_exclusions": detect_exclusions}
self._source_args = {
"workspace": self, "headers": headers, "timeout": url_timeout,
"parser_args": parser_args}


if _is_globalized: if _is_globalized:
self._queues = _global_queues self._queues = _global_queues


Loading…
Cancel
Save