Browse Source

Refactor out mirror hinting logic in source parsers.

tags/v0.2
Ben Kurtovic 9 years ago
parent
commit
91846ce4fb
6 changed files with 17 additions and 20 deletions
  1. earwigbot/wiki/copyvios/__init__.py (+1, -1)
  2. earwigbot/wiki/copyvios/exclusions.py (+1, -3)
  3. earwigbot/wiki/copyvios/markov.py (+1, -1)
  4. earwigbot/wiki/copyvios/parsers.py (+7, -7)
  5. earwigbot/wiki/copyvios/result.py (+2, -2)
  6. earwigbot/wiki/copyvios/workers.py (+5, -6)

+ 1
- 1
earwigbot/wiki/copyvios/__init__.py View File

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
- short_circuit=short_circuit, detect_exclusions=True)
+ short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u)


+ 1
- 3
earwigbot/wiki/copyvios/exclusions.py View File

@@ -28,7 +28,7 @@ from urlparse import urlparse

from earwigbot import exceptions

- __all__ = ["ExclusionsDB", "MIRROR_HINTS"]
+ __all__ = ["ExclusionsDB"]

DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki
@@ -43,8 +43,6 @@ DEFAULT_SOURCES = {
]
}

- MIRROR_HINTS = ["wikipedia.org/w/"]

class ExclusionsDB(object):
"""
**EarwigBot: Wiki Toolset: Exclusions Database Manager**


+ 1
- 1
earwigbot/wiki/copyvios/markov.py View File

@@ -35,7 +35,7 @@ class MarkovChain(object):
def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
- words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
+ words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()

padding = self.degree - 1
words = ([self.START] * padding) + words + ([self.END] * padding)


+ 7
- 7
earwigbot/wiki/copyvios/parsers.py View File

@@ -28,7 +28,6 @@ import mwparserfromhell

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError
- from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

bs4 = importer.new("bs4")
nltk = importer.new("nltk")
@@ -44,8 +43,9 @@ class _BaseTextParser(object):
"""Base class for a parser that handles text."""
TYPE = None

- def __init__(self, text):
+ def __init__(self, text, args=None):
self.text = text
+ self._args = args or {}

def __repr__(self):
"""Return the canonical string representation of the text parser."""
@@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser):
"script", "style"
]

def parse(self, **kwargs):
def parse(self):
"""Return the actual text contained within an HTML document.

Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -203,10 +203,10 @@ class _HTMLParser(_BaseTextParser):
# no scrapable content (possibly JS or <frame> magic):
return ""

- if kwargs["detect_exclusions"]:
+ if "mirror_hints" in self._args:
# Look for obvious signs that this is a mirror:
func = lambda attr: attr and any(
- hint in attr for hint in MIRROR_HINTS)
+ hint in attr for hint in self._args["mirror_hints"])
if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError()

@@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser):
(u"\u2022", u" "),
]

- def parse(self, **kwargs):
+ def parse(self):
"""Return extracted text from the PDF."""
output = StringIO()
manager = pdfinterp.PDFResourceManager()
@@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text"

- def parse(self, **kwargs):
+ def parse(self):
"""Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else ""


+ 2
- 2
earwigbot/wiki/copyvios/result.py View File

@@ -44,12 +44,12 @@ class CopyvioSource(object):
"""

def __init__(self, workspace, url, headers=None, timeout=5,
- detect_exclusions=False):
+ parser_args=None):
self.workspace = workspace
self.url = url
self.headers = headers
self.timeout = timeout
- self.detect_exclusions = detect_exclusions
+ self.parser_args = parser_args

self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION)


+ 5
- 6
earwigbot/wiki/copyvios/workers.py View File

@@ -156,8 +156,7 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None

- return handler(content).parse(
- detect_exclusions=source.detect_exclusions)
+ return handler(content, source.parser_args).parse()

def _acquire_new_site(self):
"""Block for a new unassigned site queue."""
@@ -242,7 +241,7 @@ class CopyvioWorkspace(object):

def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True,
- detect_exclusions=False):
+ parser_args=None):
self.sources = []
self.finished = False
self.possible_miss = False
@@ -255,9 +254,9 @@ class CopyvioWorkspace(object):
self._handled_urls = set()
self._finish_lock = Lock()
self._short_circuit = short_circuit
- self._source_args = {"workspace": self, "headers": headers,
- "timeout": url_timeout,
- "detect_exclusions": detect_exclusions}
+ self._source_args = {
+ "workspace": self, "headers": headers, "timeout": url_timeout,
+ "parser_args": parser_args}

if _is_globalized:
self._queues = _global_queues


Loading…
Cancel
Save