Add mirror detection logic to parsers; fixes.

commit 03910b6cb5 (tags/v0.2)
Ben Kurtovic, 9 years ago
5 changed files with 30 additions and 13 deletions
1. earwigbot/wiki/copyvios/__init__.py (+2, -2)
2. earwigbot/wiki/copyvios/exclusions.py (+3, -1)
3. earwigbot/wiki/copyvios/parsers.py (+16, -6)
4. earwigbot/wiki/copyvios/result.py (+4, -1)
5. earwigbot/wiki/copyvios/workers.py (+5, -3)

earwigbot/wiki/copyvios/__init__.py (+2, -2)

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
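
For context, a hedged sketch of the intent at these two call sites, using a hypothetical make_workspace helper that is not part of the commit: a full article check opts into mirror detection, while a direct URL comparison trusts the user-supplied page, so it leaves detection off (the default) and runs a single worker.

    from earwigbot.wiki.copyvios.workers import CopyvioWorkspace

    def make_workspace(article, min_confidence, max_time, logger, headers,
                       single_url=False):
        # Hypothetical helper: full checks enable mirror detection;
        # direct URL comparisons leave it off and use one worker.
        if single_url:
            return CopyvioWorkspace(article, min_confidence, max_time,
                                    logger, headers, num_workers=1)
        return CopyvioWorkspace(article, min_confidence, max_time,
                                logger, headers, detect_exclusions=True)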


earwigbot/wiki/copyvios/exclusions.py (+3, -1)

@@ -28,7 +28,7 @@ from urlparse import urlparse

 from earwigbot import exceptions

-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]

 DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }

+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
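
MIRROR_HINTS lists URL substrings that betray a Wikipedia mirror, since mirrors commonly hotlink resources under wikipedia.org/w/. A minimal sketch of the substring test it enables; the function name here is hypothetical:

    MIRROR_HINTS = ["wikipedia.org/w/"]

    def attr_hints_at_mirror(attr):
        # True if an href/src attribute value contains any known hint,
        # e.g. "//en.wikipedia.org/w/load.php?...". Tolerates attr=None.
        return bool(attr) and any(hint in attr for hint in MIRROR_HINTS)

    assert attr_hints_at_mirror("https://en.wikipedia.org/w/load.php")
    assert not attr_hints_at_mirror("https://example.com/article")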


earwigbot/wiki/copyvios/parsers.py (+16, -6)

@@ -28,6 +28,7 @@ import mwparserfromhell

 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS

 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.

         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)

-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""

+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
@@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser):
         (u"\u2022", u" "),
     ]

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"

-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
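
Putting the _HTMLParser changes together: a self-contained sketch of the new detection path, assuming bs4 is installed (parse_html and the stub exception are stand-ins for the real classes):

    import bs4

    MIRROR_HINTS = ["wikipedia.org/w/"]

    class ParserExclusionError(Exception):
        """Stand-in for earwigbot.exceptions.ParserExclusionError."""

    def parse_html(text, detect_exclusions=False):
        try:
            soup = bs4.BeautifulSoup(text, "lxml")
        except ValueError:  # lxml unavailable; fall back to default parser
            soup = bs4.BeautifulSoup(text)
        if not soup.body:
            return ""  # no <body> tag -> nothing to scrape
        if detect_exclusions:
            # Any href/src pointing into wikipedia.org/w/ marks a mirror:
            func = lambda attr: attr and any(
                hint in attr for hint in MIRROR_HINTS)
            if soup.find_all(href=func) or soup.find_all(src=func):
                raise ParserExclusionError()
        return soup.body.get_text()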


earwigbot/wiki/copyvios/result.py (+4, -1)

@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`: whether this URL was in the exclusions list
     """

-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions

         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False


earwigbot/wiki/copyvios/workers.py (+5, -3)

@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None

-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)

     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@ class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""

     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -254,7 +255,8 @@ class CopyvioWorkspace(object):
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}

         if _is_globalized:
             self._queues = _global_queues
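
End to end, the flag travels workspace -> source -> worker -> parser: the workspace stores detect_exclusions once in _source_args, every enqueued source inherits it, and the worker passes source.detect_exclusions into parse(). A reduced sketch with stub classes (no threading), not the bot's actual worker machinery:

    class CopyvioSource(object):
        def __init__(self, workspace, url, headers=None, timeout=5,
                     detect_exclusions=False):
            self.url = url
            self.detect_exclusions = detect_exclusions

    class CopyvioWorkspace(object):
        def __init__(self, headers=None, url_timeout=5,
                     detect_exclusions=False):
            self.sources = []
            # Stored once, then copied into every source we enqueue:
            self._source_args = {"workspace": self, "headers": headers,
                                 "timeout": url_timeout,
                                 "detect_exclusions": detect_exclusions}

        def enqueue(self, urls):
            for url in urls:
                self.sources.append(
                    CopyvioSource(url=url, **self._source_args))

    workspace = CopyvioWorkspace(detect_exclusions=True)
    workspace.enqueue(["http://example.com/"])
    assert workspace.sources[0].detect_exclusions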

