
Add mirror detection logic to parsers; fixes.

tags/v0.2
Ben Kurtovic, 9 years ago
commit 03910b6cb5
5 changed files with 30 additions and 13 deletions:

  1. earwigbot/wiki/copyvios/__init__.py    (+2, -2)
  2. earwigbot/wiki/copyvios/exclusions.py  (+3, -1)
  3. earwigbot/wiki/copyvios/parsers.py     (+16, -6)
  4. earwigbot/wiki/copyvios/result.py      (+4, -1)
  5. earwigbot/wiki/copyvios/workers.py     (+5, -3)

earwigbot/wiki/copyvios/__init__.py (+2, -2)

@@ -118,7 +118,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(parser.strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit)
+            short_circuit=short_circuit, detect_exclusions=True)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -176,7 +176,7 @@ class CopyvioMixIn(object):
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            max_time, 1)
+            max_time, num_workers=1)
         workspace.enqueue([url])
         workspace.wait()
         result = workspace.get_result()
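
One of the "fixes" in the commit title: with CopyvioWorkspace's parameter order (url_timeout=5, num_workers=8, ...; see workers.py below), the old positional call in copyvio_compare already bound 1 to num_workers, but only by accident of argument order. A minimal sketch of that binding, using a hypothetical stand-in function with the same defaults rather than the real class:

    def fake_workspace(article, min_confidence, max_time, logger, headers,
                       url_timeout=5, num_workers=8, short_circuit=True,
                       detect_exclusions=False):
        # Stand-in mirroring CopyvioWorkspace.__init__'s parameter order.
        return {"url_timeout": url_timeout, "num_workers": num_workers}

    max_time = 30
    print(fake_workspace(None, 0.75, max_time, None, [], max_time, 1))
    # {'url_timeout': 30, 'num_workers': 1} -- the old, purely positional call
    print(fake_workspace(None, 0.75, max_time, None, [], max_time, num_workers=1))
    # same binding, but the intent (one worker) is now explicit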


earwigbot/wiki/copyvios/exclusions.py (+3, -1)

@@ -28,7 +28,7 @@ from urlparse import urlparse
 
 from earwigbot import exceptions
 
-__all__ = ["ExclusionsDB"]
+__all__ = ["ExclusionsDB", "MIRROR_HINTS"]
 
 DEFAULT_SOURCES = {
     "all": [ # Applies to all, but located on enwiki
@@ -43,6 +43,8 @@ DEFAULT_SOURCES = {
     ]
 }
 
+MIRROR_HINTS = ["wikipedia.org/w/"]
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**


earwigbot/wiki/copyvios/parsers.py (+16, -6)

@@ -28,6 +28,7 @@ import mwparserfromhell
 
 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser):
         "script", "style"
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
         try:
-            soup = bs4.BeautifulSoup(self.text, "lxml").body
+            soup = bs4.BeautifulSoup(self.text, "lxml")
         except ValueError:
-            soup = bs4.BeautifulSoup(self.text).body
+            soup = bs4.BeautifulSoup(self.text)
 
-        if not soup:
+        if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <frame> magic):
             return ""
+
+        if detect_exclusions:
+            # Look for obvious signs that this is a mirror:
+            func = lambda attr: attr and any(
+                hint in attr for hint in MIRROR_HINTS)
+            if soup.find_all(href=func) or soup.find_all(src=func):
+                raise ParserExclusionError()
+
+        soup = soup.body
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
@@ -219,7 +229,7 @@
         (u"\u2022", u" "),
     ]
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Return extracted text from the PDF."""
         output = StringIO()
         manager = pdfinterp.PDFResourceManager()
@@ -245,7 +255,7 @@
     """A parser that can unicode-ify and strip text from a plain text page."""
     TYPE = "Text"
 
-    def parse(self):
+    def parse(self, detect_exclusions=False):
         """Unicode-ify and strip whitespace from the plain text document."""
         converted = bs4.UnicodeDammit(self.text).unicode_markup
         return converted.strip() if converted else ""
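
The core of the new mirror detection is the attribute filter in _HTMLParser.parse() above: any href or src attribute containing one of the MIRROR_HINTS substrings marks the page as a likely Wikipedia mirror and raises ParserExclusionError. A standalone sketch of that filter (the looks_like_mirror helper and sample HTML are illustrative only; requires bs4, with lxml optional just as in the parser):

    import bs4

    MIRROR_HINTS = ["wikipedia.org/w/"]

    def looks_like_mirror(html):
        """Return True if any href/src attribute contains a mirror hint."""
        try:
            soup = bs4.BeautifulSoup(html, "lxml")
        except ValueError:  # lxml missing; fall back to the default parser
            soup = bs4.BeautifulSoup(html)
        func = lambda attr: attr and any(hint in attr for hint in MIRROR_HINTS)
        return bool(soup.find_all(href=func) or soup.find_all(src=func))

    print(looks_like_mirror(
        '<a href="//en.wikipedia.org/w/index.php?title=Foo&action=edit">edit</a>'))
    # True -- the MediaWiki script path is a strong hint of a copied page
    print(looks_like_mirror('<a href="https://example.com/Foo">Foo</a>'))
    # False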


earwigbot/wiki/copyvios/result.py (+4, -1)

@@ -43,11 +43,14 @@ class CopyvioSource(object):
     - :py:attr:`excluded`: whether this URL was in the exclusions list
     """
 
-    def __init__(self, workspace, url, headers=None, timeout=5):
+    def __init__(self, workspace, url, headers=None, timeout=5,
+                 detect_exclusions=False):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
+        self.detect_exclusions = detect_exclusions
+
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
         self.skipped = False


earwigbot/wiki/copyvios/workers.py (+5, -3)

@@ -156,7 +156,7 @@ class _CopyvioWorker(object):
             except (IOError, struct_error):
                 return None
 
-        return handler(content).parse()
+        return handler(content).parse(source.detect_exclusions)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -240,7 +240,8 @@
     """Manages a single copyvio check distributed across threads."""
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
-                 url_timeout=5, num_workers=8, short_circuit=True):
+                 url_timeout=5, num_workers=8, short_circuit=True,
+                 detect_exclusions=False):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -254,7 +255,8 @@
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
-                             "timeout": url_timeout}
+                             "timeout": url_timeout,
+                             "detect_exclusions": detect_exclusions}
 
         if _is_globalized:
             self._queues = _global_queues
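
Taken together, the flag travels workspace → source → worker → parser: CopyvioWorkspace stores detect_exclusions in _source_args, each CopyvioSource carries it, and the worker forwards it to handler(content).parse(). A compressed, self-contained sketch of that plumbing (every class below is a simplified stand-in for the real one in the diff, and the mirror test is reduced to a bare substring check):

    MIRROR_HINTS = ["wikipedia.org/w/"]

    class ParserExclusionError(Exception):
        """Stand-in for earwigbot.exceptions.ParserExclusionError."""

    class FakeSource(object):
        """Stand-in for CopyvioSource: just the two relevant attributes."""
        def __init__(self, url, detect_exclusions=False):
            self.url = url
            self.detect_exclusions = detect_exclusions

    def parse_html(text, detect_exclusions=False):
        # Reduced _HTMLParser.parse(): flag obvious Wikipedia mirrors.
        if detect_exclusions and any(hint in text for hint in MIRROR_HINTS):
            raise ParserExclusionError()
        return text

    source = FakeSource("http://mirror.example/Foo", detect_exclusions=True)
    html = '<a href="//en.wikipedia.org/w/index.php?title=Foo">edit</a>'
    try:
        parse_html(html, source.detect_exclusions)   # what the worker does
    except ParserExclusionError:
        print("source looks like a Wikipedia mirror; skip it")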

