Browse Source

A couple more fixes and cleanup.

tags/v0.2
Ben Kurtovic 9 years ago
parent
commit
147b46f572
2 changed files with 7 additions and 6 deletions
  1. +5
    -5
      earwigbot/wiki/copyvios/parsers.py
  2. +2
    -1
      earwigbot/wiki/copyvios/workers.py

+ 5
- 5
earwigbot/wiki/copyvios/parsers.py View File

@@ -28,7 +28,7 @@ import mwparserfromhell


from earwigbot import importer from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError from earwigbot.exceptions import ParserExclusionError
-from earwigbot.copyvios.exclusions import MIRROR_HINTS
+from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS


bs4 = importer.new("bs4") bs4 = importer.new("bs4")
nltk = importer.new("nltk") nltk = importer.new("nltk")
@@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser):
"script", "style" "script", "style"
] ]


-def parse(self, detect_exclusions=False):
+def parse(self, **kwargs):
"""Return the actual text contained within an HTML document. """Return the actual text contained within an HTML document.


Implemented using :py:mod:`BeautifulSoup <bs4>` Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -203,7 +203,7 @@ class _HTMLParser(_BaseTextParser):
# no scrapable content (possibly JS or <frame> magic): # no scrapable content (possibly JS or <frame> magic):
return "" return ""


-if detect_exclusions:
+if kwargs["detect_exclusions"]:
# Look for obvious signs that this is a mirror: # Look for obvious signs that this is a mirror:
func = lambda attr: attr and any( func = lambda attr: attr and any(
hint in attr for hint in MIRROR_HINTS) hint in attr for hint in MIRROR_HINTS)
@@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser):
(u"\u2022", u" "), (u"\u2022", u" "),
] ]


-def parse(self, detect_exclusions=False):
+def parse(self, **kwargs):
"""Return extracted text from the PDF.""" """Return extracted text from the PDF."""
output = StringIO() output = StringIO()
manager = pdfinterp.PDFResourceManager() manager = pdfinterp.PDFResourceManager()
@@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page.""" """A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text" TYPE = "Text"


-def parse(self, detect_exclusions=False):
+def parse(self, **kwargs):
"""Unicode-ify and strip whitespace from the plain text document.""" """Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup converted = bs4.UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else "" return converted.strip() if converted else ""


+ 2
- 1
earwigbot/wiki/copyvios/workers.py View File

@@ -156,7 +156,8 @@ class _CopyvioWorker(object):
except (IOError, struct_error): except (IOError, struct_error):
return None return None


-return handler(content).parse(source.detect_exclusions)
+return handler(content).parse(
+detect_exclusions=source.detect_exclusions)


def _acquire_new_site(self): def _acquire_new_site(self):
"""Block for a new unassigned site queue.""" """Block for a new unassigned site queue."""


Loading…
Cancel
Save