Browse Source

Refactor parsers; fix empty document behavior.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
30f72df470
4 changed files with 48 additions and 20 deletions
  1. +28
    -6
      earwigbot/wiki/copyvios/parsers.py
  2. +5
    -2
      earwigbot/wiki/copyvios/result.py
  3. +14
    -12
      earwigbot/wiki/copyvios/workers.py
  4. +1
    -0
      setup.py

+ 28
- 6
earwigbot/wiki/copyvios/parsers.py View File

@@ -21,6 +21,7 @@
# SOFTWARE. # SOFTWARE.


from os import path from os import path
from StringIO import StringIO


import mwparserfromhell import mwparserfromhell


@@ -28,11 +29,11 @@ from earwigbot import importer


bs4 = importer.new("bs4") bs4 = importer.new("bs4")
nltk = importer.new("nltk") nltk = importer.new("nltk")
PyPDF2 = importer.new("PyPDF2")


__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
"PlainTextParser"]
__all__ = ["ArticleTextParser", "get_parser"]


class BaseTextParser(object):
class _BaseTextParser(object):
"""Base class for a parser that handles text.""" """Base class for a parser that handles text."""


def __init__(self, text): def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
return "<{0} of text with size {1}>".format(name, len(self.text)) return "<{0} of text with size {1}>".format(name, len(self.text))




class ArticleTextParser(BaseTextParser):
class ArticleTextParser(_BaseTextParser):
"""A parser that can strip and chunk wikicode article text.""" """A parser that can strip and chunk wikicode article text."""


def strip(self): def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
if link.url.startswith(schemes)] if link.url.startswith(schemes)]




class HTMLTextParser(BaseTextParser):
class _HTMLParser(_BaseTextParser):
"""A parser that can extract the text from an HTML document.""" """A parser that can extract the text from an HTML document."""
hidden_tags = [ hidden_tags = [
"script", "style" "script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
return "\n".join(soup.stripped_strings) return "\n".join(soup.stripped_strings)




class PlainTextParser(BaseTextParser):
class _PDFParser(_BaseTextParser):
    """Parser for PDF documents; text extraction is not implemented yet."""

    def parse(self):
        """Return the text extracted from the PDF.

        This is currently a stub and always raises NotImplementedError.
        """
        raise NotImplementedError


class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page.""" """A parser that can unicode-ify and strip text from a plain text page."""


def parse(self): def parse(self):
"""Unicode-ify and strip whitespace from the plain text document.""" """Unicode-ify and strip whitespace from the plain text document."""
return bs4.UnicodeDammit(self.text).unicode_markup.strip() return bs4.UnicodeDammit(self.text).unicode_markup.strip()


# Maps a bare media type (the part of a Content-Type header before any
# ";" parameters) to the parser class that handles it.
_CONTENT_TYPES = {
    # HTML and XHTML share one parser.
    "text/html": _HTMLParser,
    "application/xhtml+xml": _HTMLParser,
    # Both the registered and the legacy "x-" PDF types are recognized.
    "application/pdf": _PDFParser,
    "application/x-pdf": _PDFParser,
    "text/plain": _PlainTextParser,
}

def get_parser(content_type):
    """Return the parser most able to handle a given content type, or None.

    *content_type* is a raw ``Content-Type`` header value, for example
    ``"text/html; charset=utf-8"``. Only the media type before the first
    ``";"`` is used for the lookup in ``_CONTENT_TYPES``; parameters such as
    the charset are ignored. ``None`` is returned when no parser is
    registered for the media type.
    """
    # Media types are case-insensitive and may be padded with whitespace
    # around the ";" separator, so normalize before the lookup; otherwise
    # headers like "Text/HTML" or "text/html ; charset=utf-8" would miss.
    media_type = content_type.split(";", 1)[0].strip().lower()
    return _CONTENT_TYPES.get(media_type)

+ 5
- 2
earwigbot/wiki/copyvios/result.py View File

@@ -72,10 +72,13 @@ class CopyvioSource(object):
self._event2.clear() self._event2.clear()
self._event1.set() self._event1.set()


def finish_work(self, confidence, source_chain, delta_chain):
"""Complete the confidence information inside this source."""
def update(self, confidence, source_chain, delta_chain):
"""Fill out the confidence and chain information inside this source."""
self.confidence = confidence self.confidence = confidence
self.chains = (source_chain, delta_chain) self.chains = (source_chain, delta_chain)

def finish_work(self):
"""Mark this source as finished."""
self._event2.set() self._event2.set()


def skip(self): def skip(self):


+ 14
- 12
earwigbot/wiki/copyvios/workers.py View File

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError


from earwigbot import importer from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


tldextract = importer.new("tldextract") tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
if size > 1024 ** 2: # Ignore URLs larger than a megabyte if size > 1024 ** 2: # Ignore URLs larger than a megabyte
return None return None


ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(";", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]:
handler = HTMLTextParser
elif ctype == "text/plain":
handler = PlainTextParser
else:
content_type = response.headers.get("Content-Type", "text/plain")
handler = get_parser(content_type)
if not handler:
return None return None


try: try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
self._logger.debug("Exiting: got stop signal") self._logger.debug("Exiting: got stop signal")
return return
text = self._open_url(source) text = self._open_url(source)
source.workspace.compare(source, MarkovChain(text or ""))
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)


def start(self): def start(self):
"""Start the copyvio worker in a new thread.""" """Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):


def compare(self, source, source_chain): def compare(self, source, source_chain):
"""Compare a source to the article; call _finish_early if necessary.""" """Compare a source to the article; call _finish_early if necessary."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
if source_chain:
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
else:
conf = 0.0
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf)) self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
with self._finish_lock: with self._finish_lock:
source.finish_work(conf, source_chain, delta)
if source_chain:
source.update(conf, source_chain, delta)
source.finish_work()
if not self.finished and conf >= self._min_confidence: if not self.finished and conf >= self._min_confidence:
if self._short_circuit: if self._short_circuit:
self._finish_early() self._finish_early()


+ 1
- 0
setup.py View File

@@ -44,6 +44,7 @@ extra_deps = {
"lxml >= 2.3.5", # Faster parser for BeautifulSoup "lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content "nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search
"PyPDF2 >= 1.23", # Extracting text from PDF files
"tldextract >= 1.4", # Getting domains for the multithreaded workers "tldextract >= 1.4", # Getting domains for the multithreaded workers
], ],
"time": [ "time": [


Loading…
Cancel
Save