浏览代码

Refactor parsers; fix empty document behavior.

tags/v0.2
Ben Kurtovic 10 年前
父节点
当前提交
30f72df470
共有 4 个文件被更改,包括 48 次插入20 次删除
  1. +28
    -6
      earwigbot/wiki/copyvios/parsers.py
  2. +5
    -2
      earwigbot/wiki/copyvios/result.py
  3. +14
    -12
      earwigbot/wiki/copyvios/workers.py
  4. +1
    -0
      setup.py

+ 28
- 6
earwigbot/wiki/copyvios/parsers.py 查看文件

@@ -21,6 +21,7 @@
# SOFTWARE.

from os import path
from StringIO import StringIO

import mwparserfromhell

@@ -28,11 +29,11 @@ from earwigbot import importer

bs4 = importer.new("bs4")
nltk = importer.new("nltk")
PyPDF2 = importer.new("PyPDF2")

__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
"PlainTextParser"]
__all__ = ["ArticleTextParser", "get_parser"]

class BaseTextParser(object):
class _BaseTextParser(object):
"""Base class for a parser that handles text."""

def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
return "<{0} of text with size {1}>".format(name, len(self.text))


class ArticleTextParser(BaseTextParser):
class ArticleTextParser(_BaseTextParser):
"""A parser that can strip and chunk wikicode article text."""

def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
if link.url.startswith(schemes)]


class HTMLTextParser(BaseTextParser):
class _HTMLParser(_BaseTextParser):
"""A parser that can extract the text from an HTML document."""
hidden_tags = [
"script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
return "\n".join(soup.stripped_strings)


class PlainTextParser(BaseTextParser):
class _PDFParser(_BaseTextParser):
    """Parser intended for PDF documents.

    Text extraction is not implemented yet; this class only reserves the
    parser slot for PDF content.
    """

    def parse(self):
        """Return the text extracted from the PDF.

        Raises NotImplementedError unconditionally, because extraction has
        not been written.
        """
        raise NotImplementedError


class _PlainTextParser(_BaseTextParser):
    """Parser for plain text pages: converts to unicode and trims the text."""

    def parse(self):
        """Return the plain text document as stripped unicode.

        ``self.text`` is passed through ``bs4.UnicodeDammit``, and the
        resulting ``unicode_markup`` has its leading and trailing whitespace
        removed.
        """
        converted = bs4.UnicodeDammit(self.text)
        markup = converted.unicode_markup
        return markup.strip()


# Maps a bare media type (parameters such as charset removed) to the parser
# class that handles it.
_CONTENT_TYPES = {
    # HTML and XHTML share a single parser.
    "text/html": _HTMLParser,
    "application/xhtml+xml": _HTMLParser,
    # Both PDF media types go to the PDF parser.
    "application/pdf": _PDFParser,
    "application/x-pdf": _PDFParser,
    "text/plain": _PlainTextParser,
}

def get_parser(content_type):
    """Return the parser most able to handle a given content type, or None.

    *content_type* is a raw ``Content-Type`` header value. Everything after
    the first ``;`` (parameters such as ``charset``) is ignored. The
    remaining media type is stripped of surrounding whitespace and
    lowercased before the lookup in ``_CONTENT_TYPES``, because media types
    are case-insensitive and whitespace may precede the ``;``.

    Returns the matching parser class, or None if the header is empty or
    missing or the media type has no registered parser.
    """
    if not content_type:
        return None
    media_type = content_type.split(";", 1)[0].strip().lower()
    return _CONTENT_TYPES.get(media_type)

+ 5
- 2
earwigbot/wiki/copyvios/result.py 查看文件

@@ -72,10 +72,13 @@ class CopyvioSource(object):
self._event2.clear()
self._event1.set()

def finish_work(self, confidence, source_chain, delta_chain):
"""Complete the confidence information inside this source."""
def update(self, confidence, source_chain, delta_chain):
    """Record the confidence and chains on this source.

    Stores *confidence* as ``self.confidence`` and the pair
    ``(source_chain, delta_chain)`` as ``self.chains``.
    """
    # Tuple assignment binds left to right: confidence first, then chains.
    self.confidence, self.chains = confidence, (source_chain, delta_chain)

def finish_work(self):
    """Mark this source as finished by setting ``self._event2``."""
    done_event = self._event2
    done_event.set()

def skip(self):


+ 14
- 12
earwigbot/wiki/copyvios/workers.py 查看文件

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError

from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
if size > 1024 ** 2: # Ignore URLs larger than a megabyte
return None

ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(";", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]:
handler = HTMLTextParser
elif ctype == "text/plain":
handler = PlainTextParser
else:
content_type = response.headers.get("Content-Type", "text/plain")
handler = get_parser(content_type)
if not handler:
return None

try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
self._logger.debug("Exiting: got stop signal")
return
text = self._open_url(source)
source.workspace.compare(source, MarkovChain(text or ""))
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)

def start(self):
"""Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):

def compare(self, source, source_chain):
"""Compare a source to the article; call _finish_early if necessary."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
if source_chain:
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
else:
conf = 0.0
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
with self._finish_lock:
source.finish_work(conf, source_chain, delta)
if source_chain:
source.update(conf, source_chain, delta)
source.finish_work()
if not self.finished and conf >= self._min_confidence:
if self._short_circuit:
self._finish_early()


+ 1
- 0
setup.py 查看文件

@@ -44,6 +44,7 @@ extra_deps = {
"lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search
"PyPDF2 >= 1.23", # Extracting text from PDF files
"tldextract >= 1.4", # Getting domains for the multithreaded workers
],
"time": [


正在加载...
取消
保存