Browse Source

Fix parsing of plain text documents (earwig/copyvios#3)

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
5349179088
3 changed files with 16 additions and 6 deletions
  1. +11
    -2
      earwigbot/wiki/copyvios/parsers.py
  2. +4
    -4
      earwigbot/wiki/copyvios/workers.py
  3. +1
    -0
      setup.py

+ 11
- 2
earwigbot/wiki/copyvios/parsers.py View File

@@ -29,7 +29,8 @@ from earwigbot import importer
bs4 = importer.new("bs4") bs4 = importer.new("bs4")
nltk = importer.new("nltk") nltk = importer.new("nltk")


__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
"PlainTextParser"]


class BaseTextParser(object): class BaseTextParser(object):
"""Base class for a parser that handles text.""" """Base class for a parser that handles text."""
@@ -157,7 +158,7 @@ class HTMLTextParser(BaseTextParser):
"script", "style" "script", "style"
] ]


def strip(self):
def parse(self):
"""Return the actual text contained within an HTML document. """Return the actual text contained within an HTML document.


Implemented using :py:mod:`BeautifulSoup <bs4>` Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -180,3 +181,11 @@ class HTMLTextParser(BaseTextParser):
element.extract() element.extract()


return "\n".join(soup.stripped_strings) return "\n".join(soup.stripped_strings)


class PlainTextParser(BaseTextParser):
    """Parser that converts a plain text page to stripped unicode text."""

    def parse(self):
        """Return the plain text document as unicode, minus outer whitespace.

        ``self.text`` is handed to :py:class:`bs4.UnicodeDammit`, and the
        resulting ``unicode_markup`` has leading and trailing whitespace
        stripped before it is returned.
        """
        converted = bs4.UnicodeDammit(self.text)
        return converted.unicode_markup.strip()

+ 4
- 4
earwigbot/wiki/copyvios/workers.py View File

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError


from earwigbot import importer from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser
from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


tldextract = importer.new("tldextract") tldextract = importer.new("tldextract")
@@ -139,9 +139,9 @@ class _CopyvioWorker(object):
ctype_full = response.headers.get("Content-Type", "text/plain") ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(";", 1)[0] ctype = ctype_full.split(";", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]: if ctype in ["text/html", "application/xhtml+xml"]:
handler = lambda res: HTMLTextParser(res).strip()
handler = HTMLTextParser
elif ctype == "text/plain": elif ctype == "text/plain":
handler = lambda res: res.strip()
handler = PlainTextParser
else: else:
return None return None


@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
except (IOError, struct_error): except (IOError, struct_error):
return None return None


return handler(content)
return handler(content).parse()


def _acquire_new_site(self): def _acquire_new_site(self):
"""Block for a new unassigned site queue.""" """Block for a new unassigned site queue."""


+ 1
- 0
setup.py View File

@@ -40,6 +40,7 @@ extra_deps = {
], ],
"copyvios": [ "copyvios": [
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML "beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML
"cchardet >= 0.3.5", # Encoding detection for BeautifulSoup
"lxml >= 2.3.5", # Faster parser for BeautifulSoup "lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content "nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search


Loading…
Cancel
Save