Przeglądaj źródła

Fix parsing of plain text documents (earwig/copyvios#3)

tags/v0.2
Ben Kurtovic 10 lat temu
rodzic
commit
5349179088
3 zmienionych plików z 16 dodań i 6 usunięć
  1. +11
    -2
      earwigbot/wiki/copyvios/parsers.py
  2. +4
    -4
      earwigbot/wiki/copyvios/workers.py
  3. +1
    -0
      setup.py

+ 11
- 2
earwigbot/wiki/copyvios/parsers.py Wyświetl plik

@@ -29,7 +29,8 @@ from earwigbot import importer
bs4 = importer.new("bs4")
nltk = importer.new("nltk")

__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
"PlainTextParser"]

class BaseTextParser(object):
"""Base class for a parser that handles text."""
@@ -157,7 +158,7 @@ class HTMLTextParser(BaseTextParser):
"script", "style"
]

def strip(self):
def parse(self):
"""Return the actual text contained within an HTML document.

Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -180,3 +181,11 @@ class HTMLTextParser(BaseTextParser):
element.extract()

return "\n".join(soup.stripped_strings)


class PlainTextParser(BaseTextParser):
    """A parser that can unicode-ify and strip text from a plain text page."""

    def parse(self):
        """Unicode-ify and strip whitespace from the plain text document.

        The raw document is read from ``self.text`` and decoded with
        :py:class:`bs4.UnicodeDammit`. The decoded text is returned with
        leading and trailing whitespace removed. If the encoding cannot be
        determined, an empty string is returned instead.
        """
        # UnicodeDammit leaves ``unicode_markup`` as None when it cannot work
        # out the document's encoding; guard against that so we never call
        # .strip() on None and crash on an undecodable page.
        converted = bs4.UnicodeDammit(self.text).unicode_markup
        return converted.strip() if converted else u""

+ 4
- 4
earwigbot/wiki/copyvios/workers.py Wyświetl plik

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError

from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser
from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

tldextract = importer.new("tldextract")
@@ -139,9 +139,9 @@ class _CopyvioWorker(object):
ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(";", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]:
handler = lambda res: HTMLTextParser(res).strip()
handler = HTMLTextParser
elif ctype == "text/plain":
handler = lambda res: res.strip()
handler = PlainTextParser
else:
return None

@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None

return handler(content)
return handler(content).parse()

def _acquire_new_site(self):
"""Block for a new unassigned site queue."""


+ 1
- 0
setup.py Wyświetl plik

@@ -40,6 +40,7 @@ extra_deps = {
],
"copyvios": [
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML
"cchardet >= 0.3.5", # Encoding detection for BeautifulSoup
"lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search


Ładowanie…
Anuluj
Zapisz