浏览代码

Rudimentary solution for PDF parsing (closes earwig/copyvios#18)

tags/v0.2
Ben Kurtovic 10 年前
父节点
当前提交
0bdcbca8b0
共有 2 个文件被更改，包括 20 次插入和 4 次删除
  1. +19
    -3
      earwigbot/wiki/copyvios/parsers.py
  2. +1
    -1
      setup.py

+ 19
- 3
earwigbot/wiki/copyvios/parsers.py 查看文件

@@ -21,6 +21,7 @@
# SOFTWARE.

from os import path
import re
from StringIO import StringIO

import mwparserfromhell
@@ -29,7 +30,10 @@ from earwigbot import importer

bs4 = importer.new("bs4")
nltk = importer.new("nltk")
PyPDF2 = importer.new("PyPDF2")
converter = importer.new("pdfminer.converter")
pdfinterp = importer.new("pdfminer.pdfinterp")
pdfpage = importer.new("pdfminer.pdfpage")
pdftypes = importer.new("pdfminer.pdftypes")

__all__ = ["ArticleTextParser", "get_parser"]

@@ -88,7 +92,7 @@ class ArticleTextParser(_BaseTextParser):
remove(wikicode, tag)

clean = wikicode.strip_code(normalize=True, collapse=True)
self.clean = clean.replace("\n\n", "\n").strip()
self.clean = re.sub("\n\n+", "\n", clean).strip()
return self.clean

def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128):
@@ -189,7 +193,19 @@ class _PDFParser(_BaseTextParser):

def parse(self):
"""Return extracted text from the PDF."""
raise NotImplementedError()
output = StringIO()
manager = pdfinterp.PDFResourceManager()
conv = converter.TextConverter(manager, output)
interp = pdfinterp.PDFPageInterpreter(manager, conv)
try:
pages = pdfpage.PDFPage.get_pages(StringIO(self.text))
for page in pages:
interp.process_page(page)
except pdftypes.PDFException:
return output.getvalue().decode("utf8")
conv.close()
value = output.getvalue().decode("utf8")
return re.sub("\n\n+", "\n", value.replace("\x0c", "\n")).strip()


class _PlainTextParser(_BaseTextParser):


+ 1
- 1
setup.py 查看文件

@@ -44,7 +44,7 @@ extra_deps = {
"lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search
"PyPDF2 >= 1.23", # Extracting text from PDF files
"pdfminer >= 20140328", # Extracting text from PDF files
"tldextract >= 1.4", # Getting domains for the multithreaded workers
],
"time": [


正在加载...
取消
保存