From 0bdcbca8b03bb405b9131bcf3bcb7d0a88c9804c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 20 Sep 2014 04:50:27 -0500 Subject: [PATCH] Rudimentary solution for PDF parsing (closes earwig/copyvios#18) --- earwigbot/wiki/copyvios/parsers.py | 22 +++++++++++++++++++--- setup.py | 2 +- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 594caeb..caacdcd 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -21,6 +21,7 @@ # SOFTWARE. from os import path +import re from StringIO import StringIO import mwparserfromhell @@ -29,7 +30,10 @@ from earwigbot import importer bs4 = importer.new("bs4") nltk = importer.new("nltk") -PyPDF2 = importer.new("PyPDF2") +converter = importer.new("pdfminer.converter") +pdfinterp = importer.new("pdfminer.pdfinterp") +pdfpage = importer.new("pdfminer.pdfpage") +pdftypes = importer.new("pdfminer.pdftypes") __all__ = ["ArticleTextParser", "get_parser"] @@ -88,7 +92,7 @@ class ArticleTextParser(_BaseTextParser): remove(wikicode, tag) clean = wikicode.strip_code(normalize=True, collapse=True) - self.clean = clean.replace("\n\n", "\n").strip() + self.clean = re.sub("\n\n+", "\n", clean).strip() return self.clean def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128): @@ -189,7 +193,19 @@ class _PDFParser(_BaseTextParser): def parse(self): """Return extracted text from the PDF.""" - raise NotImplementedError() + output = StringIO() + manager = pdfinterp.PDFResourceManager() + conv = converter.TextConverter(manager, output) + interp = pdfinterp.PDFPageInterpreter(manager, conv) + try: + pages = pdfpage.PDFPage.get_pages(StringIO(self.text)) + for page in pages: + interp.process_page(page) + except pdftypes.PDFException: + return output.getvalue().decode("utf8") + conv.close() + value = output.getvalue().decode("utf8") + return re.sub("\n\n+", "\n", value.replace("\x0c", "\n")).strip() class _PlainTextParser(_BaseTextParser): diff --git a/setup.py b/setup.py index e881651..8ac6b8f 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ extra_deps = { "lxml >= 2.3.5", # Faster parser for BeautifulSoup "nltk >= 2.0.2", # Parsing sentences to split article content "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search - "PyPDF2 >= 1.23", # Extracting text from PDF files + "pdfminer >= 20140328", # Extracting text from PDF files "tldextract >= 1.4", # Getting domains for the multithreaded workers ], "time": [