Browse Source

Add another PDF string substitution.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
77514ee925
1 changed file with 11 additions and 2 deletions
  1. earwigbot/wiki/copyvios/parsers.py (+11, -2)

+ 11
- 2
earwigbot/wiki/copyvios/parsers.py View File

@@ -190,6 +190,10 @@ class _HTMLParser(_BaseTextParser):


class _PDFParser(_BaseTextParser): class _PDFParser(_BaseTextParser):
"""A parser that can extract text from a PDF file.""" """A parser that can extract text from a PDF file."""
substitutions = [
(u"\x0c", u"\n"),
(u"\u2022", u" "),
]


def parse(self): def parse(self):
"""Return extracted text from the PDF.""" """Return extracted text from the PDF."""
@@ -197,15 +201,20 @@ class _PDFParser(_BaseTextParser):
manager = pdfinterp.PDFResourceManager() manager = pdfinterp.PDFResourceManager()
conv = converter.TextConverter(manager, output) conv = converter.TextConverter(manager, output)
interp = pdfinterp.PDFPageInterpreter(manager, conv) interp = pdfinterp.PDFPageInterpreter(manager, conv)

try: try:
pages = pdfpage.PDFPage.get_pages(StringIO(self.text)) pages = pdfpage.PDFPage.get_pages(StringIO(self.text))
for page in pages: for page in pages:
interp.process_page(page) interp.process_page(page)
except pdftypes.PDFException: except pdftypes.PDFException:
return output.getvalue().decode("utf8") return output.getvalue().decode("utf8")
conv.close()
finally:
conv.close()

value = output.getvalue().decode("utf8") value = output.getvalue().decode("utf8")
return re.sub("\n\n+", "\n", value.replace("\x0c", "\n")).strip()
for orig, new in self.substitutions:
value = value.replace(orig, new)
return re.sub("\n\n+", "\n", value).strip()




class _PlainTextParser(_BaseTextParser): class _PlainTextParser(_BaseTextParser):


Loading…
Cancel
Save