Browse Source

Fix a few bugs

legacy-python2
Ben Kurtovic 3 years ago
parent
commit
a49a82e263
3 changed files with 10 additions and 9 deletions
  1. +1
    -1
      earwigbot/wiki/copyvios/__init__.py
  2. +8
    -7
      earwigbot/wiki/copyvios/parsers.py
  3. +1
    -1
      earwigbot/wiki/copyvios/workers.py

+ 1
- 1
earwigbot/wiki/copyvios/__init__.py View File

@@ -117,7 +117,7 @@ class CopyvioMixIn(object):
log = u"Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title))
searcher = self._get_search_engine()
- parser = ArticleTextParser(self.get(), {
+ parser = ArticleTextParser(self.get(), args={
"nltk_dir": self._search_config["nltk_dir"],
"lang": self._site.lang
})


+ 8
- 7
earwigbot/wiki/copyvios/parsers.py View File

@@ -42,9 +42,9 @@ class _BaseTextParser(object):
"""Base class for a parser that handles text."""
TYPE = None

- def __init__(self, url, text, args=None):
- self.url = url
+ def __init__(self, text, url=None, args=None):
self.text = text
+ self.url = url
self._args = args or {}

def __repr__(self):
@@ -265,11 +265,12 @@ class _HTMLParser(_BaseTextParser):
self._fail_if_mirror(soup)
soup = soup.body

- url = urlparse.urlparse(self.url)
- if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
- playback = soup.find(id="playback")
- if playback and "src" in playback.attrs:
- raise ParserRedirectError(playback.attrs["src"])
+ if self.url:
+ url = urlparse.urlparse(self.url)
+ if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+ playback = soup.find(id="playback")
+ if playback and "src" in playback.attrs:
+ raise ParserRedirectError(playback.attrs["src"])

is_comment = lambda text: isinstance(text, bs4.element.Comment)
for comment in soup.find_all(text=is_comment):


+ 1
- 1
earwigbot/wiki/copyvios/workers.py View File

@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
except (IOError, struct_error):
return None

- parser = parser_class(content, source.parser_args)
+ parser = parser_class(content, url=url, args=source.parser_args)
try:
return parser.parse()
except ParserRedirectError as exc:


Loading…
Cancel
Save