diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index a93cc6e..059a90b 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -117,7 +117,7 @@ class CopyvioMixIn(object):
         log = u"Starting copyvio check for [[{0}]]"
         self._logger.info(log.format(self.title))
         searcher = self._get_search_engine()
-        parser = ArticleTextParser(self.get(), {
+        parser = ArticleTextParser(self.get(), args={
             "nltk_dir": self._search_config["nltk_dir"],
             "lang": self._site.lang
         })
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index ed94882..1b2dfae 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -42,9 +42,9 @@ class _BaseTextParser(object):
     """Base class for a parser that handles text."""
     TYPE = None
 
-    def __init__(self, url, text, args=None):
-        self.url = url
+    def __init__(self, text, url=None, args=None):
         self.text = text
+        self.url = url
         self._args = args or {}
 
     def __repr__(self):
@@ -265,11 +265,12 @@ class _HTMLParser(_BaseTextParser):
         self._fail_if_mirror(soup)
         soup = soup.body
 
-        url = urlparse.urlparse(self.url)
-        if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
-            playback = soup.find(id="playback")
-            if playback and "src" in playback.attrs:
-                raise ParserRedirectError(playback.attrs["src"])
+        if self.url:
+            url = urlparse.urlparse(self.url)
+            if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+                playback = soup.find(id="playback")
+                if playback and "src" in playback.attrs:
+                    raise ParserRedirectError(playback.attrs["src"])
 
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index e16baf9..af39a6a 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        parser = parser_class(content, source.parser_args)
+        parser = parser_class(content, url=url, args=source.parser_args)
         try:
             return parser.parse()
         except ParserRedirectError as exc: