From 81a090c923db15ae76a11e6f2f8b759709a7cedc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 03:26:59 -0500 Subject: [PATCH] Allow content parsers to signal that a source should be excluded. --- earwigbot/exceptions.py | 21 ++++++++++++--------- earwigbot/wiki/copyvios/parsers.py | 1 + earwigbot/wiki/copyvios/workers.py | 13 ++++++++++--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py index 56bdfaa..ad34ae0 100644 --- a/earwigbot/exceptions.py +++ b/earwigbot/exceptions.py @@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot:: +-- UnknownSearchEngineError +-- UnsupportedSearchEngineError +-- SearchQueryError + +-- ParserExclusionError """ class EarwigBotError(Exception): @@ -231,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError): :py:attr:`config.wiki["search"]["engine"]`. Raised by :py:meth:`Page.copyvio_check - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and - :py:meth:`Page.copyvio_compare - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. + <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. """ class UnsupportedSearchEngineError(CopyvioCheckError): @@ -243,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError): couldn't be imported. Raised by :py:meth:`Page.copyvio_check - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and - :py:meth:`Page.copyvio_compare - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. + <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. """ class SearchQueryError(CopyvioCheckError): """Some error ocurred while doing a search query. Raised by :py:meth:`Page.copyvio_check - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and - :py:meth:`Page.copyvio_compare - <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. + <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. + """ + +class ParserExclusionError(CopyvioCheckError): + """A content parser detected that the given source should be excluded. + + Raised internally by :py:meth:`Page.copyvio_check + <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be + exposed in client code. 
""" diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index cafc746..dbd103e 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -27,6 +27,7 @@ from StringIO import StringIO import mwparserfromhell from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError bs4 = importer.new("bs4") nltk = importer.new("nltk") diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 5230a44..4ba25bf 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,6 +34,7 @@ from time import time from urllib2 import build_opener, URLError from earwigbot import importer +from earwigbot.exceptions import ParserExclusionError from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -218,9 +219,15 @@ class _CopyvioWorker(object): except StopIteration: self._logger.debug("Exiting: got stop signal") return - text = self._open_url(source) - chain = MarkovChain(text) if text else None - source.workspace.compare(source, chain) + + try: + text = self._open_url(source) + except ParserExclusionError: + source.skipped = source.excluded = True + source.finish_work() + else: + chain = MarkovChain(text) if text else None + source.workspace.compare(source, chain) def start(self): """Start the copyvio worker in a new thread."""