From 7afb484cead528bde39daa82b02ef6716a7eaedb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 Sep 2014 13:27:08 -0500 Subject: [PATCH] Refactor a bunch of copyvio internals. Store all sources with a result object. --- earwigbot/wiki/copyvios/__init__.py | 25 +++-------- earwigbot/wiki/copyvios/exclusions.py | 9 ++-- earwigbot/wiki/copyvios/parsers.py | 1 - earwigbot/wiki/copyvios/result.py | 78 ++++++++++++++++++++++++----------- earwigbot/wiki/copyvios/workers.py | 62 ++++++++++++++-------------- 5 files changed, 97 insertions(+), 78 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index c6ebfc6..4a7ab1a 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -26,7 +26,6 @@ from urllib2 import build_opener from earwigbot import exceptions, importer from earwigbot.wiki.copyvios.markov import MarkovChain from earwigbot.wiki.copyvios.parsers import ArticleTextParser -from earwigbot.wiki.copyvios.result import CopyvioCheckResult from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine from earwigbot.wiki.copyvios.workers import ( globalize, localize, CopyvioWorkspace) @@ -109,12 +108,10 @@ class CopyvioMixIn(object): """ log = u"Starting copyvio check for [[{0}]]" self._logger.info(log.format(self.title)) - start_time = time() - until = (start_time + max_time) if max_time > 0 else None searcher = self._get_search_engine() parser = ArticleTextParser(self.get()) article = MarkovChain(parser.strip()) - workspace = CopyvioWorkspace(article, min_confidence, until, + workspace = CopyvioWorkspace(article, min_confidence, max_time, self._logger, self._addheaders) if self._exclusions_db: self._exclusions_db.sync(self.site.name) @@ -123,8 +120,7 @@ class CopyvioMixIn(object): exclude = None if article.size < 20: # Auto-fail very small articles - result = CopyvioCheckResult(False, 0.0, None, 0, 0, article, - workspace.best.chains) + result = workspace.get_result() 
self._logger.info(result.get_log_message(self.title)) return result @@ -134,19 +130,15 @@ class CopyvioMixIn(object): if not no_searches: chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) for chunk in chunks: - if workspace.best.confidence >= min_confidence: + if workspace.finished: break log = u"[[{0}]] -> querying {1} for {2!r}" self._logger.debug(log.format(self.title, searcher.name, chunk)) workspace.enqueue(searcher.search(chunk), exclude) num_queries += 1 sleep(1) - workspace.wait() - result = CopyvioCheckResult( - workspace.best.confidence >= min_confidence, - workspace.best.confidence, workspace.best.url, num_queries, - time() - start_time, article, workspace.best.chains) + result = workspace.get_result(num_queries) self._logger.info(result.get_log_message(self.title)) return result @@ -173,17 +165,12 @@ class CopyvioMixIn(object): """ log = u"Starting copyvio compare for [[{0}]] against {1}" self._logger.info(log.format(self.title, url)) - start_time = time() - until = (start_time + max_time) if max_time > 0 else None article = MarkovChain(ArticleTextParser(self.get()).strip()) workspace = CopyvioWorkspace( - article, min_confidence, until, self._logger, self._addheaders, + article, min_confidence, max_time, self._logger, self._addheaders, max_time, 1) workspace.enqueue([url]) workspace.wait() - best = workspace.best - result = CopyvioCheckResult( - best.confidence >= min_confidence, best.confidence, best.url, 0, - time() - start_time, article, best.chains) + result = workspace.get_result() self._logger.info(result.get_log_message(self.title)) return result diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 68c1d14..4205f86 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -30,7 +30,7 @@ from earwigbot import exceptions __all__ = ["ExclusionsDB"] -default_sources = { +DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki 
"User:EarwigBot/Copyvios/Exclusions" ], @@ -74,7 +74,7 @@ class ExclusionsDB(object): """ query = "INSERT INTO sources VALUES (?, ?);" sources = [] - for sitename, pages in default_sources.iteritems(): + for sitename, pages in DEFAULT_SOURCES.iteritems(): for page in pages: sources.append((sitename, page)) @@ -95,8 +95,9 @@ class ExclusionsDB(object): r"\*\s*Site:\s*(?:\[|\)?(?:https?:)?(?://)?(.*?)(?:\].*?|\.*?)?\s*$" ] for regex in regexes: - find = re.findall(regex, data, re.I|re.M) - [urls.add(url.lower().strip()) for url in find if url.strip()] + for url in re.findall(regex, data, re.I|re.M): + if url.strip(): + urls.add(url.lower().strip()) return urls def _update(self, sitename): diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index df61e43..750e917 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,7 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import errno from os import path import mwparserfromhell diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index 7beef73..83eaafd 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -28,7 +28,12 @@ from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION __all__ = ["CopyvioSource", "CopyvioCheckResult"] class CopyvioSource(object): - """Represents a single suspected violation source (a URL).""" + """ + **EarwigBot: Wiki Toolset: Copyvio Source** + + A class that represents a single possible source of a copyright violation, + i.e., a URL. 
+ """ def __init__(self, workspace, url, key, headers=None, timeout=5): self.workspace = workspace @@ -38,14 +43,23 @@ class CopyvioSource(object): self.timeout = timeout self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) + self.skipped = False self._event1 = Event() self._event2 = Event() self._event2.set() - def touched(self): - """Return whether one of start_work() and cancel() have been called.""" - return self._event1.is_set() + def __repr__(self): + """Return the canonical string representation of the source.""" + res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})" + return res.format(self.url, self.confidence, self.skipped) + + def __str__(self): + """Return a nice string representation of the source.""" + if self.skipped: + return "<CopyvioSource ({0}, skipped)>".format(self.url) + res = "<CopyvioSource ({0} with {1} conf)>" + return res.format(self.url, self.confidence) def start_work(self): """Mark this source as being worked on right now.""" @@ -58,8 +72,11 @@ class CopyvioSource(object): self.chains = (source_chain, delta_chain) self._event2.set() - def cancel(self): + def skip(self): """Deactivate this source without filling in the relevant data.""" + if self._event1.is_set(): + return + self.skipped = True self._event1.set() def join(self, until): @@ -70,6 +87,8 @@ class CopyvioSource(object): if timeout <= 0: return event.wait(timeout) + else: + event.wait() class CopyvioCheckResult(object): @@ -81,40 +100,51 @@ class CopyvioCheckResult(object): *Attributes:* - :py:attr:`violation`: ``True`` if this is a violation, else ``False`` - - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy - - :py:attr:`url`: the URL of the violated page + - :py:attr:`sources`: a list of CopyvioSources, sorted by confidence - :py:attr:`queries`: the number of queries used to reach a result - :py:attr:`time`: the amount of time the check took to complete - :py:attr:`article_chain`: the MarkovChain of the article text - - :py:attr:`source_chain`: the MarkovChain of the violated page text - - 
:py:attr:`delta_chain`: the MarkovChainIntersection comparing the two """ - def __init__(self, violation, confidence, url, queries, time, article, - chains): + def __init__(self, violation, sources, queries, check_time, article_chain): self.violation = violation - self.confidence = confidence - self.url = url + self.sources = sources self.queries = queries - self.time = time - self.article_chain = article - self.source_chain = chains[0] - self.delta_chain = chains[1] + self.time = check_time + self.article_chain = article_chain def __repr__(self): """Return the canonical string representation of the result.""" - res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})" - return res.format(self.violation, self.confidence, self.url, - self.queries) + res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})" + return res.format(self.violation, self.sources, self.queries, + self.time) def __str__(self): """Return a nice string representation of the result.""" - res = "<CopyvioCheckResult ({0} with {1} conf)>" - return res.format(self.violation, self.confidence) + res = "<CopyvioCheckResult ({0} with best {1})>" + return res.format(self.violation, self.best) + + @property + def best(self): + """The best known source, or None if no sources exist.""" + return self.sources[0] if self.sources else None + + @property + def confidence(self): + """The confidence of the best source, or 0 if no sources exist.""" + return self.best.confidence if self.best else 0.0 + + @property + def url(self): + """The url of the best source, or None if no sources exist.""" + return self.best.url if self.best else None def get_log_message(self, title): """Build a relevant log message for this copyvio check result.""" - log = u"{0} for [[{1}]] (confidence: {2}; URL: {3}; {4} queries; {5} seconds)" + if not self.sources: + log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)" + return log.format(title, self.queries, self.time) + log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} queries; {5} 
seconds)" is_vio = "Violation detected" if self.violation else "No violation" - return log.format(is_vio, title, self.confidence, self.url, + return log.format(is_vio, title, self.url, self.confidence, self.queries, self.time) diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index eaaec16..157e3ec 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError from earwigbot import importer from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import HTMLTextParser -from earwigbot.wiki.copyvios.result import CopyvioSource +from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource tldextract = importer.new("tldextract") @@ -120,7 +120,8 @@ class _CopyvioWorker(object): If a URLError was raised while opening the URL or an IOError was raised while decompressing, None will be returned. """ - self._opener.addheaders = source.headers + if source.headers: + self._opener.addheaders = source.headers url = source.url.encode("utf8") try: response = self._opener.open(url, timeout=source.timeout) @@ -194,8 +195,8 @@ class _CopyvioWorker(object): return self._dequeue() self._logger.debug(u"Got source URL: {0}".format(source.url)) - if source.touched(): - self._logger.debug("Source has been cancelled") + if source.skipped: + self._logger.debug("Source has been skipped") self._queues.lock.release() return self._dequeue() @@ -232,18 +233,18 @@ class _CopyvioWorker(object): class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" - def __init__(self, article, min_confidence, until, logger, headers, + def __init__(self, article, min_confidence, max_time, logger, headers, url_timeout=5, num_workers=8): - self.best = CopyvioSource(self, None, None) self.sources = [] + self.finished = False self._article = article self._logger = 
logger.getChild("copyvios") self._min_confidence = min_confidence - self._until = until + self._start_time = time() + self._until = (self._start_time + max_time) if max_time > 0 else None self._handled_urls = [] - self._is_finished = False - self._compare_lock = Lock() + self._finish_lock = Lock() self._source_args = {"workspace": self, "headers": headers, "timeout": url_timeout} @@ -254,7 +255,7 @@ class CopyvioWorkspace(object): self._num_workers = num_workers for i in xrange(num_workers): name = "local-{0:04}.{1}".format(id(self) % 10000, i) - _CopyvioWorker(name, self._queues, until).start() + _CopyvioWorker(name, self._queues, self._until).start() def _calculate_confidence(self, delta): """Return the confidence of a violation as a float between 0 and 1.""" @@ -294,13 +295,11 @@ class CopyvioWorkspace(object): def _finish_early(self): """Finish handling links prematurely (if we've hit min_confidence).""" - if self._is_finished: - return - self._logger.debug("Confidence threshold met; cancelling remaining sources") + self._logger.debug("Confidence threshold met; skipping remaining sources") with self._queues.lock: for source in self.sources: - source.cancel() - self._is_finished = True + source.skip() + self.finished = True def enqueue(self, urls, exclude_check=None): """Put a list of URLs into the various worker queues. 
@@ -310,9 +309,9 @@ class CopyvioWorkspace(object): """ for url in urls: with self._queues.lock: - if self._is_finished: + if self.finished: break - if not url or url in self._handled_urls: + if url in self._handled_urls: continue self._handled_urls.append(url) if exclude_check and exclude_check(url): @@ -336,26 +335,29 @@ class CopyvioWorkspace(object): queue.append(source) self._queues.unassigned.put((key, queue)) + def compare(self, source, source_chain): + """Compare a source to the article; call _finish_early if necessary.""" + delta = MarkovChainIntersection(self._article, source_chain) + conf = self._calculate_confidence(delta) + self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf)) + with self._finish_lock: + source.finish_work(conf, source_chain, delta) + if not self.finished and conf >= self._min_confidence: + self._finish_early() + def wait(self): """Wait for the workers to finish handling the sources.""" self._logger.debug("Waiting on {0} sources".format(len(self.sources))) for source in self.sources: source.join(self._until) - with self._compare_lock: + with self._finish_lock: pass # Wait for any remaining comparisons to be finished if not _is_globalized: for i in xrange(self._num_workers): self._queues.unassigned.put((StopIteration, None)) - def compare(self, source, source_chain): - """Compare a source to the article, and update the best known one.""" - delta = MarkovChainIntersection(self._article, source_chain) - conf = self._calculate_confidence(delta) - self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf)) - - with self._compare_lock: - source.finish_work(conf, source_chain, delta) - if conf > self.best.confidence: - self.best = source - if conf >= self._min_confidence: - self._finish_early() + def get_result(self, num_queries=0): + """Return a CopyvioCheckResult containing the results of this check.""" + self.sources.sort(key=lambda source: source.confidence, reverse=True) + return 
CopyvioCheckResult(self.finished, self.sources, num_queries, + time() - self._start_time, self._article)