Explorar el Código

Add an option to disable short-circuiting.

tags/v0.2
Ben Kurtovic hace 10 años
padre
commit
303c39c8c7
Se han modificado 2 ficheros con 17 adiciones y 7 borrados
  1. earwigbot/wiki/copyvios/__init__.py (+10, -4)
  2. earwigbot/wiki/copyvios/workers.py (+7, -3)

earwigbot/wiki/copyvios/__init__.py (+10, -4) · Ver fichero

@@ -78,7 +78,7 @@ class CopyvioMixIn(object):
raise exceptions.UnknownSearchEngineError(engine)

def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
no_searches=False, no_links=False):
no_searches=False, no_links=False, short_circuit=True):
"""Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -102,6 +102,11 @@ class CopyvioMixIn(object):
in the wikitext will be ignored; search engine queries will be made
only. Setting both of these to ``True`` is pointless.

Normally, the checker will short-circuit if it finds a URL that meets
*min_confidence*. This behavior normally causes it to skip any
remaining URLs and web queries, but setting *short_circuit* to
``False`` will prevent this.

Raises :exc:`.CopyvioCheckError` or subclasses
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors.
@@ -111,8 +116,9 @@ class CopyvioMixIn(object):
searcher = self._get_search_engine()
parser = ArticleTextParser(self.get())
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(article, min_confidence, max_time,
self._logger, self._addheaders)
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit)
if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u)
@@ -130,7 +136,7 @@ class CopyvioMixIn(object):
if not no_searches:
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
for chunk in chunks:
if workspace.finished:
if short_circuit and workspace.finished:
break
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))


earwigbot/wiki/copyvios/workers.py (+7, -3) · Ver fichero

@@ -234,7 +234,7 @@ class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads."""

def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8):
url_timeout=5, num_workers=8, short_circuit=True):
self.sources = []
self.finished = False

@@ -245,6 +245,7 @@ class CopyvioWorkspace(object):
self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = []
self._finish_lock = Lock()
self._short_circuit = short_circuit
self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout}

@@ -309,7 +310,7 @@ class CopyvioWorkspace(object):
"""
for url in urls:
with self._queues.lock:
if self.finished:
if self._short_circuit and self.finished:
break
if url in self._handled_urls:
continue
@@ -343,7 +344,10 @@ class CopyvioWorkspace(object):
with self._finish_lock:
source.finish_work(conf, source_chain, delta)
if not self.finished and conf >= self._min_confidence:
self._finish_early()
if self._short_circuit:
self._finish_early()
else:
self.finished = True

def wait(self):
"""Wait for the workers to finish handling the sources."""


Cargando…
Cancelar
Guardar