Преглед на файлове

Add no_links and no_searches to copyvio_check().

tags/v0.2
Ben Kurtovic преди 9 години
родител
ревизия
12247dd756
променени са 1 файла, в които са добавени 21 реда и са изтрити 12 реда
  1. +21
    -12
      earwigbot/wiki/copyvios/__init__.py

+ 21
- 12
earwigbot/wiki/copyvios/__init__.py Целия файл

@@ -78,7 +78,8 @@ class CopyvioMixIn(object):


raise exceptions.UnknownSearchEngineError(engine) raise exceptions.UnknownSearchEngineError(engine)


def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1):
def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
no_searches=False, no_links=False):
"""Check the page for copyright violations. """Check the page for copyright violations.


Returns a :class:`.CopyvioCheckResult` object with information on the Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -96,6 +97,12 @@ class CopyvioMixIn(object):
if checks are called through a web server with timeouts. We will stop if checks are called through a web server with timeouts. We will stop
checking new URLs as soon as this limit is reached. checking new URLs as soon as this limit is reached.


Setting *no_searches* to ``True`` will cause only URLs in the wikitext
of the page to be checked; no search engine queries will be made.
Setting *no_links* to ``True`` will cause the opposite to happen: URLs
in the wikitext will be ignored, and only search engine queries will be
made. Setting both of these to ``True`` is pointless, since neither the
page's links nor any search results would then be checked.

Raises :exc:`.CopyvioCheckError` or subclasses Raises :exc:`.CopyvioCheckError` or subclasses
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors. errors.
@@ -121,17 +128,19 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


workspace.enqueue(parser.get_links(), exclude)
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
num_queries = 0
for chunk in chunks:
if workspace.best.confidence >= min_confidence:
break
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk), exclude)
num_queries += 1
sleep(1)
if not no_links:
workspace.enqueue(parser.get_links(), exclude)
if not no_searches:
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
num_queries = 0
for chunk in chunks:
if workspace.best.confidence >= min_confidence:
break
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk), exclude)
num_queries += 1
sleep(1)


workspace.wait() workspace.wait()
result = CopyvioCheckResult( result = CopyvioCheckResult(


Зареждане…
Отказ
Запис