
Update copyvio checker to not make concurrent requests to a single domain.

tags/v0.2
Ben Kurtovic 9 years ago
parent
commit
7137dda920
2 changed files with 71 additions and 62 deletions
  1. earwigbot/wiki/copyvios/__init__.py  +70 -62
  2. setup.py  +1 -0
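
In outline, this change replaces the single shared URL queue with one queue per domain: each domain gets its own worker thread that fetches its URLs one at a time, while a shared Semaphore caps how many requests are in flight overall. Below is a minimal sketch of that scheme, not the commit's actual code: fetch_url(), domain_worker(), and MAX_CONCURRENT_REQUESTS are illustrative names, the naive hostname keying mirrors the diff's fallback path when tldextract is unavailable, and enqueue() is assumed to be called from a single producer thread (Python 2, as in the surrounding code).

# Minimal sketch of per-domain serial workers plus a global request cap.
from Queue import Queue
from threading import Semaphore, Thread
from urlparse import urlparse

MAX_CONCURRENT_REQUESTS = 6
request_semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)
workers = {}  # domain key -> that domain's Queue

def fetch_url(url):
    print "fetching", url  # stand-in for the real opener/parser logic

def domain_worker(queue):
    while True:
        url = queue.get()
        if url is None:  # exit signal, as in the diff's wait()
            return
        with request_semaphore:  # global cap on simultaneous requests
            fetch_url(url)

def enqueue(url):
    # Naive keying by the last two labels of the host, mirroring the diff's
    # fallback when tldextract is not installed.
    key = u".".join(urlparse(url).netloc.split(".")[-2:])
    if key not in workers:
        queue = Queue()
        thread = Thread(target=domain_worker, args=(queue,))
        thread.daemon = True
        thread.start()
        workers[key] = queue
    # One worker per domain means that domain's requests are serialized.
    workers[key].put(url)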

+70 -62   earwigbot/wiki/copyvios/__init__.py

@@ -25,7 +25,7 @@ from gzip import GzipFile
from Queue import Empty, Queue
from socket import timeout
from StringIO import StringIO
from threading import Lock, Thread
from threading import Lock, Semaphore, Thread
from time import sleep, time
from urllib2 import build_opener, URLError

@@ -37,6 +37,7 @@ from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine

oauth = importer.new("oauth2")
tldextract = importer.new("tldextract")

__all__ = ["CopyvioMixIn"]

@@ -46,47 +47,34 @@ class _CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads."""

def __init__(self, article, min_confidence, until, logger, headers,
url_timeout=5):
url_timeout=5, max_concurrent_requests=6):
self.best = _WorkingResult(None, 0.0, (EMPTY, EMPTY_INTERSECTION))
self.until = until
self.request_semaphore = Semaphore(max_concurrent_requests)

self._article = article
self._handled_urls = []
self._headers = headers
self._logger = logger.getChild("copyvios")
self._min_confidence = min_confidence
self._queue = Queue()
self._handled_urls = []
self._is_finished = False
self._enqueue_lock = Lock()
self._result_lock = Lock()
self._url_timeout = url_timeout
self._workers = []

self._workers = {}
self._worker_args = (self, until, headers, url_timeout)

def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1."""
return float(delta.size()) / self._article.size()

def _finish_early(self):
"""Finish handling links prematurely, e.g. if we've met min confidence.

This works by inserting an additional ``None`` into the queue (the
"exit" signal for workers) and then dequeueing until we reach the first
``None``. This way, every worker will dequeue an exit signal on
their next turn.
"""
self._queue.put(None)
try:
while self._queue.get(block=False):
pass
except Empty: # Might happen if we haven't called wait() yet, but NBD.
pass

def spawn(self, workers):
"""Spawn *workers* number of worker threads."""
for i in xrange(workers):
worker = _CopyvioWorker(self, self._headers, self._url_timeout)
thread = Thread(target=worker.run)
thread.daemon = True
thread.start()
self._workers.append(thread)
"""Finish handling links prematurely (if we've hit min_confidence)."""
self._logger.debug("Confidence threshold met; clearing worker queues")
with self._enqueue_lock:
for worker in self._workers.itervalues():
with worker.queue.mutex:
worker.queue.queue.clear()
worker.queue.put(None)
self._is_finished = True

def enqueue(self, urls, exclude_check=None):
"""Put a list of URLs into the worker queue.
@@ -95,29 +83,48 @@ class _CopyvioWorkspace(object):
returns ``True`` if we should skip it and ``False`` otherwise.
"""
for url in urls:
if url in self._handled_urls:
continue
self._handled_urls.append(url)
if exclude_check and exclude_check(url):
continue
self._queue.put(url)
with self._enqueue_lock:
if self._is_finished:
break
if url in self._handled_urls:
continue
self._handled_urls.append(url)
if exclude_check and exclude_check(url):
continue

def dequeue(self, max_time=None):
"""Get an element from the worker queue, with an optional timeout."""
return self._queue.get(timeout=max_time)
try:
key = tldextract.extract(url).registered_domain
except ImportError: # Fall back on very naive method
from urlparse import urlparse
key = u".".join(urlparse(url).netloc.split(".")[-2:])

logmsg = "enqueue(): {0} {1} -> {2}"
if key in self._workers:
self._logger.debug(logmsg.format("PUT", key, url))
self._workers[key].queue.put(url)
else:
self._logger.debug(logmsg.format("NEW", key, url))
worker = _CopyvioWorker(*self._worker_args)
worker.queue.put(url)
thread = Thread(target=worker.run)
thread.name = "cvworker-" + key.encode("utf8")
thread.daemon = True
thread.start()
self._workers[key] = worker

def wait(self):
"""Wait for the workers to finish handling the queue."""
for i in xrange(len(self._workers)):
self._queue.put(None) # Exit signal to workers
for worker in self._workers:
self._logger.debug("Waiting on {0} workers".format(len(self._workers)))
for worker in self._workers.itervalues():
worker.queue.put(None) # Exit signal to workers
for worker in self._workers.itervalues():
worker.join()

def compare(self, url, source):
"""Compare a source to the article, and update the working result."""
delta = MarkovChainIntersection(self._article, source)
confidence = self._calculate_confidence(delta)
self._logger.debug("{0} -> {1}".format(url, confidence))
self._logger.debug("compare(): {0} -> {1}".format(url, confidence))
with self._result_lock:
if confidence > self.best.confidence:
self.best = _WorkingResult(url, confidence, (source, delta))
@@ -128,8 +135,11 @@ class _CopyvioWorkspace(object):
class _CopyvioWorker(object):
"""A multithreaded URL opener/parser instance."""

def __init__(self, workspace, headers, url_timeout):
def __init__(self, workspace, until, headers, url_timeout):
self.queue = Queue()

self._workspace = workspace
self._until = until
self._opener = build_opener()
self._opener.addheaders = headers
self._url_timeout = url_timeout
@@ -146,11 +156,12 @@ class _CopyvioWorker(object):
If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
"""
try:
response = self._opener.open(url, timeout=self._url_timeout)
result = response.read()
except (URLError, timeout):
return None
with self._workspace.request_semaphore:
try:
response = self._opener.open(url, timeout=self._url_timeout)
result = response.read()
except (URLError, timeout):
return None

if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(result)
@@ -177,17 +188,17 @@ class _CopyvioWorker(object):
now empty.
"""
while True:
if self._workspace.until:
max_time = self._workspace.until - time()
if self._until:
max_time = self._until - time()
if max_time <= 0:
return
try:
url = self._workspace.dequeue(max_time)
url = self.queue.get(timeout=max_time)
except Empty:
return
else:
url = self._workspace.dequeue()
if url is None: # Exit signal from workspace.wait()
url = self.queue.get()
if url is None: # Exit signal
return
text = self._open_url(url.encode("utf8"))
if text:
@@ -237,8 +248,7 @@ class CopyvioMixIn(object):

raise exceptions.UnknownSearchEngineError(engine)

def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1,
worker_threads=4):
def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1):
"""Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -256,13 +266,12 @@ class CopyvioMixIn(object):
if checks are called through a web server with timeouts. We will stop
checking new URLs as soon as this limit is reached.

*worker_threads* is the number of threads we will spawn to handle URL
fetching and parsing simultaneously. Between 1 and 8 is recommended.

Raises :exc:`.CopyvioCheckError` or subclasses
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors.
"""
log = "Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
searcher = self._get_search_engine()
@@ -282,9 +291,7 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title))
return result

workspace.spawn(worker_threads)
workspace.enqueue(parser.get_links(), exclude)

chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
num_queries = 0
for chunk in chunks:
@@ -325,13 +332,14 @@ class CopyvioMixIn(object):
Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised.
"""
log = "Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
article = MarkovChain(ArticleTextParser(self.get()).strip())
workspace = _CopyvioWorkspace(article, min_confidence, until,
self._logger, self._addheaders, max_time)
workspace.enqueue([url])
workspace.spawn(1)
workspace.wait()
url, conf, chains = workspace.best
result = CopyvioCheckResult(conf >= min_confidence, conf, url, 0,
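
A small aside on the queue-clearing pattern used by the new _finish_early() above, shown as a standalone snippet under the assumption that each worker's queue is a plain Queue.Queue: the class has no clear() method, so the underlying deque is emptied while holding the queue's own mutex, and a single None exit signal is then enqueued. The variable q here is only a stand-in for a worker's queue.

from Queue import Queue

q = Queue()
for u in [u"http://a.example/1", u"http://a.example/2"]:
    q.put(u)

with q.mutex:        # take the queue's internal lock before touching its deque
    q.queue.clear()  # q.queue is the underlying collections.deque
q.put(None)          # exit signal for the domain worker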


+1 -0   setup.py

@@ -43,6 +43,7 @@ extra_deps = {
"lxml >= 2.3.5", # Faster parser for BeautifulSoup
"nltk >= 2.0.2", # Parsing sentences to split article content
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search
"tldextract >= 1.4", # Getting domains for the multithreaded workers
],
"time": [
"pytz >= 2012d", # Handling timezones for the !time IRC command
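
The new tldextract dependency is there because the naive fallback of taking the last two labels of the hostname misgroups sites under multi-label suffixes such as .co.uk, whereas registered_domain returns the actual registrable domain. A small illustration of the difference, assuming tldextract is installed (Python 2 print statements, matching the codebase; the URL is made up):

import tldextract
from urlparse import urlparse

url = "http://news.example.co.uk/article"
print tldextract.extract(url).registered_domain        # example.co.uk
print u".".join(urlparse(url).netloc.split(".")[-2:])  # co.uk -- too coarse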

