Browse Source

Make CopyvioSource public; tweaks.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
54ddff049f
3 changed files with 66 additions and 60 deletions
  1. +1
    -3
      earwigbot/wiki/copyvios/parsers.py
  2. +51
    -1
      earwigbot/wiki/copyvios/result.py
  3. +14
    -56
      earwigbot/wiki/copyvios/workers.py

+ 1
- 3
earwigbot/wiki/copyvios/parsers.py View File

@@ -107,9 +107,7 @@ class ArticleTextParser(BaseTextParser):
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
try: try:
tokenizer = nltk.data.load("file:" + datafile) tokenizer = nltk.data.load("file:" + datafile)
except IOError as exc:
if exc.errno != errno.ENOENT:
raise
except LookupError:
nltk.download("punkt", nltk_dir) nltk.download("punkt", nltk_dir)
tokenizer = nltk.data.load("file:" + datafile) tokenizer = nltk.data.load("file:" + datafile)




+ 51
- 1
earwigbot/wiki/copyvios/result.py View File

@@ -20,7 +20,57 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


__all__ = ["CopyvioCheckResult"]
from threading import Event
from time import time

from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION

__all__ = ["CopyvioSource", "CopyvioCheckResult"]

class CopyvioSource(object):
    """Represents a single suspected violation source (a URL).

    Besides the request parameters (``url``, ``key``, ``headers``,
    ``timeout``), a source carries the outcome of the comparison
    (``confidence`` and ``chains``) and two events that let other threads
    wait for that outcome via join().
    """

    def __init__(self, workspace, url, key, headers=None, timeout=5):
        self.workspace = workspace
        self.url = url
        self.key = key
        self.headers = headers
        self.timeout = timeout
        # Placeholder results, replaced by finish_work().
        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)

        # _event1 is set once start_work() or cancel() has been called.
        # _event2 is cleared by start_work() and set again by finish_work();
        # it starts out set so that a cancelled source never blocks join().
        self._event1 = Event()
        self._event2 = Event()
        self._event2.set()

    def touched(self):
        """Return whether either start_work() or cancel() has been called."""
        return self._event1.is_set()

    def start_work(self):
        """Mark this source as being worked on right now."""
        # Clear the "finished" event before announcing that work has begun,
        # so a waiter released by _event1 then blocks on _event2.
        self._event2.clear()
        self._event1.set()

    def finish_work(self, confidence, source_chain, delta_chain):
        """Complete the confidence information inside this source."""
        self.confidence = confidence
        self.chains = (source_chain, delta_chain)
        self._event2.set()

    def cancel(self):
        """Deactivate this source without filling in the relevant data."""
        self._event1.set()

    def join(self, until):
        """Block until this violation result is filled out.

        *until* is an absolute deadline on the same clock as time(). If it
        is given, waiting stops once that deadline has passed. If it is
        falsy (e.g. None), there is no deadline and this call blocks until
        the source has been cancelled or its work has finished.
        """
        for event in [self._event1, self._event2]:
            if until:
                # Recompute the remaining time for each event so the total
                # wait never exceeds the deadline.
                timeout = until - time()
                if timeout <= 0:
                    return
                event.wait(timeout)
            else:
                # No deadline: previously this fell through without waiting
                # at all, so join() returned before the result was ready.
                event.wait()



class CopyvioCheckResult(object): class CopyvioCheckResult(object):
""" """


+ 14
- 56
earwigbot/wiki/copyvios/workers.py View File

@@ -27,14 +27,14 @@ from math import log
from Queue import Empty, Queue from Queue import Empty, Queue
from socket import error from socket import error
from StringIO import StringIO from StringIO import StringIO
from threading import Event, Lock, Thread
from threading import Lock, Thread
from time import time from time import time
from urllib2 import build_opener, URLError from urllib2 import build_opener, URLError


from earwigbot import importer from earwigbot import importer
from earwigbot.wiki.copyvios.markov import (
EMPTY, EMPTY_INTERSECTION, MarkovChain, MarkovChainIntersection)
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser from earwigbot.wiki.copyvios.parsers import HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource


tldextract = importer.new("tldextract") tldextract = importer.new("tldextract")


@@ -62,6 +62,7 @@ def globalize(num_workers=8):
_global_queues = _CopyvioQueues() _global_queues = _CopyvioQueues()
for i in xrange(num_workers): for i in xrange(num_workers):
worker = _CopyvioWorker("global-{0}".format(i), _global_queues) worker = _CopyvioWorker("global-{0}".format(i), _global_queues)
worker.start()
_global_workers.append(worker) _global_workers.append(worker)
_is_globalized = True _is_globalized = True


@@ -85,51 +86,6 @@ def localize():
_is_globalized = False _is_globalized = False




class _CopyvioSource(object):
    """Represents a single suspected violation source (a URL).

    Holds the request parameters for the URL, the comparison outcome
    (``confidence`` and ``chains``), and two events used by join() to wait
    for that outcome.
    """

    def __init__(self, workspace, url, key, headers=None, timeout=5):
        self.workspace = workspace
        self.url = url
        self.key = key
        self.headers = headers
        self.timeout = timeout
        # Default results until finish_work() supplies real ones.
        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)

        # _event1: set by start_work() or cancel().
        # _event2: cleared by start_work(), set by finish_work(); initially
        # set so that a source which is only cancelled does not block join().
        self._event1 = Event()
        self._event2 = Event()
        self._event2.set()

    def touched(self):
        """Return whether either start_work() or cancel() has been called."""
        return self._event1.is_set()

    def start_work(self):
        """Mark this source as being worked on right now."""
        # Order matters: mark "not finished" before signalling "started".
        self._event2.clear()
        self._event1.set()

    def finish_work(self, confidence, source_chain, delta_chain):
        """Complete the confidence information inside this source."""
        self.confidence = confidence
        self.chains = (source_chain, delta_chain)
        self._event2.set()

    def cancel(self):
        """Deactivate this source without filling in the relevant data."""
        self._event1.set()

    def join(self, until):
        """Block until this violation result is filled out.

        *until* is an absolute deadline compared against time(); once it has
        passed, this method returns without waiting further. A falsy
        *until* (e.g. None) means no deadline, in which case this blocks
        until both events are set.
        """
        for event in [self._event1, self._event2]:
            if until:
                # Remaining time is recomputed per event to honour the
                # overall deadline.
                timeout = until - time()
                if timeout <= 0:
                    return
                event.wait(timeout)
            else:
                # Without a deadline the original skipped waiting entirely;
                # wait indefinitely instead.
                event.wait()


class _CopyvioQueues(object): class _CopyvioQueues(object):
"""Stores data necessary to maintain the various queues during a check.""" """Stores data necessary to maintain the various queues during a check."""


@@ -143,19 +99,15 @@ class _CopyvioWorker(object):
"""A multithreaded URL opener/parser instance.""" """A multithreaded URL opener/parser instance."""


def __init__(self, name, queues, until=None): def __init__(self, name, queues, until=None):
self._name = name
self._queues = queues self._queues = queues
self._until = until self._until = until


self._thread = thread = Thread(target=self._run)
self._site = None self._site = None
self._queue = None self._queue = None
self._opener = build_opener() self._opener = build_opener()
self._logger = getLogger("earwigbot.wiki.cvworker." + name) self._logger = getLogger("earwigbot.wiki.cvworker." + name)


thread.name = "cvworker-" + name
thread.daemon = True
thread.start()

def _open_url(self, source): def _open_url(self, source):
"""Open a URL and return its parsed content, or None. """Open a URL and return its parsed content, or None.


@@ -270,13 +222,19 @@ class _CopyvioWorker(object):
text = self._open_url(source) text = self._open_url(source)
source.workspace.compare(source, MarkovChain(text or "")) source.workspace.compare(source, MarkovChain(text or ""))


def start(self):
    """Launch this worker's run loop on a new daemon thread."""
    thread_name = "cvworker-" + self._name
    runner = Thread(name=thread_name, target=self._run)
    # Daemon threads do not keep the interpreter alive at exit.
    runner.daemon = True
    runner.start()



class CopyvioWorkspace(object): class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads.""" """Manages a single copyvio check distributed across threads."""


def __init__(self, article, min_confidence, until, logger, headers, def __init__(self, article, min_confidence, until, logger, headers,
url_timeout=5, num_workers=8): url_timeout=5, num_workers=8):
self.best = _CopyvioSource(self, None, None)
self.best = CopyvioSource(self, None, None)
self.sources = [] self.sources = []


self._article = article self._article = article
@@ -296,7 +254,7 @@ class CopyvioWorkspace(object):
self._num_workers = num_workers self._num_workers = num_workers
for i in xrange(num_workers): for i in xrange(num_workers):
name = "local-{0:04}.{1}".format(id(self) % 10000, i) name = "local-{0:04}.{1}".format(id(self) % 10000, i)
_CopyvioWorker(name, self._queues, until)
_CopyvioWorker(name, self._queues, until).start()


def _calculate_confidence(self, delta): def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1.""" """Return the confidence of a violation as a float between 0 and 1."""
@@ -366,7 +324,7 @@ class CopyvioWorkspace(object):
from urlparse import urlparse from urlparse import urlparse
key = u".".join(urlparse(url).netloc.split(".")[-2:]) key = u".".join(urlparse(url).netloc.split(".")[-2:])


source = _CopyvioSource(url=url, key=key, **self._source_args)
source = CopyvioSource(url=url, key=key, **self._source_args)
self.sources.append(source) self.sources.append(source)
logmsg = u"enqueue(): {0} {1} -> {2}" logmsg = u"enqueue(): {0} {1} -> {2}"
if key in self._queues.sites: if key in self._queues.sites:


Loading…
Cancel
Save