
Refactor a bunch of copyvio internals. Store all sources with a result object.

tags/v0.2
Ben Kurtovic, 10 years ago
parent
commit
7afb484cea
5 changed files with 97 additions and 78 deletions
1. earwigbot/wiki/copyvios/__init__.py (+6, -19)
2. earwigbot/wiki/copyvios/exclusions.py (+5, -4)
3. earwigbot/wiki/copyvios/parsers.py (+0, -1)
4. earwigbot/wiki/copyvios/result.py (+54, -24)
5. earwigbot/wiki/copyvios/workers.py (+32, -30)

earwigbot/wiki/copyvios/__init__.py (+6, -19)

@@ -26,7 +26,6 @@ from urllib2 import build_opener
from earwigbot import exceptions, importer
from earwigbot.wiki.copyvios.markov import MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
from earwigbot.wiki.copyvios.workers import (
globalize, localize, CopyvioWorkspace)
@@ -109,12 +108,10 @@ class CopyvioMixIn(object):
"""
log = u"Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
searcher = self._get_search_engine()
parser = ArticleTextParser(self.get())
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(article, min_confidence, until,
workspace = CopyvioWorkspace(article, min_confidence, max_time,
self._logger, self._addheaders)
if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
@@ -123,8 +120,7 @@ class CopyvioMixIn(object):
exclude = None

if article.size < 20: # Auto-fail very small articles
result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
workspace.best.chains)
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))
return result

@@ -134,19 +130,15 @@ class CopyvioMixIn(object):
if not no_searches:
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
for chunk in chunks:
if workspace.best.confidence >= min_confidence:
if workspace.finished:
break
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk), exclude)
num_queries += 1
sleep(1)

workspace.wait()
result = CopyvioCheckResult(
workspace.best.confidence >= min_confidence,
workspace.best.confidence, workspace.best.url, num_queries,
time() - start_time, article, workspace.best.chains)
result = workspace.get_result(num_queries)
self._logger.info(result.get_log_message(self.title))
return result

@@ -173,17 +165,12 @@ class CopyvioMixIn(object):
"""
log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
article = MarkovChain(ArticleTextParser(self.get()).strip())
workspace = CopyvioWorkspace(
article, min_confidence, until, self._logger, self._addheaders,
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, 1)
workspace.enqueue([url])
workspace.wait()
best = workspace.best
result = CopyvioCheckResult(
best.confidence >= min_confidence, best.confidence, best.url, 0,
time() - start_time, article, best.chains)
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))
return result
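
With the deadline bookkeeping and result construction now owned by CopyvioWorkspace, the public copyvio_check() call looks the same from outside, but the result it returns is richer. A hedged usage sketch, assuming a configured EarwigBot install at this revision; the config path and page title are placeholders, and the keyword names follow the variables used in the hunks above:

from earwigbot import bot

my_bot = bot.Bot("path/to/config")        # hypothetical config directory
site = my_bot.wiki.get_site()
page = site.get_page("Example article")   # placeholder title

result = page.copyvio_check(min_confidence=0.75, max_queries=15, max_time=30)
print(result.get_log_message(page.title))
for source in result.sources:             # new in this commit: every source is kept
    print("{0} -> {1} (skipped: {2})".format(
        source.url, source.confidence, source.skipped))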

earwigbot/wiki/copyvios/exclusions.py (+5, -4)

@@ -30,7 +30,7 @@ from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

default_sources = {
DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki
"User:EarwigBot/Copyvios/Exclusions"
],
@@ -74,7 +74,7 @@ class ExclusionsDB(object):
"""
query = "INSERT INTO sources VALUES (?, ?);"
sources = []
for sitename, pages in default_sources.iteritems():
for sitename, pages in DEFAULT_SOURCES.iteritems():
for page in pages:
sources.append((sitename, page))

@@ -95,8 +95,9 @@ class ExclusionsDB(object):
r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
]
for regex in regexes:
find = re.findall(regex, data, re.I|re.M)
[urls.add(url.lower().strip()) for url in find if url.strip()]
for url in re.findall(regex, data, re.I|re.M):
if url.strip():
urls.add(url.lower().strip())
return urls

def _update(self, sitename):
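
The exclusions change is mostly cleanup: the module-level constant is renamed and the side-effect list comprehension becomes an explicit loop. A self-contained sketch of that loop on invented sample wikitext (the pattern is copied from the hunk above; the real regexes list may hold additional patterns):

import re

regexes = [
    r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
]
data = u"* Site: [http://example.com/]\n* Site: <nowiki>http://Example.org/mirror/</nowiki>"

urls = set()
for regex in regexes:
    for url in re.findall(regex, data, re.I | re.M):
        if url.strip():                     # ignore empty captures
            urls.add(url.lower().strip())   # normalize to lowercase

print(sorted(urls))  # ['example.com/', 'example.org/mirror/']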


earwigbot/wiki/copyvios/parsers.py (+0, -1)

@@ -20,7 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import errno
from os import path

import mwparserfromhell


earwigbot/wiki/copyvios/result.py (+54, -24)

@@ -28,7 +28,12 @@ from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
__all__ = ["CopyvioSource", "CopyvioCheckResult"]

class CopyvioSource(object):
"""Represents a single suspected violation source (a URL)."""
"""
**EarwigBot: Wiki Toolset: Copyvio Source**

A class that represents a single possible source of a copyright violation,
i.e., a URL.
"""

def __init__(self, workspace, url, key, headers=None, timeout=5):
self.workspace = workspace
@@ -38,14 +43,23 @@ class CopyvioSource(object):
self.timeout = timeout
self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION)
self.skipped = False

self._event1 = Event()
self._event2 = Event()
self._event2.set()

def touched(self):
"""Return whether one of start_work() and cancel() have been called."""
return self._event1.is_set()
def __repr__(self):
"""Return the canonical string representation of the source."""
res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
return res.format(self.url, self.confidence, self.skipped)

def __str__(self):
"""Return a nice string representation of the source."""
if self.skipped:
return "<CopyvioSource ({0}, skipped)>".format(self.url)
res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence)

def start_work(self):
"""Mark this source as being worked on right now."""
@@ -58,8 +72,11 @@ class CopyvioSource(object):
self.chains = (source_chain, delta_chain)
self._event2.set()

def cancel(self):
def skip(self):
"""Deactivate this source without filling in the relevant data."""
if self._event1.is_set():
return
self.skipped = True
self._event1.set()

def join(self, until):
@@ -70,6 +87,8 @@ class CopyvioSource(object):
if timeout <= 0:
return
event.wait(timeout)
else:
event.wait()


class CopyvioCheckResult(object):
@@ -81,40 +100,51 @@ class CopyvioCheckResult(object):
*Attributes:*

- :py:attr:`violation`: ``True`` if this is a violation, else ``False``
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
- :py:attr:`url`: the URL of the violated page
- :py:attr:`sources`: a list of CopyvioSources, sorted by confidence
- :py:attr:`queries`: the number of queries used to reach a result
- :py:attr:`time`: the amount of time the check took to complete
- :py:attr:`article_chain`: the MarkovChain of the article text
- :py:attr:`source_chain`: the MarkovChain of the violated page text
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
"""

def __init__(self, violation, confidence, url, queries, time, article,
chains):
def __init__(self, violation, sources, queries, check_time, article_chain):
self.violation = violation
self.confidence = confidence
self.url = url
self.sources = sources
self.queries = queries
self.time = time
self.article_chain = article
self.source_chain = chains[0]
self.delta_chain = chains[1]
self.time = check_time
self.article_chain = article_chain

def __repr__(self):
"""Return the canonical string representation of the result."""
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
return res.format(self.violation, self.sources, self.queries,
self.time)

def __str__(self):
"""Return a nice string representation of the result."""
res = "<CopyvioCheckResult ({0} with {1} conf)>"
return res.format(self.violation, self.confidence)
res = "<CopyvioCheckResult ({0} with best {1})>"
return res.format(self.violation, self.best)

@property
def best(self):
"""The best known source, or None if no sources exist."""
return self.sources[0] if self.sources else None

@property
def confidence(self):
"""The confidence of the best source, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0

@property
def url(self):
"""The url of the best source, or None if no sources exist."""
return self.best.url if self.best else None

def get_log_message(self, title):
"""Build a relevant log message for this copyvio check result."""
log = u"{0} for [[{1}]] (confidence: {2}; URL: {3}; {4} queries; {5} seconds)"
if not self.sources:
log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
return log.format(title, self.queries, self.time)
log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} queries; {5} seconds)"
is_vio = "Violation detected" if self.violation else "No violation"
return log.format(is_vio, title, self.confidence, self.url,
return log.format(is_vio, title, self.url, self.confidence,
self.queries, self.time)
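
Since the chains and confidence now live on each CopyvioSource, a result can be inspected source by source. A toy construction to show the new properties, assuming earwigbot at this revision is importable; the URLs and numbers are invented:

from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

best = CopyvioSource(workspace=None, url="http://a.example/", key="a.example")
best.confidence = 0.82
other = CopyvioSource(workspace=None, url="http://b.example/", key="b.example")
other.confidence = 0.31

result = CopyvioCheckResult(violation=True, sources=[best, other], queries=5,
                            check_time=12.3, article_chain=None)
print(result.best)         # <CopyvioSource (http://a.example/ with 0.82 conf)>
print(result.confidence)   # 0.82
print(result.url)          # http://a.example/
print(result.get_log_message("Example article"))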

earwigbot/wiki/copyvios/workers.py (+32, -30)

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

tldextract = importer.new("tldextract")

@@ -120,7 +120,8 @@ class _CopyvioWorker(object):
If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
"""
self._opener.addheaders = source.headers
if source.headers:
self._opener.addheaders = source.headers
url = source.url.encode("utf8")
try:
response = self._opener.open(url, timeout=source.timeout)
@@ -194,8 +195,8 @@ class _CopyvioWorker(object):
return self._dequeue()

self._logger.debug(u"Got source URL: {0}".format(source.url))
if source.touched():
self._logger.debug("Source has been cancelled")
if source.skipped:
self._logger.debug("Source has been skipped")
self._queues.lock.release()
return self._dequeue()

@@ -232,18 +233,18 @@ class _CopyvioWorker(object):
class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads."""

def __init__(self, article, min_confidence, until, logger, headers,
def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8):
self.best = CopyvioSource(self, None, None)
self.sources = []
self.finished = False

self._article = article
self._logger = logger.getChild("copyvios")
self._min_confidence = min_confidence
self._until = until
self._start_time = time()
self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = []
self._is_finished = False
self._compare_lock = Lock()
self._finish_lock = Lock()
self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout}

@@ -254,7 +255,7 @@ class CopyvioWorkspace(object):
self._num_workers = num_workers
for i in xrange(num_workers):
name = "local-{0:04}.{1}".format(id(self) % 10000, i)
_CopyvioWorker(name, self._queues, until).start()
_CopyvioWorker(name, self._queues, self._until).start()

def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1."""
@@ -294,13 +295,11 @@ class CopyvioWorkspace(object):

def _finish_early(self):
"""Finish handling links prematurely (if we've hit min_confidence)."""
if self._is_finished:
return
self._logger.debug("Confidence threshold met; cancelling remaining sources")
self._logger.debug("Confidence threshold met; skipping remaining sources")
with self._queues.lock:
for source in self.sources:
source.cancel()
self._is_finished = True
source.skip()
self.finished = True

def enqueue(self, urls, exclude_check=None):
"""Put a list of URLs into the various worker queues.
@@ -310,9 +309,9 @@ class CopyvioWorkspace(object):
"""
for url in urls:
with self._queues.lock:
if self._is_finished:
if self.finished:
break
if not url or url in self._handled_urls:
if url in self._handled_urls:
continue
self._handled_urls.append(url)
if exclude_check and exclude_check(url):
@@ -336,26 +335,29 @@ class CopyvioWorkspace(object):
queue.append(source)
self._queues.unassigned.put((key, queue))

def compare(self, source, source_chain):
"""Compare a source to the article; call _finish_early if necessary."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
with self._finish_lock:
source.finish_work(conf, source_chain, delta)
if not self.finished and conf >= self._min_confidence:
self._finish_early()

def wait(self):
"""Wait for the workers to finish handling the sources."""
self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
for source in self.sources:
source.join(self._until)
with self._compare_lock:
with self._finish_lock:
pass # Wait for any remaining comparisons to be finished
if not _is_globalized:
for i in xrange(self._num_workers):
self._queues.unassigned.put((StopIteration, None))

def compare(self, source, source_chain):
"""Compare a source to the article, and update the best known one."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))

with self._compare_lock:
source.finish_work(conf, source_chain, delta)
if conf > self.best.confidence:
self.best = source
if conf >= self._min_confidence:
self._finish_early()
def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check."""
self.sources.sort(key=lambda source: source.confidence, reverse=True)
return CopyvioCheckResult(self.finished, self.sources, num_queries,
time() - self._start_time, self._article)
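
The single-URL comparison path goes through the same get_result() call, so copyvio_compare() also returns the reshaped result. A hedged sketch, assuming the same configured-bot setup as in the earlier example; the config path, page title, and URL are placeholders:

from earwigbot import bot

my_bot = bot.Bot("path/to/config")        # hypothetical config directory
page = my_bot.wiki.get_site().get_page("Example article")

result = page.copyvio_compare("http://example.com/mirror-of-article",
                              min_confidence=0.75, max_time=30)
if result.violation:
    best = result.best                        # the single enqueued source
    source_chain, delta_chain = best.chains   # chains now live on the source
    print("{0} ({1} confidence)".format(best.url, best.confidence))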
