Browse Source

Refactor a bunch of copyvio internals. Store all sources in the result object.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
7afb484cea
5 changed files with 97 additions and 78 deletions
  1. +6
    -19
      earwigbot/wiki/copyvios/__init__.py
  2. +5
    -4
      earwigbot/wiki/copyvios/exclusions.py
  3. +0
    -1
      earwigbot/wiki/copyvios/parsers.py
  4. +54
    -24
      earwigbot/wiki/copyvios/result.py
  5. +32
    -30
      earwigbot/wiki/copyvios/workers.py

+ 6
- 19
earwigbot/wiki/copyvios/__init__.py View File

@@ -26,7 +26,6 @@ from urllib2 import build_opener
from earwigbot import exceptions, importer from earwigbot import exceptions, importer
from earwigbot.wiki.copyvios.markov import MarkovChain from earwigbot.wiki.copyvios.markov import MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
from earwigbot.wiki.copyvios.workers import ( from earwigbot.wiki.copyvios.workers import (
globalize, localize, CopyvioWorkspace) globalize, localize, CopyvioWorkspace)
@@ -109,12 +108,10 @@ class CopyvioMixIn(object):
""" """
log = u"Starting copyvio check for [[{0}]]" log = u"Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title)) self._logger.info(log.format(self.title))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
searcher = self._get_search_engine() searcher = self._get_search_engine()
parser = ArticleTextParser(self.get()) parser = ArticleTextParser(self.get())
article = MarkovChain(parser.strip()) article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(article, min_confidence, until,
workspace = CopyvioWorkspace(article, min_confidence, max_time,
self._logger, self._addheaders) self._logger, self._addheaders)
if self._exclusions_db: if self._exclusions_db:
self._exclusions_db.sync(self.site.name) self._exclusions_db.sync(self.site.name)
@@ -123,8 +120,7 @@ class CopyvioMixIn(object):
exclude = None exclude = None


if article.size < 20: # Auto-fail very small articles if article.size < 20: # Auto-fail very small articles
result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
workspace.best.chains)
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


@@ -134,19 +130,15 @@ class CopyvioMixIn(object):
if not no_searches: if not no_searches:
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
for chunk in chunks: for chunk in chunks:
if workspace.best.confidence >= min_confidence:
if workspace.finished:
break break
log = u"[[{0}]] -> querying {1} for {2!r}" log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk)) self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk), exclude) workspace.enqueue(searcher.search(chunk), exclude)
num_queries += 1 num_queries += 1
sleep(1) sleep(1)

workspace.wait() workspace.wait()
result = CopyvioCheckResult(
workspace.best.confidence >= min_confidence,
workspace.best.confidence, workspace.best.url, num_queries,
time() - start_time, article, workspace.best.chains)
result = workspace.get_result(num_queries)
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


@@ -173,17 +165,12 @@ class CopyvioMixIn(object):
""" """
log = u"Starting copyvio compare for [[{0}]] against {1}" log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url)) self._logger.info(log.format(self.title, url))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
article = MarkovChain(ArticleTextParser(self.get()).strip()) article = MarkovChain(ArticleTextParser(self.get()).strip())
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, min_confidence, until, self._logger, self._addheaders,
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, 1) max_time, 1)
workspace.enqueue([url]) workspace.enqueue([url])
workspace.wait() workspace.wait()
best = workspace.best
result = CopyvioCheckResult(
best.confidence >= min_confidence, best.confidence, best.url, 0,
time() - start_time, article, best.chains)
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result

+ 5
- 4
earwigbot/wiki/copyvios/exclusions.py View File

@@ -30,7 +30,7 @@ from earwigbot import exceptions


__all__ = ["ExclusionsDB"] __all__ = ["ExclusionsDB"]


default_sources = {
DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki "all": [ # Applies to all, but located on enwiki
"User:EarwigBot/Copyvios/Exclusions" "User:EarwigBot/Copyvios/Exclusions"
], ],
@@ -74,7 +74,7 @@ class ExclusionsDB(object):
""" """
query = "INSERT INTO sources VALUES (?, ?);" query = "INSERT INTO sources VALUES (?, ?);"
sources = [] sources = []
for sitename, pages in default_sources.iteritems():
for sitename, pages in DEFAULT_SOURCES.iteritems():
for page in pages: for page in pages:
sources.append((sitename, page)) sources.append((sitename, page))


@@ -95,8 +95,9 @@ class ExclusionsDB(object):
r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$" r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
] ]
for regex in regexes: for regex in regexes:
find = re.findall(regex, data, re.I|re.M)
[urls.add(url.lower().strip()) for url in find if url.strip()]
for url in re.findall(regex, data, re.I|re.M):
if url.strip():
urls.add(url.lower().strip())
return urls return urls


def _update(self, sitename): def _update(self, sitename):


+ 0
- 1
earwigbot/wiki/copyvios/parsers.py View File

@@ -20,7 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


import errno
from os import path from os import path


import mwparserfromhell import mwparserfromhell


+ 54
- 24
earwigbot/wiki/copyvios/result.py View File

@@ -28,7 +28,12 @@ from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
__all__ = ["CopyvioSource", "CopyvioCheckResult"] __all__ = ["CopyvioSource", "CopyvioCheckResult"]


class CopyvioSource(object): class CopyvioSource(object):
"""Represents a single suspected violation source (a URL)."""
"""
**EarwigBot: Wiki Toolset: Copyvio Source**

A class that represents a single possible source of a copyright violation,
i.e., a URL.
"""


def __init__(self, workspace, url, key, headers=None, timeout=5): def __init__(self, workspace, url, key, headers=None, timeout=5):
self.workspace = workspace self.workspace = workspace
@@ -38,14 +43,23 @@ class CopyvioSource(object):
self.timeout = timeout self.timeout = timeout
self.confidence = 0.0 self.confidence = 0.0
self.chains = (EMPTY, EMPTY_INTERSECTION) self.chains = (EMPTY, EMPTY_INTERSECTION)
self.skipped = False


self._event1 = Event() self._event1 = Event()
self._event2 = Event() self._event2 = Event()
self._event2.set() self._event2.set()


def touched(self):
"""Return whether either start_work() or cancel() has been called."""
return self._event1.is_set()
def __repr__(self):
"""Return the canonical string representation of the source."""
res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
return res.format(self.url, self.confidence, self.skipped)

def __str__(self):
"""Return a nice string representation of the source."""
if self.skipped:
return "<CopyvioSource ({0}, skipped)>".format(self.url)
res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence)


def start_work(self): def start_work(self):
"""Mark this source as being worked on right now.""" """Mark this source as being worked on right now."""
@@ -58,8 +72,11 @@ class CopyvioSource(object):
self.chains = (source_chain, delta_chain) self.chains = (source_chain, delta_chain)
self._event2.set() self._event2.set()


def cancel(self):
def skip(self):
"""Deactivate this source without filling in the relevant data.""" """Deactivate this source without filling in the relevant data."""
if self._event1.is_set():
return
self.skipped = True
self._event1.set() self._event1.set()


def join(self, until): def join(self, until):
@@ -70,6 +87,8 @@ class CopyvioSource(object):
if timeout <= 0: if timeout <= 0:
return return
event.wait(timeout) event.wait(timeout)
else:
event.wait()




class CopyvioCheckResult(object): class CopyvioCheckResult(object):
@@ -81,40 +100,51 @@ class CopyvioCheckResult(object):
*Attributes:* *Attributes:*


- :py:attr:`violation`: ``True`` if this is a violation, else ``False`` - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
- :py:attr:`url`: the URL of the violated page
- :py:attr:`sources`: a list of CopyvioSources, sorted by confidence (highest first)
- :py:attr:`queries`: the number of queries used to reach a result - :py:attr:`queries`: the number of queries used to reach a result
- :py:attr:`time`: the amount of time the check took to complete - :py:attr:`time`: the amount of time the check took to complete
- :py:attr:`article_chain`: the MarkovChain of the article text - :py:attr:`article_chain`: the MarkovChain of the article text
- :py:attr:`source_chain`: the MarkovChain of the violated page text
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
""" """


def __init__(self, violation, confidence, url, queries, time, article,
chains):
def __init__(self, violation, sources, queries, check_time, article_chain):
self.violation = violation self.violation = violation
self.confidence = confidence
self.url = url
self.sources = sources
self.queries = queries self.queries = queries
self.time = time
self.article_chain = article
self.source_chain = chains[0]
self.delta_chain = chains[1]
self.time = check_time
self.article_chain = article_chain


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the result.""" """Return the canonical string representation of the result."""
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
return res.format(self.violation, self.sources, self.queries,
self.time)


def __str__(self): def __str__(self):
"""Return a nice string representation of the result.""" """Return a nice string representation of the result."""
res = "<CopyvioCheckResult ({0} with {1} conf)>"
return res.format(self.violation, self.confidence)
res = "<CopyvioCheckResult ({0} with best {1})>"
return res.format(self.violation, self.best)

@property
def best(self):
"""The best known source, or None if no sources exist."""
return self.sources[0] if self.sources else None

@property
def confidence(self):
"""The confidence of the best source, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0

@property
def url(self):
"""The url of the best source, or None if no sources exist."""
return self.best.url if self.best else None


def get_log_message(self, title): def get_log_message(self, title):
"""Build a relevant log message for this copyvio check result.""" """Build a relevant log message for this copyvio check result."""
log = u"{0} for [[{1}]] (confidence: {2}; URL: {3}; {4} queries; {5} seconds)"
if not self.sources:
log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
return log.format(title, self.queries, self.time)
log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} queries; {5} seconds)"
is_vio = "Violation detected" if self.violation else "No violation" is_vio = "Violation detected" if self.violation else "No violation"
return log.format(is_vio, title, self.confidence, self.url,
return log.format(is_vio, title, self.url, self.confidence,
self.queries, self.time) self.queries, self.time)

+ 32
- 30
earwigbot/wiki/copyvios/workers.py View File

@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
from earwigbot import importer from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import HTMLTextParser from earwigbot.wiki.copyvios.parsers import HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


tldextract = importer.new("tldextract") tldextract = importer.new("tldextract")


@@ -120,7 +120,8 @@ class _CopyvioWorker(object):
If a URLError was raised while opening the URL or an IOError was raised If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned. while decompressing, None will be returned.
""" """
self._opener.addheaders = source.headers
if source.headers:
self._opener.addheaders = source.headers
url = source.url.encode("utf8") url = source.url.encode("utf8")
try: try:
response = self._opener.open(url, timeout=source.timeout) response = self._opener.open(url, timeout=source.timeout)
@@ -194,8 +195,8 @@ class _CopyvioWorker(object):
return self._dequeue() return self._dequeue()


self._logger.debug(u"Got source URL: {0}".format(source.url)) self._logger.debug(u"Got source URL: {0}".format(source.url))
if source.touched():
self._logger.debug("Source has been cancelled")
if source.skipped:
self._logger.debug("Source has been skipped")
self._queues.lock.release() self._queues.lock.release()
return self._dequeue() return self._dequeue()


@@ -232,18 +233,18 @@ class _CopyvioWorker(object):
class CopyvioWorkspace(object): class CopyvioWorkspace(object):
"""Manages a single copyvio check distributed across threads.""" """Manages a single copyvio check distributed across threads."""


def __init__(self, article, min_confidence, until, logger, headers,
def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8): url_timeout=5, num_workers=8):
self.best = CopyvioSource(self, None, None)
self.sources = [] self.sources = []
self.finished = False


self._article = article self._article = article
self._logger = logger.getChild("copyvios") self._logger = logger.getChild("copyvios")
self._min_confidence = min_confidence self._min_confidence = min_confidence
self._until = until
self._start_time = time()
self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = [] self._handled_urls = []
self._is_finished = False
self._compare_lock = Lock()
self._finish_lock = Lock()
self._source_args = {"workspace": self, "headers": headers, self._source_args = {"workspace": self, "headers": headers,
"timeout": url_timeout} "timeout": url_timeout}


@@ -254,7 +255,7 @@ class CopyvioWorkspace(object):
self._num_workers = num_workers self._num_workers = num_workers
for i in xrange(num_workers): for i in xrange(num_workers):
name = "local-{0:04}.{1}".format(id(self) % 10000, i) name = "local-{0:04}.{1}".format(id(self) % 10000, i)
_CopyvioWorker(name, self._queues, until).start()
_CopyvioWorker(name, self._queues, self._until).start()


def _calculate_confidence(self, delta): def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1.""" """Return the confidence of a violation as a float between 0 and 1."""
@@ -294,13 +295,11 @@ class CopyvioWorkspace(object):


def _finish_early(self): def _finish_early(self):
"""Finish handling links prematurely (if we've hit min_confidence).""" """Finish handling links prematurely (if we've hit min_confidence)."""
if self._is_finished:
return
self._logger.debug("Confidence threshold met; cancelling remaining sources")
self._logger.debug("Confidence threshold met; skipping remaining sources")
with self._queues.lock: with self._queues.lock:
for source in self.sources: for source in self.sources:
source.cancel()
self._is_finished = True
source.skip()
self.finished = True


def enqueue(self, urls, exclude_check=None): def enqueue(self, urls, exclude_check=None):
"""Put a list of URLs into the various worker queues. """Put a list of URLs into the various worker queues.
@@ -310,9 +309,9 @@ class CopyvioWorkspace(object):
""" """
for url in urls: for url in urls:
with self._queues.lock: with self._queues.lock:
if self._is_finished:
if self.finished:
break break
if not url or url in self._handled_urls:
if url in self._handled_urls:
continue continue
self._handled_urls.append(url) self._handled_urls.append(url)
if exclude_check and exclude_check(url): if exclude_check and exclude_check(url):
@@ -336,26 +335,29 @@ class CopyvioWorkspace(object):
queue.append(source) queue.append(source)
self._queues.unassigned.put((key, queue)) self._queues.unassigned.put((key, queue))


def compare(self, source, source_chain):
"""Compare a source to the article; call _finish_early if necessary."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
with self._finish_lock:
source.finish_work(conf, source_chain, delta)
if not self.finished and conf >= self._min_confidence:
self._finish_early()

def wait(self): def wait(self):
"""Wait for the workers to finish handling the sources.""" """Wait for the workers to finish handling the sources."""
self._logger.debug("Waiting on {0} sources".format(len(self.sources))) self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
for source in self.sources: for source in self.sources:
source.join(self._until) source.join(self._until)
with self._compare_lock:
with self._finish_lock:
pass # Wait for any remaining comparisons to be finished pass # Wait for any remaining comparisons to be finished
if not _is_globalized: if not _is_globalized:
for i in xrange(self._num_workers): for i in xrange(self._num_workers):
self._queues.unassigned.put((StopIteration, None)) self._queues.unassigned.put((StopIteration, None))


def compare(self, source, source_chain):
"""Compare a source to the article, and update the best known one."""
delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta)
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))

with self._compare_lock:
source.finish_work(conf, source_chain, delta)
if conf > self.best.confidence:
self.best = source
if conf >= self._min_confidence:
self._finish_early()
def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check."""
self.sources.sort(key=lambda source: source.confidence, reverse=True)
return CopyvioCheckResult(self.finished, self.sources, num_queries,
time() - self._start_time, self._article)

Loading…
Cancel
Save