Bladeren bron

Update with new confidence function; fix unicode.

tags/v0.2
Ben Kurtovic 10 jaren geleden
bovenliggende
commit
e2d7c7aef6
3 gewijzigde bestanden met toevoegingen van 42 en 19 verwijderingen
  1. +2
    -1
      CHANGELOG
  2. +28
    -8
      earwigbot/wiki/copyvios/__init__.py
  3. +12
    -10
      earwigbot/wiki/copyvios/markov.py

+ 2
- 1
CHANGELOG Bestand weergeven

@@ -5,7 +5,8 @@ v0.2 (unreleased):
circumstances.
- Added copyvio detector functionality: specifying a max time for checks;
improved exclusion support. URL loading and parsing is parallelized to speed
up check times. Fixed assorted bugs.
up check times, with a multi-threaded worker model that avoids concurrent
requests to the same domain. Fixed assorted bugs.
- Added support for Wikimedia Labs when creating a config file.
- Added and improved lazy importing for various dependencies.
- Fixed a bug in job scheduling.


+ 28
- 8
earwigbot/wiki/copyvios/__init__.py Bestand weergeven

@@ -64,7 +64,27 @@ class _CopyvioWorkspace(object):

def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1."""
return float(delta.size()) / self._article.size()
def conf_with_article_and_delta(article, delta):
"""Calculate confidence using the article and delta chain sizes."""
return float(delta) / article

def conf_with_delta(delta):
"""Calculate confidence using just the delta chain size."""
# This piecewise function, CΔ(Δ), was derived from experimental
# data using reference points at (0, 0), (100, 0.5), (250, 0.75),
# (500, 0.9), and (1000, 0.95) with lim Δ→+∞ CΔ(Δ) = 1.
# A graph can be viewed here: ...
if delta <= 100:
return delta / (delta + 100)
elif delta <= 250:
return (delta - 25) / (delta + 50)
elif delta <= 500:
return (10.5 * delta - 750) / (10 * delta)
else:
return (delta - 50) / delta

return max(conf_with_article_and_delta(self._article.size, delta.size),
conf_with_delta(delta.size))

def _finish_early(self):
"""Finish handling links prematurely (if we've hit min_confidence)."""
@@ -98,7 +118,7 @@ class _CopyvioWorkspace(object):
from urlparse import urlparse
key = u".".join(urlparse(url).netloc.split(".")[-2:])

logmsg = "enqueue(): {0} {1} -> {2}"
logmsg = u"enqueue(): {0} {1} -> {2}"
if key in self._workers:
self._logger.debug(logmsg.format("PUT", key, url))
self._workers[key].queue.put(url)
@@ -121,7 +141,7 @@ class _CopyvioWorkspace(object):
"""Compare a source to the article, and update the working result."""
delta = MarkovChainIntersection(self._article, source)
confidence = self._calculate_confidence(delta)
self._logger.debug("compare(): {0} -> {1}".format(url, confidence))
self._logger.debug(u"compare(): {0} -> {1}".format(url, confidence))
with self._result_lock:
if confidence > self.best.confidence:
self.best = _WorkingResult(url, confidence, (source, delta))
@@ -268,7 +288,7 @@ class CopyvioMixIn(object):

raise exceptions.UnknownSearchEngineError(engine)

def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1):
def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1):
"""Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -290,7 +310,7 @@ class CopyvioMixIn(object):
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors.
"""
log = "Starting copyvio check for [[{0}]]"
log = u"Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None
@@ -305,7 +325,7 @@ class CopyvioMixIn(object):
else:
exclude = None

if article.size() < 20: # Auto-fail very small articles
if article.size < 20: # Auto-fail very small articles
result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
workspace.best.chains)
self._logger.info(result.get_log_message(self.title))
@@ -331,7 +351,7 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, url, min_confidence=0.5, max_time=30):
def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.

This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -352,7 +372,7 @@ class CopyvioMixIn(object):
Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised.
"""
log = "Starting copyvio compare for [[{0}]] against {1}"
log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
start_time = time()
until = (start_time + max_time) if max_time > 0 else None


+ 12
- 10
earwigbot/wiki/copyvios/markov.py Bestand weergeven

@@ -42,6 +42,15 @@ class MarkovChain(object):
for i in range(len(words) - self.degree + 1):
last = i + self.degree - 1
self.chain[tuple(words[i:last])][words[last]] += 1
self.size = self._get_size()

def _get_size(self):
"""Return the size of the Markov chain: the total number of nodes."""
size = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
size += hits
return size

def __repr__(self):
"""Return the canonical string representation of the MarkovChain."""
@@ -49,15 +58,7 @@ class MarkovChain(object):

def __str__(self):
"""Return a nice string representation of the MarkovChain."""
return "<MarkovChain of size {0}>".format(self.size())

def size(self):
"""Return the size of the Markov chain: the total number of nodes."""
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count
return "<MarkovChain of size {0}>".format(self.size)


class MarkovChainIntersection(MarkovChain):
@@ -76,6 +77,7 @@ class MarkovChainIntersection(MarkovChain):
if node in nodes2:
count2 = nodes2[node]
self.chain[word][node] = min(count1, count2)
self.size = self._get_size()

def __repr__(self):
"""Return the canonical string representation of the intersection."""
@@ -85,7 +87,7 @@ class MarkovChainIntersection(MarkovChain):
def __str__(self):
"""Return a nice string representation of the intersection."""
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>"
return res.format(self.size(), self.mc1, self.mc2)
return res.format(self.size, self.mc1, self.mc2)


EMPTY = MarkovChain("")


Laden…
Annuleren
Opslaan