Browse Source

Update with new confidence function; fix unicode.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
e2d7c7aef6
3 changed files with 42 additions and 19 deletions
  1. +2
    -1
      CHANGELOG
  2. +28
    -8
      earwigbot/wiki/copyvios/__init__.py
  3. +12
    -10
      earwigbot/wiki/copyvios/markov.py

+ 2
- 1
CHANGELOG View File

@@ -5,7 +5,8 @@ v0.2 (unreleased):
circumstances. circumstances.
- Added copyvio detector functionality: specifying a max time for checks; - Added copyvio detector functionality: specifying a max time for checks;
improved exclusion support. URL loading and parsing is parallelized to speed improved exclusion support. URL loading and parsing is parallelized to speed
up check times. Fixed assorted bugs.
up check times, with a multi-threaded worker model that avoids concurrent
requests to the same domain. Fixed assorted bugs.
- Added support for Wikimedia Labs when creating a config file. - Added support for Wikimedia Labs when creating a config file.
- Added and improved lazy importing for various dependencies. - Added and improved lazy importing for various dependencies.
- Fixed a bug in job scheduling. - Fixed a bug in job scheduling.


+ 28
- 8
earwigbot/wiki/copyvios/__init__.py View File

@@ -64,7 +64,27 @@ class _CopyvioWorkspace(object):


def _calculate_confidence(self, delta): def _calculate_confidence(self, delta):
"""Return the confidence of a violation as a float between 0 and 1.""" """Return the confidence of a violation as a float between 0 and 1."""
return float(delta.size()) / self._article.size()
def conf_with_article_and_delta(article, delta):
    """Return the delta chain size as a fraction of the article chain size."""
    # Convert the numerator first so the quotient is a float even when
    # both sizes are plain integers.
    delta_size = float(delta)
    return delta_size / article

def conf_with_delta(delta):
    """Calculate confidence using just the delta chain size.

    Returns a float: 0 for an empty delta chain, rising toward (but never
    reaching) 1 as the delta chain size grows.
    """
    # This piecewise function, CΔ(Δ), was derived from experimental
    # data using reference points at (0, 0), (100, 0.5), (250, 0.75),
    # (500, 0.9), and (1000, 0.95) with lim Δ→+∞ CΔ(Δ) = 1.
    # A graph can be viewed here: ...
    #
    # Chain sizes are integer hit counts. Convert once up front so every
    # branch performs true division; with integer operands Python 2 would
    # truncate three of the four branches to 0.
    delta = float(delta)
    if delta <= 100:
        return delta / (delta + 100)
    elif delta <= 250:
        return (delta - 25) / (delta + 50)
    elif delta <= 500:
        return (10.5 * delta - 750) / (10 * delta)
    else:
        return (delta - 50) / delta

return max(conf_with_article_and_delta(self._article.size, delta.size),
conf_with_delta(delta.size))


def _finish_early(self): def _finish_early(self):
"""Finish handling links prematurely (if we've hit min_confidence).""" """Finish handling links prematurely (if we've hit min_confidence)."""
@@ -98,7 +118,7 @@ class _CopyvioWorkspace(object):
from urlparse import urlparse from urlparse import urlparse
key = u".".join(urlparse(url).netloc.split(".")[-2:]) key = u".".join(urlparse(url).netloc.split(".")[-2:])


logmsg = "enqueue(): {0} {1} -> {2}"
logmsg = u"enqueue(): {0} {1} -> {2}"
if key in self._workers: if key in self._workers:
self._logger.debug(logmsg.format("PUT", key, url)) self._logger.debug(logmsg.format("PUT", key, url))
self._workers[key].queue.put(url) self._workers[key].queue.put(url)
@@ -121,7 +141,7 @@ class _CopyvioWorkspace(object):
"""Compare a source to the article, and update the working result.""" """Compare a source to the article, and update the working result."""
delta = MarkovChainIntersection(self._article, source) delta = MarkovChainIntersection(self._article, source)
confidence = self._calculate_confidence(delta) confidence = self._calculate_confidence(delta)
self._logger.debug("compare(): {0} -> {1}".format(url, confidence))
self._logger.debug(u"compare(): {0} -> {1}".format(url, confidence))
with self._result_lock: with self._result_lock:
if confidence > self.best.confidence: if confidence > self.best.confidence:
self.best = _WorkingResult(url, confidence, (source, delta)) self.best = _WorkingResult(url, confidence, (source, delta))
@@ -268,7 +288,7 @@ class CopyvioMixIn(object):


raise exceptions.UnknownSearchEngineError(engine) raise exceptions.UnknownSearchEngineError(engine)


def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1):
def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1):
"""Check the page for copyright violations. """Check the page for copyright violations.


Returns a :class:`.CopyvioCheckResult` object with information on the Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -290,7 +310,7 @@ class CopyvioMixIn(object):
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors. errors.
""" """
log = "Starting copyvio check for [[{0}]]"
log = u"Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title)) self._logger.info(log.format(self.title))
start_time = time() start_time = time()
until = (start_time + max_time) if max_time > 0 else None until = (start_time + max_time) if max_time > 0 else None
@@ -305,7 +325,7 @@ class CopyvioMixIn(object):
else: else:
exclude = None exclude = None


if article.size() < 20: # Auto-fail very small articles
if article.size < 20: # Auto-fail very small articles
result = CopyvioCheckResult(False, 0.0, None, 0, 0, article, result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
workspace.best.chains) workspace.best.chains)
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
@@ -331,7 +351,7 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


def copyvio_compare(self, url, min_confidence=0.5, max_time=30):
def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
"""Check the page like :py:meth:`copyvio_check` against a specific URL. """Check the page like :py:meth:`copyvio_check` against a specific URL.


This is essentially a reduced version of :meth:`copyvio_check` - a This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -352,7 +372,7 @@ class CopyvioMixIn(object):
Since no searching is done, neither :exc:`.UnknownSearchEngineError` Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised. nor :exc:`.SearchQueryError` will be raised.
""" """
log = "Starting copyvio compare for [[{0}]] against {1}"
log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url)) self._logger.info(log.format(self.title, url))
start_time = time() start_time = time()
until = (start_time + max_time) if max_time > 0 else None until = (start_time + max_time) if max_time > 0 else None


+ 12
- 10
earwigbot/wiki/copyvios/markov.py View File

@@ -42,6 +42,15 @@ class MarkovChain(object):
for i in range(len(words) - self.degree + 1): for i in range(len(words) - self.degree + 1):
last = i + self.degree - 1 last = i + self.degree - 1
self.chain[tuple(words[i:last])][words[last]] += 1 self.chain[tuple(words[i:last])][words[last]] += 1
self.size = self._get_size()

def _get_size(self):
"""Return the size of the Markov chain: the total number of nodes."""
size = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
size += hits
return size


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the MarkovChain.""" """Return the canonical string representation of the MarkovChain."""
@@ -49,15 +58,7 @@ class MarkovChain(object):


def __str__(self): def __str__(self):
"""Return a nice string representation of the MarkovChain.""" """Return a nice string representation of the MarkovChain."""
return "<MarkovChain of size {0}>".format(self.size())

def size(self):
"""Return the size of the Markov chain: the total number of nodes."""
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count
return "<MarkovChain of size {0}>".format(self.size)




class MarkovChainIntersection(MarkovChain): class MarkovChainIntersection(MarkovChain):
@@ -76,6 +77,7 @@ class MarkovChainIntersection(MarkovChain):
if node in nodes2: if node in nodes2:
count2 = nodes2[node] count2 = nodes2[node]
self.chain[word][node] = min(count1, count2) self.chain[word][node] = min(count1, count2)
self.size = self._get_size()


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the intersection.""" """Return the canonical string representation of the intersection."""
@@ -85,7 +87,7 @@ class MarkovChainIntersection(MarkovChain):
def __str__(self): def __str__(self):
"""Return a nice string representation of the intersection.""" """Return a nice string representation of the intersection."""
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>"
return res.format(self.size(), self.mc1, self.mc2)
return res.format(self.size, self.mc1, self.mc2)




EMPTY = MarkovChain("") EMPTY = MarkovChain("")


Loading…
Cancel
Save