Browse Source

Updates to copyright violation stuff.

tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
df7868da3e
1 changed file with 30 additions and 17 deletions
  1. +30
    -17
      earwigbot/wiki/copyright.py

+ 30
- 17
earwigbot/wiki/copyright.py View File

@@ -37,20 +37,27 @@ except ImportError:
from earwigbot.wiki.exceptions import * from earwigbot.wiki.exceptions import *


class _CopyvioCheckResult(object):
    """Container for the outcome of a copyright-violation check.

    Attributes:
        violation: value passed as *violation* (the caller passes True/False).
        confidence: value passed as *confidence* (the best score found).
        url: value passed as *url* (the best-matching URL, or None).
        queries: value passed as *queries* (number of queries made).
        article_chain: the *article* argument, stored unchanged.
        source_chain: first item of *chains*.
        delta_chain: second item of *chains*.
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a 2-sequence: (source chain, delta chain).
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        # Every field uses the "!r" conversion.  The previous "{3|r}" and
        # "{4|r}" were typos: str.format() treated "3|r" as a keyword name
        # and raised KeyError, so repr() never succeeded.
        r = ", ".join(("_CopyvioCheckResult(violation={0!r}",
                       "confidence={1!r}", "url={2!r}", "queries={3!r}",
                       "article={4!r}", "chains={5!r})"))
        return r.format(self.violation, self.confidence, self.url,
                        self.queries, self.article_chain,
                        (self.source_chain, self.delta_chain))




class _MarkovChain(object): class _MarkovChain(object):
START = "MRKV_CHAIN_START"
END = "MRKV_CHAIN_END"
START = -1
END = -2


def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
@@ -60,7 +67,10 @@ class _MarkovChain(object):
for word in words: for word in words:
self.chain[prev][word] += 1 self.chain[prev][word] += 1
prev = word prev = word
self.chain[word][self.END] += 1
try: # This won't work if the source text is completely blank
self.chain[word][self.END] += 1
except KeyError:
pass


def size(self): def size(self):
count = 0 count = 0
@@ -180,21 +190,19 @@ class CopyrightMixin(object):
def _copyvio_strip_article(self, content):
    """Return the article text to be used for comparison.

    Currently a pass-through: *content* is returned unchanged.
    """
    return content


def _copyvio_chunk_article(self, content, max_chunks):
    """Split *content* into chunks to be checked one at a time.

    Currently returns a single chunk holding the whole of *content*;
    *max_chunks* is accepted but not used yet.
    """
    return [content]


def _copyvio_compare_content(self, article, url):
    """Score how much of *article* also appears in the page at *url*.

    *article* is the article's Markov chain; the caller builds it once and
    passes the same chain for every URL.  The page is fetched with
    _open_url_ignoring_errors(), cleaned with _copyvio_strip_html() and
    turned into its own chain, which is then intersected with *article*.

    Returns a 2-tuple ``(confidence, (source_chain, delta_chain))`` where
    confidence is ``delta.size() / article.size()``.  If the page cannot be
    fetched (or is empty) the result is ``(0, (None, None))``.
    """
    html = self._open_url_ignoring_errors(url)
    if not html:
        # The caller unpacks the result as ``conf, chains``; returning a
        # bare 0 here raised TypeError at the call site.  The placeholder
        # chains are never stored because a confidence of 0 cannot beat the
        # caller's initial best confidence.
        return 0, (None, None)

    source = _MarkovChain(self._copyvio_strip_html(html))
    delta = _MarkovChainIntersection(article, source)

    article_size = article.size()
    if not article_size:
        # An empty article chain cannot match anything; avoid dividing by 0.
        return 0, (source, delta)
    # float() keeps the ratio fractional even where "/" on two ints would
    # truncate (Python 2 without "from __future__ import division").
    return float(delta.size()) / article_size, (source, delta)

def copyvio_check(self, engine, credentials, min_confidence=0.75,
def copyvio_check(self, engine, credentials, min_confidence=0.5,
max_queries=-1, interquery_sleep=1, force=False): max_queries=-1, interquery_sleep=1, force=False):
"""Check the page for copyright violations. """Check the page for copyright violations.


@@ -225,9 +233,12 @@ class CopyrightMixin(object):
best_confidence = 0 best_confidence = 0
best_match = None best_match = None
num_queries = 0 num_queries = 0
empty = _MarkovChain("")
best_chains = (empty, _MarkovChainIntersection(empty, empty))
content = self.get(force) content = self.get(force)
clean = self._copyvio_strip_article(content) clean = self._copyvio_strip_article(content)
chunks = self._copyvio_chunk_article(clean)
chunks = self._copyvio_chunk_article(clean, max_queries)
article_chain = _MarkovChain(clean)
last_query = time() last_query = time()


while (chunks and best_confidence < min_confidence and while (chunks and best_confidence < min_confidence and
@@ -236,10 +247,11 @@ class CopyrightMixin(object):
urls = [url for url in urls if url not in handled_urls] urls = [url for url in urls if url not in handled_urls]
for url in urls: for url in urls:
handled_urls.append(url) handled_urls.append(url)
confidence = self._copyvio_compare_content(clean, url)
if confidence > best_confidence:
best_confidence = confidence
conf, chains = self._copyvio_compare_content(article_chain, url)
if conf > best_confidence:
best_confidence = conf
best_match = url best_match = url
best_chains = chains
num_queries += 1 num_queries += 1
diff = time() - last_query diff = time() - last_query
if diff < interquery_sleep: if diff < interquery_sleep:
@@ -250,4 +262,5 @@ class CopyrightMixin(object):
v = True v = True
else: else:
v = False v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries)
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)

Loading…
Cancel
Save