|
|
@@ -1,17 +1,17 @@ |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
# |
|
|
|
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> |
|
|
|
# |
|
|
|
# |
|
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy |
|
|
|
# of this software and associated documentation files (the "Software"), to deal |
|
|
|
# in the Software without restriction, including without limitation the rights |
|
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|
|
|
# copies of the Software, and to permit persons to whom the Software is |
|
|
|
# copies of the Software, and to permit persons to whom the Software is |
|
|
|
# furnished to do so, subject to the following conditions: |
|
|
|
# |
|
|
|
# |
|
|
|
# The above copyright notice and this permission notice shall be included in |
|
|
|
# all copies or substantial portions of the Software. |
|
|
|
# |
|
|
|
# |
|
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|
|
@@ -20,11 +20,9 @@ |
|
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
|
|
# SOFTWARE. |
|
|
|
|
|
|
|
from collections import defaultdict |
|
|
|
from functools import partial |
|
|
|
from gzip import GzipFile |
|
|
|
from json import loads |
|
|
|
from re import sub, UNICODE |
|
|
|
from StringIO import StringIO |
|
|
|
from time import sleep, time |
|
|
|
from urllib import quote_plus, urlencode |
|
|
@@ -36,8 +34,9 @@ except ImportError: |
|
|
|
oauth = None |
|
|
|
|
|
|
|
from earwigbot.wiki.exceptions import * |
|
|
|
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection |
|
|
|
|
|
|
|
class _CopyvioCheckResult(object): |
|
|
|
class CopyvioCheckResult(object): |
|
|
|
def __init__(self, violation, confidence, url, queries, article, chains): |
|
|
|
self.violation = violation |
|
|
|
self.confidence = confidence |
|
|
@@ -48,51 +47,11 @@ class _CopyvioCheckResult(object): |
|
|
|
self.delta_chain = chains[1] |
|
|
|
|
|
|
|
def __repr__(self):
    """Return a constructor-style summary of this check result.

    The string lists the ``violation``, ``confidence``, ``url`` and
    ``queries`` attributes, each rendered with ``repr()``.
    """
    # Every field uses the "!r" conversion. The last one was previously
    # written as "{3|r}", which str.format() parses as a keyword field
    # named "3|r" and therefore raises KeyError on every call.
    r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
    return r.format(self.violation, self.confidence, self.url, self.queries)
|
|
|
|
|
|
|
|
|
|
|
class _MarkovChain(object):
    """First-order, word-level Markov chain built from a piece of text.

    ``chain`` is a nested mapping: ``chain[a][b]`` is the number of times the
    word ``b`` directly follows the word ``a`` in the normalized text. The
    sentinels ``START`` and ``END`` stand in for "before the first word" and
    "after the last word" respectively.
    """
    START = -1
    END = -2

    def __init__(self, text):
        """Build the chain from *text*.

        The text is lowercased, every character that is not a word
        character, whitespace or a hyphen is removed, and the remainder is
        split on whitespace. Blank text (or text with no words left after
        normalization) produces an empty chain.
        """
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()

        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        # Only close the chain when there is at least one word. With no words
        # the loop variable is never bound, and the old try/except KeyError
        # did not catch the resulting UnboundLocalError.
        if words:
            self.chain[prev][self.END] += 1

    def size(self):
        """Return the total number of transitions recorded in the chain."""
        # values() (rather than itervalues()) gives the same result on
        # Python 2 and also works on Python 3.
        return sum(sum(node.values()) for node in self.chain.values())
|
|
|
|
|
|
|
|
|
|
|
class _MarkovChainIntersection(_MarkovChain):
    """Chain holding only the transitions that two other chains share.

    For every ``word -> follower`` transition present in both input chains,
    the stored count is the smaller of the two counts. ``size()`` is
    inherited from the parent class; ``text`` is not set on instances.
    """

    def __init__(self, mc1, mc2):
        """Build the intersection of the chains *mc1* and *mc2*."""
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        first, second = mc1.chain, mc2.chain

        for word, followers in first.iteritems():
            # Membership tests do not insert keys into a defaultdict, so the
            # input chains are left unmodified.
            if word not in second:
                continue
            other_followers = second[word]
            for follower, hits in followers.iteritems():
                if follower in other_followers:
                    shared = min(hits, other_followers[follower])
                    self.chain[word][follower] = shared
|
|
|
|
|
|
|
|
|
|
|
class CopyrightMixin(object): |
|
|
|
class CopyvioMixin(object): |
|
|
|
""" |
|
|
|
EarwigBot's Wiki Toolset: Copyright Violation Mixin |
|
|
|
|
|
|
@@ -220,8 +179,8 @@ class CopyrightMixin(object): |
|
|
|
if not html: |
|
|
|
return 0 |
|
|
|
|
|
|
|
source = _MarkovChain(self._copyvio_strip_html(html)) |
|
|
|
delta = _MarkovChainIntersection(article, source) |
|
|
|
source = MarkovChain(self._copyvio_strip_html(html)) |
|
|
|
delta = MarkovChainIntersection(article, source) |
|
|
|
return float(delta.size()) / article.size(), (source, delta) |
|
|
|
|
|
|
|
def copyvio_check(self, min_confidence=0.5, max_queries=-1, |
|
|
@@ -255,17 +214,17 @@ class CopyrightMixin(object): |
|
|
|
best_confidence = 0 |
|
|
|
best_match = None |
|
|
|
num_queries = 0 |
|
|
|
empty = _MarkovChain("") |
|
|
|
best_chains = (empty, _MarkovChainIntersection(empty, empty)) |
|
|
|
empty = MarkovChain("") |
|
|
|
best_chains = (empty, MarkovChainIntersection(empty, empty)) |
|
|
|
content = self.get(force) |
|
|
|
clean = self._copyvio_strip_article(content) |
|
|
|
chunks = self._copyvio_chunk_article(clean, max_queries) |
|
|
|
article_chain = _MarkovChain(clean) |
|
|
|
article_chain = MarkovChain(clean) |
|
|
|
last_query = time() |
|
|
|
|
|
|
|
if article_chain.size() < 20: # Auto-fail very small articles |
|
|
|
return _CopyvioCheckResult(False, best_confidence, best_match, |
|
|
|
num_queries, article_chain, best_chains) |
|
|
|
return CopyvioCheckResult(False, best_confidence, best_match, |
|
|
|
num_queries, article_chain, best_chains) |
|
|
|
|
|
|
|
while (chunks and best_confidence < min_confidence and |
|
|
|
(max_queries < 0 or num_queries < max_queries)): |
|
|
@@ -288,8 +247,8 @@ class CopyrightMixin(object): |
|
|
|
v = True |
|
|
|
else: |
|
|
|
v = False |
|
|
|
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, |
|
|
|
article_chain, best_chains) |
|
|
|
return CopyvioCheckResult(v, best_confidence, best_match, num_queries, |
|
|
|
article_chain, best_chains) |
|
|
|
|
|
|
|
def copyvio_compare(self, url, min_confidence=0.5, force=False): |
|
|
|
"""Check the page like copyvio_check(), but against a specific URL. |
|
|
@@ -298,7 +257,7 @@ class CopyrightMixin(object): |
|
|
|
comparison is made using Markov chains and the result is returned in a |
|
|
|
_CopyvioCheckResult object - without using a search engine, as the |
|
|
|
suspected "violated" URL is supplied from the start. |
|
|
|
|
|
|
|
|
|
|
|
Its primary use is to generate a result when the URL is retrieved from |
|
|
|
a cache, like the one used in EarwigBot's Toolserver site. After a |
|
|
|
search is done, the resulting URL is stored in a cache for 24 hours so |
|
|
@@ -313,12 +272,12 @@ class CopyrightMixin(object): |
|
|
|
""" |
|
|
|
content = self.get(force) |
|
|
|
clean = self._copyvio_strip_article(content) |
|
|
|
article_chain = _MarkovChain(clean) |
|
|
|
article_chain = MarkovChain(clean) |
|
|
|
confidence, chains = self._copyvio_compare_content(article_chain, url) |
|
|
|
|
|
|
|
if confidence >= min_confidence: |
|
|
|
is_violation = True |
|
|
|
else: |
|
|
|
is_violation = False |
|
|
|
return _CopyvioCheckResult(is_violation, confidence, url, 0, |
|
|
|
article_chain, chains) |
|
|
|
return CopyvioCheckResult(is_violation, confidence, url, 0, |
|
|
|
article_chain, chains) |