Browse Source

Restructuring copyvio stuff as its own package.

tags/v0.1^2
Ben Kurtovic 13 years ago
parent
commit
e6a381f3f7
3 changed files with 93 additions and 71 deletions
  1. +21
    -62
      earwigbot/wiki/copyvios/__init__.py
  2. +63
    -0
      earwigbot/wiki/copyvios/markov.py
  3. +9
    -9
      earwigbot/wiki/page.py

earwigbot/wiki/copyright.py → earwigbot/wiki/copyvios/__init__.py View File

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,11 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from functools import partial
from gzip import GzipFile
from json import loads
from re import sub, UNICODE
from StringIO import StringIO
from time import sleep, time
from urllib import quote_plus, urlencode
@@ -36,8 +34,9 @@ except ImportError:
oauth = None

from earwigbot.wiki.exceptions import *
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection

class _CopyvioCheckResult(object):
class CopyvioCheckResult(object):
def __init__(self, violation, confidence, url, queries, article, chains):
self.violation = violation
self.confidence = confidence
@@ -48,51 +47,11 @@ class _CopyvioCheckResult(object):
self.delta_chain = chains[1]

def __repr__(self):
    """Return a debugging representation of the check result.

    The string shows the ``violation``, ``confidence``, ``url`` and
    ``queries`` attributes, each rendered with ``repr()``.
    """
    # The last field must use the "!r" conversion like the others; the
    # previous "{3|r}" was parsed as a keyword field name and raised KeyError.
    r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
    return r.format(self.violation, self.confidence, self.url, self.queries)


class _MarkovChain(object):
START = -1
END = -2

def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START
for word in words:
self.chain[prev][word] += 1
prev = word
try: # This won't work if the source text is completely blank
self.chain[word][self.END] += 1
except KeyError:
pass

def size(self):
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count


class _MarkovChainIntersection(_MarkovChain):
def __init__(self, mc1, mc2):
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
c1 = mc1.chain
c2 = mc2.chain

for word, nodes1 in c1.iteritems():
if word in c2:
nodes2 = c2[word]
for node, count1 in nodes1.iteritems():
if node in nodes2:
count2 = nodes2[node]
self.chain[word][node] = min(count1, count2)


class CopyrightMixin(object):
class CopyvioMixin(object):
"""
EarwigBot's Wiki Toolset: Copyright Violation Mixin

@@ -220,8 +179,8 @@ class CopyrightMixin(object):
if not html:
return 0

source = _MarkovChain(self._copyvio_strip_html(html))
delta = _MarkovChainIntersection(article, source)
source = MarkovChain(self._copyvio_strip_html(html))
delta = MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta)

def copyvio_check(self, min_confidence=0.5, max_queries=-1,
@@ -255,17 +214,17 @@ class CopyrightMixin(object):
best_confidence = 0
best_match = None
num_queries = 0
empty = _MarkovChain("")
best_chains = (empty, _MarkovChainIntersection(empty, empty))
empty = MarkovChain("")
best_chains = (empty, MarkovChainIntersection(empty, empty))
content = self.get(force)
clean = self._copyvio_strip_article(content)
chunks = self._copyvio_chunk_article(clean, max_queries)
article_chain = _MarkovChain(clean)
article_chain = MarkovChain(clean)
last_query = time()

if article_chain.size() < 20: # Auto-fail very small articles
return _CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)
return CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)

while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)):
@@ -288,8 +247,8 @@ class CopyrightMixin(object):
v = True
else:
v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)
return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)

def copyvio_compare(self, url, min_confidence=0.5, force=False):
"""Check the page like copyvio_check(), but against a specific URL.
@@ -298,7 +257,7 @@ class CopyrightMixin(object):
comparison is made using Markov chains and the result is returned in a
CopyvioCheckResult object - without using a search engine, as the
suspected "violated" URL is supplied from the start.
Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a
search is done, the resulting URL is stored in a cache for 24 hours so
@@ -313,12 +272,12 @@ class CopyrightMixin(object):
"""
content = self.get(force)
clean = self._copyvio_strip_article(content)
article_chain = _MarkovChain(clean)
article_chain = MarkovChain(clean)
confidence, chains = self._copyvio_compare_content(article_chain, url)

if confidence >= min_confidence:
is_violation = True
else:
is_violation = False
return _CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)
return CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)

+ 63
- 0
earwigbot/wiki/copyvios/markov.py View File

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from re import sub, UNICODE

class MarkovChain(object):
    """Word-level Markov chain built from a piece of text.

    ``chain`` maps each node to a mapping of the nodes that follow it and the
    number of times that transition occurs. Nodes are the words of the text,
    plus the ``START`` and ``END`` sentinels marking the beginning and end of
    the text.
    """
    START = -1
    END = -2

    def __init__(self, text):
        """Build the chain from *text*.

        The text is lowercased and every character that is not a word
        character, whitespace or a hyphen is removed; the result is split on
        whitespace to obtain the words. The original text is kept in
        ``self.text``.
        """
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        # Blank text has no final word to link to END. Test for it explicitly:
        # the loop variable is never bound in that case, and the previous
        # "except KeyError" did not catch the resulting UnboundLocalError.
        if words:
            self.chain[prev][self.END] += 1

    def size(self):
        """Return the total number of transitions recorded in the chain."""
        return sum(hits
                   for node in self.chain.values()
                   for hits in node.values())


class MarkovChainIntersection(MarkovChain):
    """Chain holding only the transitions common to two Markov chains.

    For every transition present in both ``mc1`` and ``mc2``, the stored count
    is the smaller of the two chains' counts for that transition.
    """

    def __init__(self, mc1, mc2):
        """Build the intersection of the chains of *mc1* and *mc2*."""
        self.chain = shared = defaultdict(lambda: defaultdict(lambda: 0))
        first, second = mc1.chain, mc2.chain

        for state, followers in first.iteritems():
            if state not in second:
                continue  # No transitions leave this node in the other chain
            other_followers = second[state]
            for follower, hits in followers.iteritems():
                if follower not in other_followers:
                    continue  # Transition exists in only one of the chains
                shared[state][follower] = min(hits, other_followers[follower])

+ 9
- 9
earwigbot/wiki/page.py View File

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -25,10 +25,10 @@ import re
from time import gmtime, strftime
from urllib import quote

from earwigbot.wiki.copyright import CopyrightMixin
from earwigbot.wiki.copyvios import CopyvioMixin
from earwigbot.wiki.exceptions import *

class Page(CopyrightMixin):
class Page(CopyvioMixin):
"""
EarwigBot's Wiki Toolset: Page Class

@@ -264,7 +264,7 @@ class Page(CopyrightMixin):
If `params` is given, we'll use it as our API query parameters.
Otherwise, we'll build params using the given kwargs via
_build_edit_params().
We'll then try to do the API query, and catch any errors the API raises
in _handle_edit_errors(). We'll then throw these back as subclasses of
EditError.
@@ -275,7 +275,7 @@ class Page(CopyrightMixin):
if not self._token:
e = "You don't have permission to edit this page."
raise PermissionsError(e)
# Weed out invalid pages before we get too far:
self._force_validity()

@@ -336,7 +336,7 @@ class Page(CopyrightMixin):
# Page does not exist; don't edit if it already exists:
params["createonly"] = "true"
else:
params["recreate"] = "true"
params["recreate"] = "true"

return params



Loading…
Cancel
Save