Browse Source

Restructuring copyvio stuff as its own package.

tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
e6a381f3f7
3 changed files with 93 additions and 71 deletions
  1. +21
    -62
      earwigbot/wiki/copyvios/__init__.py
  2. +63
    -0
      earwigbot/wiki/copyvios/markov.py
  3. +9
    -9
      earwigbot/wiki/page.py

earwigbot/wiki/copyright.py → earwigbot/wiki/copyvios/__init__.py View File

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights # in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions: # furnished to do so, subject to the following conditions:
#
#
# The above copyright notice and this permission notice shall be included in # The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software. # all copies or substantial portions of the Software.
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,11 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


from collections import defaultdict
from functools import partial from functools import partial
from gzip import GzipFile from gzip import GzipFile
from json import loads from json import loads
from re import sub, UNICODE
from StringIO import StringIO from StringIO import StringIO
from time import sleep, time from time import sleep, time
from urllib import quote_plus, urlencode from urllib import quote_plus, urlencode
@@ -36,8 +34,9 @@ except ImportError:
oauth = None oauth = None


from earwigbot.wiki.exceptions import * from earwigbot.wiki.exceptions import *
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection


class _CopyvioCheckResult(object):
class CopyvioCheckResult(object):
def __init__(self, violation, confidence, url, queries, article, chains): def __init__(self, violation, confidence, url, queries, article, chains):
self.violation = violation self.violation = violation
self.confidence = confidence self.confidence = confidence
@@ -48,51 +47,11 @@ class _CopyvioCheckResult(object):
self.delta_chain = chains[1] self.delta_chain = chains[1]


def __repr__(self): def __repr__(self):
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
return r.format(self.violation, self.confidence, self.url, self.queries) return r.format(self.violation, self.confidence, self.url, self.queries)




class _MarkovChain(object):
START = -1
END = -2

def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START
for word in words:
self.chain[prev][word] += 1
prev = word
try: # This won't work if the source text is completely blank
self.chain[word][self.END] += 1
except KeyError:
pass

def size(self):
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count


class _MarkovChainIntersection(_MarkovChain):
def __init__(self, mc1, mc2):
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
c1 = mc1.chain
c2 = mc2.chain

for word, nodes1 in c1.iteritems():
if word in c2:
nodes2 = c2[word]
for node, count1 in nodes1.iteritems():
if node in nodes2:
count2 = nodes2[node]
self.chain[word][node] = min(count1, count2)


class CopyrightMixin(object):
class CopyvioMixin(object):
""" """
EarwigBot's Wiki Toolset: Copyright Violation Mixin EarwigBot's Wiki Toolset: Copyright Violation Mixin


@@ -220,8 +179,8 @@ class CopyrightMixin(object):
if not html: if not html:
return 0 return 0


source = _MarkovChain(self._copyvio_strip_html(html))
delta = _MarkovChainIntersection(article, source)
source = MarkovChain(self._copyvio_strip_html(html))
delta = MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta) return float(delta.size()) / article.size(), (source, delta)


def copyvio_check(self, min_confidence=0.5, max_queries=-1, def copyvio_check(self, min_confidence=0.5, max_queries=-1,
@@ -255,17 +214,17 @@ class CopyrightMixin(object):
best_confidence = 0 best_confidence = 0
best_match = None best_match = None
num_queries = 0 num_queries = 0
empty = _MarkovChain("")
best_chains = (empty, _MarkovChainIntersection(empty, empty))
empty = MarkovChain("")
best_chains = (empty, MarkovChainIntersection(empty, empty))
content = self.get(force) content = self.get(force)
clean = self._copyvio_strip_article(content) clean = self._copyvio_strip_article(content)
chunks = self._copyvio_chunk_article(clean, max_queries) chunks = self._copyvio_chunk_article(clean, max_queries)
article_chain = _MarkovChain(clean)
article_chain = MarkovChain(clean)
last_query = time() last_query = time()


if article_chain.size() < 20: # Auto-fail very small articles if article_chain.size() < 20: # Auto-fail very small articles
return _CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)
return CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)


while (chunks and best_confidence < min_confidence and while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)): (max_queries < 0 or num_queries < max_queries)):
@@ -288,8 +247,8 @@ class CopyrightMixin(object):
v = True v = True
else: else:
v = False v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)
return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)


def copyvio_compare(self, url, min_confidence=0.5, force=False): def copyvio_compare(self, url, min_confidence=0.5, force=False):
"""Check the page like copyvio_check(), but against a specific URL. """Check the page like copyvio_check(), but against a specific URL.
@@ -298,7 +257,7 @@ class CopyrightMixin(object):
comparison is made using Markov chains and the result is returned in a comparison is made using Markov chains and the result is returned in a
_CopyvioCheckResult object - without using a search engine, as the _CopyvioCheckResult object - without using a search engine, as the
suspected "violated" URL is supplied from the start. suspected "violated" URL is supplied from the start.
Its primary use is to generate a result when the URL is retrieved from Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a a cache, like the one used in EarwigBot's Toolserver site. After a
search is done, the resulting URL is stored in a cache for 24 hours so search is done, the resulting URL is stored in a cache for 24 hours so
@@ -313,12 +272,12 @@ class CopyrightMixin(object):
""" """
content = self.get(force) content = self.get(force)
clean = self._copyvio_strip_article(content) clean = self._copyvio_strip_article(content)
article_chain = _MarkovChain(clean)
article_chain = MarkovChain(clean)
confidence, chains = self._copyvio_compare_content(article_chain, url) confidence, chains = self._copyvio_compare_content(article_chain, url)


if confidence >= min_confidence: if confidence >= min_confidence:
is_violation = True is_violation = True
else: else:
is_violation = False is_violation = False
return _CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)
return CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)

+ 63
- 0
earwigbot/wiki/copyvios/markov.py View File

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from re import sub, UNICODE

class MarkovChain(object):
    """Word-level Markov chain built from a piece of text.

    The text is lower-cased, every character that is not a word character,
    whitespace, or a hyphen is removed, and the remainder is split on
    whitespace. ``chain[a][b]`` then holds the number of times word ``b``
    directly follows word ``a``. The ``START`` and ``END`` sentinels stand in
    for the beginning and the end of the text.
    """
    START = -1  # Sentinel key that precedes the first word
    END = -2  # Sentinel key that follows the last word

    def __init__(self, text):
        """Build the chain from *text*; blank text yields an empty chain."""
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(int))
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        # Only close the chain when there was at least one word. The former
        # "try/except KeyError" did not guard this case: with no words the
        # loop variable was never bound, which raises a NameError subclass,
        # and a defaultdict lookup does not raise KeyError anyway.
        if words:
            self.chain[prev][self.END] += 1

    def size(self):
        """Return the total number of transitions recorded in the chain."""
        # .values() exists on both Python 2 and 3, unlike .itervalues().
        return sum(sum(node.values()) for node in self.chain.values())


class MarkovChainIntersection(MarkovChain):
    """Chain holding the transitions that two Markov chains have in common.

    For every transition present in both *mc1* and *mc2*, the stored count is
    the smaller of the two counts. ``MarkovChain.__init__`` is not called, so
    only ``chain`` is set on the instance (there is no ``text`` attribute);
    ``size()`` is inherited from ``MarkovChain``.
    """

    def __init__(self, mc1, mc2):
        """Intersect the ``chain`` mappings of *mc1* and *mc2*."""
        self.chain = defaultdict(lambda: defaultdict(int))
        other = mc2.chain
        # .items() exists on both Python 2 and 3, unlike .iteritems().
        for word, nodes1 in mc1.chain.items():
            # Test membership before indexing so the source defaultdicts are
            # never populated with empty entries as a side effect.
            if word not in other:
                continue
            nodes2 = other[word]
            for node, count1 in nodes1.items():
                if node in nodes2:
                    self.chain[word][node] = min(count1, nodes2[node])

+ 9
- 9
earwigbot/wiki/page.py View File

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights # in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions: # furnished to do so, subject to the following conditions:
#
#
# The above copyright notice and this permission notice shall be included in # The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software. # all copies or substantial portions of the Software.
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -25,10 +25,10 @@ import re
from time import gmtime, strftime from time import gmtime, strftime
from urllib import quote from urllib import quote


from earwigbot.wiki.copyright import CopyrightMixin
from earwigbot.wiki.copyvios import CopyvioMixin
from earwigbot.wiki.exceptions import * from earwigbot.wiki.exceptions import *


class Page(CopyrightMixin):
class Page(CopyvioMixin):
""" """
EarwigBot's Wiki Toolset: Page Class EarwigBot's Wiki Toolset: Page Class


@@ -264,7 +264,7 @@ class Page(CopyrightMixin):
If `params` is given, we'll use it as our API query parameters. If `params` is given, we'll use it as our API query parameters.
Otherwise, we'll build params using the given kwargs via Otherwise, we'll build params using the given kwargs via
_build_edit_params(). _build_edit_params().
We'll then try to do the API query, and catch any errors the API raises We'll then try to do the API query, and catch any errors the API raises
in _handle_edit_errors(). We'll then throw these back as subclasses of in _handle_edit_errors(). We'll then throw these back as subclasses of
EditError. EditError.
@@ -275,7 +275,7 @@ class Page(CopyrightMixin):
if not self._token: if not self._token:
e = "You don't have permission to edit this page." e = "You don't have permission to edit this page."
raise PermissionsError(e) raise PermissionsError(e)
# Weed out invalid pages before we get too far: # Weed out invalid pages before we get too far:
self._force_validity() self._force_validity()


@@ -336,7 +336,7 @@ class Page(CopyrightMixin):
# Page does not exist; don't edit if it already exists: # Page does not exist; don't edit if it already exists:
params["createonly"] = "true" params["createonly"] = "true"
else: else:
params["recreate"] = "true"
params["recreate"] = "true"


return params return params




Loading…
Cancel
Save