@@ -0,0 +1,47 @@ | |||
copyvios Package | |||
================ | |||
:mod:`copyvios` Package | |||
----------------------- | |||
.. automodule:: earwigbot.wiki.copyvios | |||
:members: | |||
:undoc-members: | |||
:mod:`exclusions` Module | |||
------------------------ | |||
.. automodule:: earwigbot.wiki.copyvios.exclusions | |||
:members: | |||
:undoc-members: | |||
:mod:`markov` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.markov | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`parsers` Module | |||
--------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.parsers | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`result` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.result | |||
:members: | |||
:undoc-members: | |||
:mod:`search` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.search | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -22,13 +22,6 @@ wiki Package | |||
:members: | |||
:undoc-members: | |||
:mod:`copyright` Module | |||
.. automodule:: earwigbot.wiki.copyright | |||
:members: | |||
:undoc-members: | |||
:mod:`page` Module | |||
------------------ | |||
@@ -57,3 +50,10 @@ wiki Package | |||
.. automodule:: earwigbot.wiki.user | |||
:members: | |||
:undoc-members: | |||
Subpackages | |||
----------- | |||
.. toctree:: | |||
earwigbot.wiki.copyvios |
@@ -2,6 +2,6 @@ earwigbot | |||
========= | |||
.. toctree:: | |||
:maxdepth: 4 | |||
:maxdepth: 6 | |||
earwigbot |
@@ -47,9 +47,10 @@ wikis, you can usually use code like this:: | |||
site = bot.wiki.add_site(project=project, lang=lang) | |||
This works because EarwigBot assumes that the URL for the site is | |||
``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might | |||
change if you're dealing with non-WMF wikis, where the code might look | |||
something more like:: | |||
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | |||
connection info (if any) is stored as ``config.wiki["sql"]``. This might change | |||
if you're dealing with non-WMF wikis, where the code might look something more | |||
like:: | |||
project, lang = "mywiki", "it" | |||
try: | |||
@@ -30,6 +30,7 @@ class Link(Command): | |||
name = "link" | |||
def process(self, data): | |||
self.site = self.bot.wiki.get_site() | |||
msg = data.msg | |||
if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): | |||
@@ -41,8 +42,8 @@ class Link(Command): | |||
if not data.args: | |||
self.reply(data, "what do you want me to link to?") | |||
return | |||
pagename = ' '.join(data.args) | |||
link = self.parse_link(pagename) | |||
pagename = " ".join(data.args) | |||
link = self.site.get_page(pagename).url | |||
self.reply(data, link) | |||
def parse_line(self, line): | |||
@@ -56,8 +57,7 @@ class Link(Command): | |||
if links: | |||
# re.findall() returns a list of tuples, but we only want the 2nd | |||
# item in each tuple: | |||
links = [i[1] for i in links] | |||
results = map(self.parse_link, links) | |||
results = [self.site.get_page(name[1]).url for name in links] | |||
# Find all {{templates}} | |||
templates = re.findall("(\{\{(.*?)(\||\}\}))", line) | |||
@@ -67,10 +67,6 @@ class Link(Command): | |||
return results | |||
def parse_link(self, pagename): | |||
link = quote(pagename.replace(" ", "_"), safe="/:") | |||
return "".join(("http://enwp.org/", link)) | |||
def parse_template(self, pagename): | |||
pagename = "".join(("Template:", pagename)) | |||
return self.parse_link(pagename) | |||
return self.site.get_page(pagename).url |
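    # Illustrative only: Page.url resolves the title against the site's own
    # base URL and article path rather than the hard-coded "http://enwp.org/"
    # prefix the old parse_link() used. For an English Wikipedia site the
    # result might resemble (exact form depends on the site's configuration):
    #
    #     >>> self.site.get_page("Nelson Mandela").url
    #     u'//en.wikipedia.org/wiki/Nelson_Mandela'   # hypothetical output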
@@ -23,6 +23,7 @@ | |||
from hashlib import sha256 | |||
from os.path import expanduser | |||
from threading import Lock | |||
from urllib import quote | |||
import oursql | |||
@@ -70,35 +71,36 @@ class AFCCopyvios(Task): | |||
"""Detect copyvios in 'page' and add a note if any are found.""" | |||
title = page.title | |||
if title in self.ignore_list: | |||
msg = "Skipping page in ignore list: [[{0}]]" | |||
msg = u"Skipping page in ignore list: [[{0}]]" | |||
self.logger.info(msg.format(title)) | |||
return | |||
pageid = page.pageid | |||
if self.has_been_processed(pageid): | |||
msg = "Skipping check on already processed page [[{0}]]" | |||
msg = u"Skipping check on already processed page [[{0}]]" | |||
self.logger.info(msg.format(title)) | |||
return | |||
self.logger.info("Checking [[{0}]]".format(title)) | |||
self.logger.info(u"Checking [[{0}]]".format(title)) | |||
result = page.copyvio_check(self.min_confidence, self.max_queries) | |||
url = result.url | |||
confidence = "{0}%".format(round(result.confidence * 100, 2)) | |||
if result.violation: | |||
safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8") | |||
content = page.get() | |||
template = "\{\{{0}|url={1}|confidence={2}\}\}\n" | |||
template = template.format(self.template, url, confidence) | |||
template = u"\{\{{0}|url={1}|confidence={2}\}\}\n" | |||
template = template.format(self.template, safeurl, confidence) | |||
newtext = template + content | |||
if "{url}" in self.summary: | |||
page.edit(newtext, self.summary.format(url=url)) | |||
else: | |||
page.edit(newtext, self.summary) | |||
msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" | |||
self.logger.warn(msg.format(title, url, confidence)) | |||
msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" | |||
self.logger.info(msg.format(title, url, confidence)) | |||
else: | |||
msg = "No violations detected (best: {1} at {2} confidence)" | |||
self.logger.debug(msg.format(url, confidence)) | |||
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" | |||
self.logger.info(msg.format(title, url, confidence)) | |||
self.log_processed(pageid) | |||
if self.cache_results: | |||
@@ -110,9 +112,7 @@ class AFCCopyvios(Task): | |||
with self.conn.cursor() as cursor: | |||
cursor.execute(query, (pageid,)) | |||
results = cursor.fetchall() | |||
if results: | |||
return True | |||
return False | |||
return True if results else False | |||
def log_processed(self, pageid): | |||
"""Adds pageid to our database of processed pages. | |||
@@ -138,8 +138,8 @@ class AFCCopyvios(Task): | |||
be) retained for one day; this task does not remove old entries (that | |||
is handled by the Toolserver component). | |||
This will only be called if "cache_results" == True in the task's | |||
config, which is False by default. | |||
This will only be called if ``cache_results == True`` in the task's | |||
config, which is ``False`` by default. | |||
""" | |||
pageid = page.pageid | |||
hash = sha256(page.get()).hexdigest() | |||
@@ -1,324 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from collections import defaultdict | |||
from functools import partial | |||
from gzip import GzipFile | |||
from json import loads | |||
from re import sub, UNICODE | |||
from StringIO import StringIO | |||
from time import sleep, time | |||
from urllib import quote_plus, urlencode | |||
from urllib2 import build_opener, URLError | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.exceptions import * | |||
class _CopyvioCheckResult(object): | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
self.violation = violation | |||
self.confidence = confidence | |||
self.url = url | |||
self.queries = queries | |||
self.article_chain = article | |||
self.source_chain = chains[0] | |||
self.delta_chain = chains[1] | |||
def __repr__(self): | |||
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||
return r.format(self.violation, self.confidence, self.url, self.queries) | |||
class _MarkovChain(object): | |||
START = -1 | |||
END = -2 | |||
def __init__(self, text): | |||
self.text = text | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
prev = self.START | |||
for word in words: | |||
self.chain[prev][word] += 1 | |||
prev = word | |||
try: # This won't work if the source text is completely blank | |||
self.chain[word][self.END] += 1 | |||
except KeyError: | |||
pass | |||
def size(self): | |||
count = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
count += hits | |||
return count | |||
class _MarkovChainIntersection(_MarkovChain): | |||
def __init__(self, mc1, mc2): | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
c1 = mc1.chain | |||
c2 = mc2.chain | |||
for word, nodes1 in c1.iteritems(): | |||
if word in c2: | |||
nodes2 = c2[word] | |||
for node, count1 in nodes1.iteritems(): | |||
if node in nodes2: | |||
count2 = nodes2[node] | |||
self.chain[word][node] = min(count1, count2) | |||
class CopyrightMixIn(object): | |||
""" | |||
EarwigBot's Wiki Toolset: Copyright Violation Mixin | |||
This is a mixin that provides two public methods, copyvio_check() and | |||
copyvio_compare(). The former checks the page for copyright violations | |||
using a search engine API, and the latter compares the page against a | |||
specified URL. Credentials for the search engine API are stored in the | |||
site's config. | |||
""" | |||
def __init__(self, site): | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
def _open_url_ignoring_errors(self, url): | |||
"""Open a URL using self._opener and return its content, or None. | |||
Will decompress the content if the headers contain "gzip" as its | |||
content encoding, and will return None if URLError is raised while | |||
opening the URL. IOErrors while gunzipping a compressed response are | |||
ignored, and the original content is returned. | |||
""" | |||
try: | |||
response = self._opener.open(url) | |||
except URLError: | |||
return None | |||
result = response.read() | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
try: | |||
result = gzipper.read() | |||
except IOError: | |||
pass | |||
return result | |||
def _select_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The "function" is a functools.partial object that takes one argument, a | |||
query, and returns a list of URLs, ranked by importance. The underlying | |||
logic depends on the 'engine' argument; for example, if 'engine' is | |||
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||
required package or module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine, credentials = self._site._search_config | |||
if engine == "Yahoo! BOSS": | |||
if not oauth: | |||
e = "The package 'oauth2' could not be imported" | |||
raise UnsupportedSearchEngineError(e) | |||
searcher = self._yahoo_boss_query | |||
else: | |||
raise UnknownSearchEngineError(engine) | |||
return partial(searcher, credentials) | |||
def _yahoo_boss_query(self, cred, query): | |||
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. | |||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||
determined by Yahoo). Raises SearchQueryError() on errors. | |||
""" | |||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||
query = quote_plus(query.join('"', '"')) | |||
params = {"q": query, "style": "raw", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) | |||
client = oauth.Client(consumer) | |||
headers, body = client.request(url, "GET") | |||
if headers["status"] != "200": | |||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||
raise SearchQueryError(e.format(headers["status"], body)) | |||
try: | |||
res = loads(body) | |||
except ValueError: | |||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||
raise SearchQueryError(e) | |||
try: | |||
results = res["bossresponse"]["web"]["results"] | |||
except KeyError: | |||
return [] | |||
return [result["url"] for result in results] | |||
def _copyvio_strip_html(self, html): | |||
""" | |||
STUB | |||
""" | |||
return html | |||
def _copyvio_strip_article(self, content): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
Returns the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, references, and the Bibliography/ | |||
References/Sources/See also section(s). It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes) and original capitalization, but not brackets (square and | |||
angular), abnormal spacing, nor anything else. HTML entities are | |||
replaced by their unicode equivalents. | |||
STUB | |||
""" | |||
return content | |||
def _copyvio_chunk_article(self, content, max_chunks): | |||
""" | |||
STUB | |||
""" | |||
return [content] | |||
def _copyvio_compare_content(self, article, url): | |||
""" | |||
DOCSTRING NEEDED | |||
""" | |||
html = self._open_url_ignoring_errors(url) | |||
if not html: | |||
return 0 | |||
source = _MarkovChain(self._copyvio_strip_html(html)) | |||
delta = _MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||
interquery_sleep=1, force=False): | |||
"""Check the page for copyright violations. | |||
Returns a _CopyvioCheckResult object with four useful attributes: | |||
"violation", "confidence", "url", and "queries". "confidence" is a | |||
number between 0 and 1; if it is less than "min_confidence", we could | |||
not find any indication of a violation (so "violation" will be False | |||
and "url" may or may not be None), otherwise it indicates the relative | |||
faith in our results, "violation" will be True, and "url" will be the | |||
place the article is suspected of being copied from. "queries" is the | |||
number of queries used to determine the results. | |||
"max_queries" is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's less than 0, we will not | |||
limit our number of queries. | |||
"interquery_sleep" is the minimum amount of time we will sleep between | |||
search engine queries, in seconds. | |||
"force" is simply passed to page.get() - it has the same behavior there | |||
as it does here. | |||
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, | |||
SearchQueryError, ...) on errors. | |||
""" | |||
search = self._select_search_engine() | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
num_queries = 0 | |||
empty = _MarkovChain("") | |||
best_chains = (empty, _MarkovChainIntersection(empty, empty)) | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
chunks = self._copyvio_chunk_article(clean, max_queries) | |||
article_chain = _MarkovChain(clean) | |||
last_query = time() | |||
if article_chain.size() < 20: # Auto-fail very small articles | |||
return _CopyvioCheckResult(False, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
while (chunks and best_confidence < min_confidence and | |||
(max_queries < 0 or num_queries < max_queries)): | |||
urls = search(chunks.pop(0)) | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
best_match = url | |||
best_chains = chains | |||
num_queries += 1 | |||
diff = time() - last_query | |||
if diff < interquery_sleep: | |||
sleep(interquery_sleep - diff) | |||
last_query = time() | |||
if best_confidence >= min_confidence: # violation? | |||
v = True | |||
else: | |||
v = False | |||
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, | |||
article_chain, best_chains) | |||
def copyvio_compare(self, url, min_confidence=0.5, force=False): | |||
"""Check the page like copyvio_check(), but against a specific URL. | |||
This is essentially a reduced version of the above - a copyivo | |||
comparison is made using Markov chains and the result is returned in a | |||
_CopyvioCheckResult object - without using a search engine, as the | |||
suspected "violated" URL is supplied from the start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
search is done, the resulting URL is stored in a cache for 24 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither UnknownSearchEngineError nor | |||
SearchQueryError will be raised. | |||
""" | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
article_chain = _MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
if confidence >= min_confidence: | |||
is_violation = True | |||
else: | |||
is_violation = False | |||
return _CopyvioCheckResult(is_violation, confidence, url, 0, | |||
article_chain, chains) |
@@ -0,0 +1,229 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from gzip import GzipFile | |||
from StringIO import StringIO | |||
from time import sleep, time | |||
from urllib2 import build_opener, URLError | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||
__all__ = ["CopyvioMixIn"] | |||
class CopyvioMixIn(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Copyright Violation MixIn** | |||
This is a mixin that provides two public methods, :py:meth:`copyvio_check` | |||
and :py:meth:`copyvio_compare`. The former checks the page for copyright | |||
violations using a search engine API, and the latter compares the page | |||
against a given URL. Credentials for the search engine API are stored in | |||
the :py:class:`~earwigbot.wiki.site.Site`'s config. | |||
""" | |||
def __init__(self, site): | |||
self._search_config = site._search_config | |||
self._exclusions_db = self._search_config["exclusions_db"] | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
def _open_url_ignoring_errors(self, url): | |||
"""Open a URL using self._opener and return its content, or None. | |||
        Will decompress the content if the response headers list "gzip" as the
        content encoding, and will return None if URLError is raised while
opening the URL. IOErrors while gunzipping a compressed response are | |||
ignored, and the original content is returned. | |||
""" | |||
try: | |||
response = self._opener.open(url) | |||
except URLError: | |||
return None | |||
result = response.read() | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
try: | |||
result = gzipper.read() | |||
except IOError: | |||
pass | |||
return result | |||
def _select_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The function takes one argument, a search query, and returns a list of | |||
URLs, ranked by importance. The underlying logic depends on the | |||
*engine* argument within our config; for example, if *engine* is | |||
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||
required package or module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine = self._search_config["engine"] | |||
credentials = self._search_config["credentials"] | |||
if engine == "Yahoo! BOSS": | |||
if not oauth: | |||
e = "The package 'oauth2' could not be imported" | |||
raise exceptions.UnsupportedSearchEngineError(e) | |||
return YahooBOSSSearchEngine(credentials) | |||
raise exceptions.UnknownSearchEngineError(engine) | |||
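    # A sketch of the search config this method reads; the keys come from the
    # lookups above and from the sites database manager, while the credential
    # values are placeholders:
    #
    #     search_config = {
    #         "engine": "Yahoo! BOSS",
    #         "credentials": {"key": "<consumer key>", "secret": "<secret>"},
    #         "nltk_dir": "/path/to/bot/.nltk",   # added by the sitesdb
    #         "exclusions_db": exclusions_db,     # ExclusionsDB instance
    #     }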
def _copyvio_compare_content(self, article, url): | |||
"""Return a number comparing an article and a URL. | |||
The *article* is a Markov chain, whereas the *url* is just a string | |||
that we'll try to open and read ourselves. | |||
""" | |||
html = self._open_url_ignoring_errors(url) | |||
if not html: | |||
return 0 | |||
source = MarkovChain(HTMLTextParser(html).strip()) | |||
delta = MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
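    # A minimal sketch of the ratio above, assuming the MarkovChain classes
    # from earwigbot.wiki.copyvios.markov (the sample strings are arbitrary):
    #
    #     article = MarkovChain(u"the quick brown fox jumps over the lazy dog")
    #     source = MarkovChain(u"the quick brown fox naps beside the lazy dog")
    #     delta = MarkovChainIntersection(article, source)
    #     confidence = float(delta.size()) / article.size()
    #     # 0.0 means no shared word n-grams; 1.0 means every article n-gram
    #     # also appears in the source at least as often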
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||
interquery_sleep=1): | |||
"""Check the page for copyright violations. | |||
Returns a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||
with information on the results of the check. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's lower than 0, we will not | |||
limit the number of queries. | |||
*interquery_sleep* is the minimum amount of time we will sleep between | |||
search engine queries, in seconds. | |||
Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses | |||
(:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | |||
""" | |||
searcher = self._select_search_engine() | |||
self._exclusions_db.sync(self.site.name) | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
num_queries = 0 | |||
empty = MarkovChain("") | |||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | |||
parser = ArticleTextParser(self.get()) | |||
clean = parser.strip() | |||
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) | |||
article_chain = MarkovChain(clean) | |||
last_query = time() | |||
if article_chain.size() < 20: # Auto-fail very small articles | |||
return CopyvioCheckResult(False, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
while (chunks and best_confidence < min_confidence and | |||
(max_queries < 0 or num_queries < max_queries)): | |||
chunk = chunks.pop(0) | |||
log = u"[[{0}]] -> querying {1} for {2!r}" | |||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||
urls = searcher.search(chunk) | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
if self._exclusions_db.check(self.site.name, url): | |||
continue | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
best_match = url | |||
best_chains = chains | |||
num_queries += 1 | |||
diff = time() - last_query | |||
if diff < interquery_sleep: | |||
sleep(interquery_sleep - diff) | |||
last_query = time() | |||
if best_confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
best_match, num_queries)) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
num_queries)) | |||
return CopyvioCheckResult(is_violation, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
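    # A minimal usage sketch (the page title is hypothetical and a search
    # engine must be configured in the site's search config):
    #
    #     site = bot.wiki.get_site()
    #     page = site.get_page(u"Some draft article")
    #     result = page.copyvio_check(min_confidence=0.75, max_queries=15)
    #     if result.violation:
    #         print(u"{0}% similar to {1}".format(
    #             round(result.confidence * 100, 2), result.url))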
def copyvio_compare(self, url, min_confidence=0.5): | |||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||
        This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||
but without using a search engine, since the suspected "violated" URL | |||
is supplied from the start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
search is done, the resulting URL is stored in a cache for 24 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither | |||
:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. | |||
""" | |||
content = self.get() | |||
clean = ArticleTextParser(content).strip() | |||
article_chain = MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
if confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
return CopyvioCheckResult(is_violation, confidence, url, 0, | |||
article_chain, chains) |
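    # When the suspected source is already known (e.g. from the Toolserver
    # cache described above), a hypothetical call skips the search entirely:
    #
    #     result = page.copyvio_compare("http://example.com/mirror.html",
    #                                   min_confidence=0.75)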
@@ -0,0 +1,164 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import re | |||
import sqlite3 as sqlite | |||
from threading import Lock | |||
from time import time | |||
from earwigbot import exceptions | |||
__all__ = ["ExclusionsDB"] | |||
default_sources = { | |||
"enwiki": [ | |||
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", | |||
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", | |||
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", | |||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" | |||
] | |||
} | |||
class ExclusionsDB(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Exclusions Database Manager** | |||
Controls the :file:`.exclusions.db` file, which stores URLs excluded from | |||
copyright violation checks on account of being known mirrors, for example. | |||
""" | |||
def __init__(self, sitesdb, dbfile, logger): | |||
self._sitesdb = sitesdb | |||
self._dbfile = dbfile | |||
self._logger = logger | |||
self._db_access_lock = Lock() | |||
def __repr__(self): | |||
"""Return the canonical string representation of the ExclusionsDB.""" | |||
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})" | |||
return res.format(self._sitesdb, self._dbfile, self._logger) | |||
def __str__(self): | |||
"""Return a nice string representation of the ExclusionsDB.""" | |||
return "<ExclusionsDB at {0}>".format(self._dbfile) | |||
def _create(self): | |||
"""Initialize the exclusions database with its necessary tables.""" | |||
script = """ | |||
CREATE TABLE sources (source_sitename, source_page); | |||
CREATE TABLE updates (update_sitename, update_time); | |||
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | |||
""" | |||
query = "INSERT INTO sources VALUES (?, ?);" | |||
sources = [] | |||
        for sitename, pages in default_sources.iteritems():
            for page in pages:
                sources.append((sitename, page))
with sqlite.connect(self._dbfile) as conn: | |||
conn.executescript(script) | |||
conn.executemany(query, sources) | |||
def _load_source(self, site, source): | |||
"""Load from a specific source and return a set of URLs.""" | |||
urls = set() | |||
try: | |||
data = site.get_page(source).get() | |||
except exceptions.PageNotFoundError: | |||
return urls | |||
regexes = [ | |||
"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>", | |||
"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" | |||
] | |||
        for regex in regexes:
            for (url,) in re.findall(regex, data, re.I):
                urls.add(url.lower())
return urls | |||
def _update(self, sitename): | |||
"""Update the database from listed sources in the index.""" | |||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" | |||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" | |||
query4 = "INSERT INTO exclusions VALUES (?, ?);" | |||
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" | |||
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" | |||
query7 = "INSERT INTO updates VALUES (?, ?);" | |||
site = self._sitesdb.get_site(sitename) | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
urls = set() | |||
for (source,) in conn.execute(query1, (sitename,)): | |||
urls |= self._load_source(site, source) | |||
for (url,) in conn.execute(query2, (sitename,)): | |||
if url in urls: | |||
urls.remove(url) | |||
else: | |||
conn.execute(query3, (sitename, url)) | |||
conn.executemany(query4, [(sitename, url) for url in urls]) | |||
            if conn.execute(query5, (sitename,)).fetchone():
conn.execute(query6, (time(), sitename)) | |||
else: | |||
conn.execute(query7, (sitename, time())) | |||
def _get_last_update(self, sitename): | |||
"""Return the UNIX timestamp of the last time the db was updated.""" | |||
query = "SELECT update_time FROM updates WHERE update_sitename = ?;" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
try: | |||
result = conn.execute(query, (sitename,)).fetchone() | |||
except sqlite.OperationalError: | |||
self._create() | |||
return 0 | |||
return result[0] if result else 0 | |||
def sync(self, sitename): | |||
"""Update the database if it hasn't been updated in the past month. | |||
This only updates the exclusions database for the *sitename* site. | |||
""" | |||
max_staleness = 60 * 60 * 24 * 30 | |||
        time_since_update = int(time() - self._get_last_update(sitename))
if time_since_update > max_staleness: | |||
log = u"Updating stale database: {0} (last updated {1} seconds ago)" | |||
self._logger.info(log.format(sitename, time_since_update)) | |||
self._update(sitename) | |||
else: | |||
log = u"Database for {0} is still fresh (last updated {1} seconds ago)" | |||
self._logger.debug(log.format(sitename, time_since_update)) | |||
def check(self, sitename, url): | |||
"""Check whether a given URL is in the exclusions database. | |||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | |||
""" | |||
normalized = re.sub("https?://", "", url.lower()) | |||
query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
for row in conn.execute(query, (sitename,)): | |||
if normalized.startswith(row[0]): | |||
log = u"Exclusion detected in {0} for {1}" | |||
self._logger.debug(log.format(sitename, url)) | |||
return True | |||
log = u"No exclusions in {0} for {1}".format(sitename, url) | |||
self._logger.debug(log) | |||
return False |
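    # How the copyvio checker is expected to use this class (see
    # CopyvioMixIn.copyvio_check); exclusions_db is the instance the sites
    # database manager stores in the site's search config:
    #
    #     exclusions_db.sync(site.name)            # refresh if > 30 days old
    #     if exclusions_db.check(site.name, url):  # known mirror or fork?
    #         continue                             # skip this URL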
@@ -0,0 +1,87 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from collections import defaultdict | |||
from re import sub, UNICODE | |||
__all__ = ["MarkovChain", "MarkovChainIntersection"] | |||
class MarkovChain(object): | |||
"""Implements a basic ngram Markov chain of words.""" | |||
START = -1 | |||
END = -2 | |||
degree = 3 # 2 for bigrams, 3 for trigrams, etc. | |||
def __init__(self, text): | |||
self.text = text | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
padding = self.degree - 1 | |||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||
for i in range(len(words) - self.degree + 1): | |||
last = i + self.degree - 1 | |||
            self.chain[tuple(words[i:last])][words[last]] += 1  # tuple: a list slice can't be a dict key
def __repr__(self): | |||
"""Return the canonical string representation of the MarkovChain.""" | |||
return "MarkovChain(text={0!r})".format(self.text) | |||
def __str__(self): | |||
"""Return a nice string representation of the MarkovChain.""" | |||
return "<MarkovChain of size {0}>".format(self.size()) | |||
def size(self): | |||
"""Return the size of the Markov chain: the total number of nodes.""" | |||
count = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
count += hits | |||
return count | |||
class MarkovChainIntersection(MarkovChain): | |||
"""Implements the intersection of two chains (i.e., their shared nodes).""" | |||
def __init__(self, mc1, mc2): | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
self.mc1, self.mc2 = mc1, mc2 | |||
c1 = mc1.chain | |||
c2 = mc2.chain | |||
for word, nodes1 in c1.iteritems(): | |||
if word in c2: | |||
nodes2 = c2[word] | |||
for node, count1 in nodes1.iteritems(): | |||
if node in nodes2: | |||
count2 = nodes2[node] | |||
self.chain[word][node] = min(count1, count2) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the intersection.""" | |||
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" | |||
return res.format(self.mc1, self.mc2) | |||
def __str__(self): | |||
"""Return a nice string representation of the intersection.""" | |||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||
return res.format(self.size(), self.mc1, self.mc2) |
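if __name__ == "__main__":
    # A small illustrative self-check, not part of the library: with
    # degree = 3, u"one two three" is padded to
    # [START, START, "one", "two", "three", END, END], producing five trigram
    # nodes such as chain[(START, START)]["one"] == 1.
    chain = MarkovChain(u"one two three")
    print(chain.size())  # 5
    print(chain.chain[(MarkovChain.START, MarkovChain.START)]["one"])  # 1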
@@ -0,0 +1,148 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from os import path | |||
try:
    import bs4
    from bs4 import BeautifulSoup
except ImportError:
    bs4 = None
    BeautifulSoup = None
try: | |||
import mwparserfromhell | |||
except ImportError: | |||
mwparserfromhell = None | |||
try: | |||
import nltk | |||
except ImportError: | |||
nltk = None | |||
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] | |||
class BaseTextParser(object): | |||
"""Base class for a parser that handles text.""" | |||
def __init__(self, text): | |||
self.text = text | |||
def __repr__(self): | |||
"""Return the canonical string representation of the text parser.""" | |||
return "{0}(text={1!r})".format(self.__class__.__name__, self.text) | |||
def __str__(self): | |||
"""Return a nice string representation of the text parser.""" | |||
name = self.__class__.__name__ | |||
return "<{0} of text with size {1}>".format(name, len(text)) | |||
class ArticleTextParser(BaseTextParser): | |||
"""A parser that can strip and chunk wikicode article text.""" | |||
def strip(self): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
Return the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, and references. It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes), original capitalization, and so forth. HTML entities are | |||
replaced by their unicode equivalents. | |||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | |||
""" | |||
wikicode = mwparserfromhell.parse(self.text) | |||
self.clean = wikicode.strip_code(normalize=True) | |||
return self.clean | |||
def chunk(self, nltk_dir, max_chunks, max_query=256): | |||
"""Convert the clean article text into a list of web-searchable chunks. | |||
        No more than *max_chunks* chunks will be returned. Each chunk will
        only be a sentence or two long at most (no more than *max_query*
        characters). The idea is to return a sample of the article text rather
        than the whole, so we'll pick and choose from parts of it, especially
        if the article is large and *max_chunks* is low, so we don't end up
        searching for just the first paragraph.
This is implemented using :py:mod:`nltk` (http://nltk.org/). A base | |||
directory (*nltk_dir*) is required to store nltk's punctuation | |||
database. This is typically located in the bot's working directory. | |||
""" | |||
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") | |||
try: | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
except LookupError: | |||
nltk.download("punkt", nltk_dir) | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
sentences = [] | |||
for sentence in tokenizer.tokenize(self.clean): | |||
if len(sentence) > max_query: | |||
words = sentence.split() | |||
while len(" ".join(words)) > max_query: | |||
words.pop() | |||
sentence = " ".join(words) | |||
sentences.append(sentence) | |||
if max_chunks >= len(sentences): | |||
return sentences | |||
chunks = [] | |||
while len(chunks) < max_chunks: | |||
if len(chunks) % 5 == 0: | |||
chunk = sentences.pop(0) # Pop from beginning | |||
elif len(chunks) % 5 == 1: | |||
chunk = sentences.pop() # Pop from end | |||
elif len(chunks) % 5 == 2: | |||
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 | |||
elif len(chunks) % 5 == 3: | |||
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 | |||
else: | |||
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 | |||
chunks.append(chunk) | |||
return chunks | |||
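    # A usage sketch for ArticleTextParser; the wikicode sample and the nltk
    # directory are hypothetical (the bot normally passes its own ".nltk"
    # directory):
    #
    #     parser = ArticleTextParser(u"'''Example''' is a [[test]] page.{{stub}}")
    #     clean = parser.strip()                   # roughly u"Example is a test page."
    #     chunks = parser.chunk("/path/to/.nltk", max_chunks=5)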
class HTMLTextParser(BaseTextParser): | |||
"""A parser that can extract the text from an HTML document.""" | |||
hidden_tags = [ | |||
"script", "style" | |||
] | |||
def strip(self): | |||
"""Return the actual text contained within an HTML document. | |||
Implemented using :py:mod:`BeautifulSoup <bs4>` | |||
(http://www.crummy.com/software/BeautifulSoup/). | |||
""" | |||
try: | |||
soup = BeautifulSoup(self.text, "lxml").body | |||
except ValueError: | |||
soup = BeautifulSoup(self.text).body | |||
is_comment = lambda text: isinstance(text, bs4.element.Comment) | |||
        for comment in soup.find_all(text=is_comment):
            comment.extract()
        for tag in self.hidden_tags:
            for element in soup.find_all(tag):
                element.extract()
return "\n".join(soup.stripped_strings) |
@@ -0,0 +1,60 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = ["CopyvioCheckResult"] | |||
class CopyvioCheckResult(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Copyvio Check Result** | |||
A class holding information about the results of a copyvio check. | |||
*Attributes:* | |||
- :py:attr:`violation`: ``True`` if this is a violation, else ``False`` | |||
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy | |||
- :py:attr:`url`: the URL of the violated page | |||
- :py:attr:`queries`: the number of queries used to reach a result | |||
- :py:attr:`article_chain`: the MarkovChain of the article text | |||
- :py:attr:`source_chain`: the MarkovChain of the violated page text | |||
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two | |||
""" | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
self.violation = violation | |||
self.confidence = confidence | |||
self.url = url | |||
self.queries = queries | |||
self.article_chain = article | |||
self.source_chain = chains[0] | |||
self.delta_chain = chains[1] | |||
def __repr__(self): | |||
"""Return the canonical string representation of the result.""" | |||
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||
return res.format(self.violation, self.confidence, self.url, | |||
self.queries) | |||
def __str__(self): | |||
"""Return a nice string representation of the result.""" | |||
res = "<CopyvioCheckResult ({0} with {1} conf)>" | |||
return res.format(self.violation, self.confidence) |
@@ -0,0 +1,94 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from json import loads | |||
from urllib import quote_plus, urlencode | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.exceptions import SearchQueryError | |||
__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] | |||
class BaseSearchEngine(object): | |||
"""Base class for a simple search engine interface.""" | |||
name = "Base" | |||
def __init__(self, cred): | |||
"""Store credentials *cred* for searching later on.""" | |||
self.cred = cred | |||
def __repr__(self): | |||
"""Return the canonical string representation of the search engine.""" | |||
return "{0}()".format(self.__class__.__name__) | |||
def __str__(self): | |||
"""Return a nice string representation of the search engine.""" | |||
return "<{0}>".format(self.__class__.__name__) | |||
def search(self, query): | |||
"""Use this engine to search for *query*. | |||
Not implemented in this base class; overridden in subclasses. | |||
""" | |||
raise NotImplementedError() | |||
class YahooBOSSSearchEngine(BaseSearchEngine): | |||
"""A search engine interface with Yahoo! BOSS.""" | |||
name = "Yahoo! BOSS" | |||
def search(self, query): | |||
"""Do a Yahoo! BOSS web search for *query*. | |||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||
determined by Yahoo). Raises | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
""" | |||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||
        query = quote_plus('"' + query + '"')  # Wrap the query in quotes to search for the exact phrase
params = {"q": query, "type": "html,text", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
consumer = oauth.Consumer(key=self.cred["key"], | |||
secret=self.cred["secret"]) | |||
client = oauth.Client(consumer) | |||
headers, body = client.request(url, "GET") | |||
if headers["status"] != "200": | |||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||
raise SearchQueryError(e.format(headers["status"], body)) | |||
try: | |||
res = loads(body) | |||
except ValueError: | |||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||
raise SearchQueryError(e) | |||
try: | |||
results = res["bossresponse"]["web"]["results"] | |||
except KeyError: | |||
return [] | |||
return [result["url"] for result in results] |
@@ -21,6 +21,7 @@ | |||
# SOFTWARE. | |||
from hashlib import md5 | |||
from logging import getLogger, NullHandler | |||
import re | |||
from time import gmtime, strftime | |||
from urllib import quote | |||
@@ -31,11 +32,11 @@ except ImportError: | |||
mwparserfromhell = None | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyright import CopyrightMixIn | |||
from earwigbot.wiki.copyvios import CopyvioMixIn | |||
__all__ = ["Page"] | |||
class Page(CopyrightMixIn): | |||
class Page(CopyvioMixIn): | |||
""" | |||
**EarwigBot: Wiki Toolset: Page** | |||
@@ -81,7 +82,8 @@ class Page(CopyrightMixIn): | |||
PAGE_MISSING = 2 | |||
PAGE_EXISTS = 3 | |||
def __init__(self, site, title, follow_redirects=False, pageid=None): | |||
def __init__(self, site, title, follow_redirects=False, pageid=None, | |||
logger=None): | |||
"""Constructor for new Page instances. | |||
Takes four arguments: a Site object, the Page's title (or pagename), | |||
@@ -100,6 +102,14 @@ class Page(CopyrightMixIn): | |||
self._follow_redirects = self._keep_following = follow_redirects | |||
self._pageid = pageid | |||
# Set up our internal logger: | |||
if logger: | |||
self._logger = logger | |||
else: # Just set up a null logger to eat up our messages: | |||
self._logger = getLogger("earwigbot.wiki") | |||
self._logger.addHandler(NullHandler()) | |||
# Attributes to be loaded through the API: | |||
self._exists = self.PAGE_UNKNOWN | |||
self._is_redirect = None | |||
self._lastrevid = None | |||
@@ -92,7 +92,7 @@ class Site(object): | |||
namespaces=None, login=(None, None), cookiejar=None, | |||
user_agent=None, use_https=False, assert_edit=None, | |||
maxlag=None, wait_between_queries=3, logger=None, | |||
search_config=(None, None)): | |||
search_config=None): | |||
"""Constructor for new Site instances. | |||
This probably isn't necessary to call yourself unless you're building a | |||
@@ -560,10 +560,10 @@ class Site(object): | |||
return [self.SERVICE_API] | |||
sqllag = self._sql_info_cache["replag"] | |||
if sqllag > 180: | |||
if sqllag > 300: | |||
if not self._maxlag: | |||
return [self.SERVICE_API, self.SERVICE_SQL] | |||
if now - self._api_info_cache["lastcheck"] > 120: | |||
if now - self._api_info_cache["lastcheck"] > 300: | |||
self._api_info_cache["lastcheck"] = now | |||
try: | |||
self._api_info_cache["maxlag"] = apilag = self.get_maxlag() | |||
@@ -571,7 +571,7 @@ class Site(object): | |||
self._api_info_cache["maxlag"] = apilag = 0 | |||
else: | |||
apilag = self._api_info_cache["maxlag"] | |||
if sqllag / (180.0 / self._maxlag) < apilag: | |||
if apilag > self._maxlag: | |||
return [self.SERVICE_SQL, self.SERVICE_API] | |||
return [self.SERVICE_API, self.SERVICE_SQL] | |||
@@ -789,8 +789,9 @@ class Site(object): | |||
prefix = title.split(":", 1)[0] | |||
if prefix != title: # Avoid a page that is simply "Category" | |||
if prefix in prefixes: | |||
return Category(self, title, follow_redirects, pageid) | |||
return Page(self, title, follow_redirects, pageid) | |||
return Category(self, title, follow_redirects, pageid, | |||
self._logger) | |||
return Page(self, title, follow_redirects, pageid, self._logger) | |||
def get_category(self, catname, follow_redirects=False, pageid=None): | |||
"""Return a :py:class:`Category` object for the given category name. | |||
@@ -802,7 +803,7 @@ class Site(object): | |||
catname = self._unicodeify(catname) | |||
prefix = self.namespace_id_to_name(constants.NS_CATEGORY) | |||
pagename = u':'.join((prefix, catname)) | |||
return Category(self, pagename, follow_redirects, pageid) | |||
return Category(self, pagename, follow_redirects, pageid, self._logger) | |||
def get_user(self, username=None): | |||
"""Return a :py:class:`User` object for the given username. | |||
@@ -815,7 +816,7 @@ class Site(object): | |||
username = self._unicodeify(username) | |||
else: | |||
username = self._get_username() | |||
return User(self, username) | |||
return User(self, username, self._logger) | |||
def delegate(self, services, args=None, kwargs=None): | |||
"""Delegate a task to either the API or SQL depending on conditions. | |||
@@ -29,6 +29,7 @@ import sqlite3 as sqlite | |||
from earwigbot import __version__ | |||
from earwigbot.exceptions import SiteNotFoundError | |||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||
from earwigbot.wiki.site import Site | |||
__all__ = ["SitesDB"] | |||
@@ -58,11 +59,16 @@ class SitesDB(object): | |||
"""Set up the manager with an attribute for the base Bot object.""" | |||
self.config = bot.config | |||
self._logger = bot.logger.getChild("wiki") | |||
self._sites = {} # Internal site cache | |||
self._sitesdb = path.join(bot.config.root_dir, "sites.db") | |||
self._cookie_file = path.join(bot.config.root_dir, ".cookies") | |||
self._cookiejar = None | |||
excl_db = path.join(bot.config.root_dir, "exclusions.db") | |||
excl_logger = self._logger.getChild("exclusionsdb") | |||
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the SitesDB.""" | |||
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" | |||
@@ -192,6 +198,17 @@ class SitesDB(object): | |||
user_agent = user_agent.replace("$1", __version__) | |||
user_agent = user_agent.replace("$2", python_version()) | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||
for key, value in sql.iteritems(): | |||
if "$1" in value: | |||
sql[key] = value.replace("$1", name) | |||
return Site(name=name, project=project, lang=lang, base_url=base_url, | |||
article_path=article_path, script_path=script_path, | |||
sql=sql, namespaces=namespaces, login=login, | |||
@@ -332,13 +349,12 @@ class SitesDB(object): | |||
the script path (meaning the API is located at | |||
``"{base_url}{script_path}/api.php"`` -> | |||
``"//{lang}.{project}.org/w/api.php"``), so this is the default. If | |||
your wiki is different, provide the script_path as an argument. The | |||
only other argument to :py:class:`~earwigbot.wiki.site.Site` that we | |||
can't get from config files or by querying the wiki itself is SQL | |||
connection info, so provide a dict of kwargs as *sql* and Site will | |||
pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing | |||
you to make queries with :py:meth:`site.sql_query | |||
<earwigbot.wiki.site.Site.sql_query>`. | |||
your wiki is different, provide the script_path as an argument. SQL | |||
        connection settings are guessed automatically using the config's template
value. If this is wrong or not specified, provide a dict of kwargs as | |||
*sql* and Site will pass it to :py:func:`oursql.connect(**sql) | |||
<oursql.connect>`, allowing you to make queries with | |||
:py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`. | |||
Returns ``True`` if the site was added successfully or ``False`` if the | |||
site is already in our sitesdb (this can be done purposefully to update | |||
@@ -359,15 +375,31 @@ class SitesDB(object): | |||
use_https = config.wiki.get("useHTTPS", False) | |||
assert_edit = config.wiki.get("assert") | |||
maxlag = config.wiki.get("maxlag") | |||
wait_between_queries = config.wiki.get("waitTime", 5) | |||
wait_between_queries = config.wiki.get("waitTime", 3) | |||
logger = self._logger.getChild(name) | |||
search_config = config.wiki.get("search") | |||
if user_agent: | |||
user_agent = user_agent.replace("$1", __version__) | |||
user_agent = user_agent.replace("$2", python_version()) | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||
for key, value in sql.iteritems(): | |||
if "$1" in value: | |||
sql[key] = value.replace("$1", name) | |||
# Create a Site object to log in and load the other attributes: | |||
site = Site(base_url=base_url, script_path=script_path, sql=sql, | |||
login=login, cookiejar=cookiejar, user_agent=user_agent, | |||
use_https=use_https, assert_edit=assert_edit, | |||
maxlag=maxlag, wait_between_queries=wait_between_queries, | |||
search_config=search_config) | |||
logger=logger, search_config=search_config) | |||
self._add_site_to_sitesdb(site) | |||
self._sites[site.name] = site | |||
@@ -20,6 +20,7 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from logging import getLogger, NullHandler | |||
from time import gmtime, strptime | |||
from earwigbot.exceptions import UserNotFoundError | |||
@@ -60,7 +61,7 @@ class User(object): | |||
talkpage | |||
""" | |||
def __init__(self, site, name): | |||
def __init__(self, site, name, logger=None): | |||
"""Constructor for new User instances. | |||
Takes two arguments, a Site object (necessary for doing API queries), | |||
@@ -76,6 +77,13 @@ class User(object): | |||
self._site = site | |||
self._name = name | |||
# Set up our internal logger: | |||
if logger: | |||
self._logger = logger | |||
else: # Just set up a null logger to eat up our messages: | |||
self._logger = getLogger("earwigbot.wiki") | |||
self._logger.addHandler(NullHandler()) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the User.""" | |||
return "User(name={0!r}, site={1!r})".format(self._name, self._site) | |||
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages | |||
from earwigbot import __version__ | |||
# Not all of these dependencies are required, particularly the copyvio-specific | |||
# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython, | |||
# pytz). The bot should run fine without them, but will raise an exception if | |||
# you try to detect copyvios or run a command that requires one.
dependencies = [ | |||
"GitPython >= 0.3.2.RC1", # Interfacing with git for !git and __version__ | |||
"PyYAML >= 3.10", # Parsing config files | |||
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios | |||
"lxml >= 2.3.4", # Faster parser for BeautifulSoup | |||
"mwparserfromhell >= 0.1", # Parsing wikicode for manipulation | |||
"nltk >= 2.0.2", # Parsing sentences to split article content for copyvios | |||
"oursql >= 0.9.3", # Interfacing with MediaWiki databases | |||
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios | |||
"py-bcrypt >= 0.2", # Hashing the bot key in the config file | |||
"pycrypto >= 2.5", # Storing bot passwords and keys in the config file | |||
"pytz >= 2012c", # Handling timezones for the !time IRC command | |||
] | |||
with open("README.rst") as fp: | |||
long_docs = fp.read() | |||
@@ -32,15 +51,7 @@ setup( | |||
name = "earwigbot", | |||
packages = find_packages(exclude=("tests",)), | |||
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, | |||
install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git | |||
"PyYAML >= 3.10", # Config parsing | |||
"mwparserfromhell >= 0.1", # Wikicode parsing | |||
"oursql >= 0.9.3", # Talking with MediaWiki databases | |||
"oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search | |||
"py-bcrypt >= 0.2", # Password hashing in config | |||
"pycrypto >= 2.5", # Storing bot passwords and keys | |||
"pytz >= 2012c", # Timezone handling | |||
], | |||
install_requires = dependencies, | |||
test_suite = "tests", | |||
version = __version__, | |||
author = "Ben Kurtovic", | |||