@@ -0,0 +1,47 @@ | |||||
copyvios Package | |||||
================ | |||||
:mod:`copyvios` Package | |||||
----------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`exclusions` Module | |||||
------------------------ | |||||
.. automodule:: earwigbot.wiki.copyvios.exclusions | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`markov` Module | |||||
-------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios.markov | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:mod:`parsers` Module | |||||
--------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios.parsers | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:mod:`result` Module | |||||
-------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios.result | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`search` Module | |||||
-------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios.search | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -22,13 +22,6 @@ wiki Package | |||||
:members: | :members: | ||||
:undoc-members: | :undoc-members: | ||||
:mod:`copyright` Module | |||||
.. automodule:: earwigbot.wiki.copyright | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`page` Module | :mod:`page` Module | ||||
------------------ | ------------------ | ||||
@@ -57,3 +50,10 @@ wiki Package | |||||
.. automodule:: earwigbot.wiki.user | .. automodule:: earwigbot.wiki.user | ||||
:members: | :members: | ||||
:undoc-members: | :undoc-members: | ||||
Subpackages | |||||
----------- | |||||
.. toctree:: | |||||
earwigbot.wiki.copyvios |
@@ -2,6 +2,6 @@ earwigbot | |||||
========= | ========= | ||||
.. toctree:: | .. toctree:: | ||||
:maxdepth: 4 | |||||
:maxdepth: 6 | |||||
earwigbot | earwigbot |
@@ -47,9 +47,10 @@ wikis, you can usually use code like this:: | |||||
site = bot.wiki.add_site(project=project, lang=lang) | site = bot.wiki.add_site(project=project, lang=lang) | ||||
This works because EarwigBot assumes that the URL for the site is | This works because EarwigBot assumes that the URL for the site is | ||||
``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might | |||||
change if you're dealing with non-WMF wikis, where the code might look | |||||
something more like:: | |||||
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | |||||
connection info (if any) is stored as ``config.wiki["sql"]``. This might change | |||||
if you're dealing with non-WMF wikis, where the code might look more
like::
project, lang = "mywiki", "it" | project, lang = "mywiki", "it" | ||||
try: | try: | ||||
@@ -30,6 +30,7 @@ class Link(Command): | |||||
name = "link" | name = "link" | ||||
def process(self, data): | def process(self, data): | ||||
self.site = self.bot.wiki.get_site() | |||||
msg = data.msg | msg = data.msg | ||||
if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): | if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): | ||||
@@ -41,8 +42,8 @@ class Link(Command): | |||||
if not data.args: | if not data.args: | ||||
self.reply(data, "what do you want me to link to?") | self.reply(data, "what do you want me to link to?") | ||||
return | return | ||||
pagename = ' '.join(data.args) | |||||
link = self.parse_link(pagename) | |||||
pagename = " ".join(data.args) | |||||
link = self.site.get_page(pagename).url | |||||
self.reply(data, link) | self.reply(data, link) | ||||
def parse_line(self, line): | def parse_line(self, line): | ||||
@@ -56,8 +57,7 @@ class Link(Command): | |||||
if links: | if links: | ||||
# re.findall() returns a list of tuples, but we only want the 2nd | # re.findall() returns a list of tuples, but we only want the 2nd | ||||
# item in each tuple: | # item in each tuple: | ||||
links = [i[1] for i in links] | |||||
results = map(self.parse_link, links) | |||||
results = [self.site.get_page(name[1]).url for name in links] | |||||
# Find all {{templates}} | # Find all {{templates}} | ||||
templates = re.findall("(\{\{(.*?)(\||\}\}))", line) | templates = re.findall("(\{\{(.*?)(\||\}\}))", line) | ||||
@@ -67,10 +67,6 @@ class Link(Command): | |||||
return results | return results | ||||
def parse_link(self, pagename): | |||||
link = quote(pagename.replace(" ", "_"), safe="/:") | |||||
return "".join(("http://enwp.org/", link)) | |||||
def parse_template(self, pagename): | def parse_template(self, pagename): | ||||
pagename = "".join(("Template:", pagename)) | pagename = "".join(("Template:", pagename)) | ||||
return self.parse_link(pagename) | |||||
return self.site.get_page(pagename).url |
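A quick sketch of what the refactored command now does: instead of hand-building
``http://enwp.org/`` links, it asks the default :py:class:`Site` for a
:py:class:`Page` and reads its ``url`` attribute. A minimal sketch, assuming a
configured ``bot`` object and a hypothetical title::

    site = bot.wiki.get_site()               # default site from the bot's config
    page = site.get_page("Template:Foo")     # pages and templates alike
    print page.url                           # e.g. //en.wikipedia.org/wiki/Template:Foo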
@@ -23,6 +23,7 @@ | |||||
from hashlib import sha256 | from hashlib import sha256 | ||||
from os.path import expanduser | from os.path import expanduser | ||||
from threading import Lock | from threading import Lock | ||||
from urllib import quote | |||||
import oursql | import oursql | ||||
@@ -70,35 +71,36 @@ class AFCCopyvios(Task): | |||||
"""Detect copyvios in 'page' and add a note if any are found.""" | """Detect copyvios in 'page' and add a note if any are found.""" | ||||
title = page.title | title = page.title | ||||
if title in self.ignore_list: | if title in self.ignore_list: | ||||
msg = "Skipping page in ignore list: [[{0}]]" | |||||
msg = u"Skipping page in ignore list: [[{0}]]" | |||||
self.logger.info(msg.format(title)) | self.logger.info(msg.format(title)) | ||||
return | return | ||||
pageid = page.pageid | pageid = page.pageid | ||||
if self.has_been_processed(pageid): | if self.has_been_processed(pageid): | ||||
msg = "Skipping check on already processed page [[{0}]]" | |||||
msg = u"Skipping check on already processed page [[{0}]]" | |||||
self.logger.info(msg.format(title)) | self.logger.info(msg.format(title)) | ||||
return | return | ||||
self.logger.info("Checking [[{0}]]".format(title)) | |||||
self.logger.info(u"Checking [[{0}]]".format(title)) | |||||
result = page.copyvio_check(self.min_confidence, self.max_queries) | result = page.copyvio_check(self.min_confidence, self.max_queries) | ||||
url = result.url | url = result.url | ||||
confidence = "{0}%".format(round(result.confidence * 100, 2)) | confidence = "{0}%".format(round(result.confidence * 100, 2)) | ||||
if result.violation: | if result.violation: | ||||
safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8") | |||||
content = page.get() | content = page.get() | ||||
template = "\{\{{0}|url={1}|confidence={2}\}\}\n" | |||||
template = template.format(self.template, url, confidence) | |||||
template = u"{{{{{0}|url={1}|confidence={2}}}}}\n"
template = template.format(self.template, safeurl, confidence) | |||||
newtext = template + content | newtext = template + content | ||||
if "{url}" in self.summary: | if "{url}" in self.summary: | ||||
page.edit(newtext, self.summary.format(url=url)) | page.edit(newtext, self.summary.format(url=url)) | ||||
else: | else: | ||||
page.edit(newtext, self.summary) | page.edit(newtext, self.summary) | ||||
msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" | |||||
self.logger.warn(msg.format(title, url, confidence)) | |||||
msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" | |||||
self.logger.info(msg.format(title, url, confidence)) | |||||
else: | else: | ||||
msg = "No violations detected (best: {1} at {2} confidence)" | |||||
self.logger.debug(msg.format(url, confidence)) | |||||
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" | |||||
self.logger.info(msg.format(title, url, confidence)) | |||||
self.log_processed(pageid) | self.log_processed(pageid) | ||||
if self.cache_results: | if self.cache_results: | ||||
@@ -110,9 +112,7 @@ class AFCCopyvios(Task): | |||||
with self.conn.cursor() as cursor: | with self.conn.cursor() as cursor: | ||||
cursor.execute(query, (pageid,)) | cursor.execute(query, (pageid,)) | ||||
results = cursor.fetchall() | results = cursor.fetchall() | ||||
if results: | |||||
return True | |||||
return False | |||||
return bool(results)
def log_processed(self, pageid): | def log_processed(self, pageid): | ||||
"""Adds pageid to our database of processed pages. | """Adds pageid to our database of processed pages. | ||||
@@ -138,8 +138,8 @@ class AFCCopyvios(Task): | |||||
be) retained for one day; this task does not remove old entries (that | be) retained for one day; this task does not remove old entries (that | ||||
is handled by the Toolserver component). | is handled by the Toolserver component). | ||||
This will only be called if "cache_results" == True in the task's | |||||
config, which is False by default. | |||||
This will only be called if ``cache_results == True`` in the task's | |||||
config, which is ``False`` by default. | |||||
""" | """ | ||||
pageid = page.pageid | pageid = page.pageid | ||||
hash = sha256(page.get()).hexdigest() | hash = sha256(page.get()).hexdigest() | ||||
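The new ``safeurl`` line is there so the reported URL can be substituted into
the wikitext template without breaking on spaces or non-ASCII characters. A
small stdlib-only illustration of what that ``quote`` call does (the URL is
made up)::

    from urllib import quote

    url = u"http://example.com/caf\u00e9 page"
    safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8")
    # -> u"http://example.com/caf%C3%A9%20page"; "/" and ":" are kept so the
    #    URL structure survives, while spaces and non-ASCII bytes are escaped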
@@ -1,324 +0,0 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from collections import defaultdict | |||||
from functools import partial | |||||
from gzip import GzipFile | |||||
from json import loads | |||||
from re import sub, UNICODE | |||||
from StringIO import StringIO | |||||
from time import sleep, time | |||||
from urllib import quote_plus, urlencode | |||||
from urllib2 import build_opener, URLError | |||||
try: | |||||
import oauth2 as oauth | |||||
except ImportError: | |||||
oauth = None | |||||
from earwigbot.exceptions import * | |||||
class _CopyvioCheckResult(object): | |||||
def __init__(self, violation, confidence, url, queries, article, chains): | |||||
self.violation = violation | |||||
self.confidence = confidence | |||||
self.url = url | |||||
self.queries = queries | |||||
self.article_chain = article | |||||
self.source_chain = chains[0] | |||||
self.delta_chain = chains[1] | |||||
def __repr__(self): | |||||
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||||
return r.format(self.violation, self.confidence, self.url, self.queries) | |||||
class _MarkovChain(object): | |||||
START = -1 | |||||
END = -2 | |||||
def __init__(self, text): | |||||
self.text = text | |||||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||||
prev = self.START | |||||
for word in words: | |||||
self.chain[prev][word] += 1 | |||||
prev = word | |||||
try: # This won't work if the source text is completely blank | |||||
self.chain[word][self.END] += 1 | |||||
except KeyError: | |||||
pass | |||||
def size(self): | |||||
count = 0 | |||||
for node in self.chain.itervalues(): | |||||
for hits in node.itervalues(): | |||||
count += hits | |||||
return count | |||||
class _MarkovChainIntersection(_MarkovChain): | |||||
def __init__(self, mc1, mc2): | |||||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||||
c1 = mc1.chain | |||||
c2 = mc2.chain | |||||
for word, nodes1 in c1.iteritems(): | |||||
if word in c2: | |||||
nodes2 = c2[word] | |||||
for node, count1 in nodes1.iteritems(): | |||||
if node in nodes2: | |||||
count2 = nodes2[node] | |||||
self.chain[word][node] = min(count1, count2) | |||||
class CopyrightMixIn(object): | |||||
""" | |||||
EarwigBot's Wiki Toolset: Copyright Violation Mixin | |||||
This is a mixin that provides two public methods, copyvio_check() and | |||||
copyvio_compare(). The former checks the page for copyright violations | |||||
using a search engine API, and the latter compares the page against a | |||||
specified URL. Credentials for the search engine API are stored in the | |||||
site's config. | |||||
""" | |||||
def __init__(self, site): | |||||
self._opener = build_opener() | |||||
self._opener.addheaders = site._opener.addheaders | |||||
def _open_url_ignoring_errors(self, url): | |||||
"""Open a URL using self._opener and return its content, or None. | |||||
Will decompress the content if the headers contain "gzip" as its | |||||
content encoding, and will return None if URLError is raised while | |||||
opening the URL. IOErrors while gunzipping a compressed response are | |||||
ignored, and the original content is returned. | |||||
""" | |||||
try: | |||||
response = self._opener.open(url) | |||||
except URLError: | |||||
return None | |||||
result = response.read() | |||||
if response.headers.get("Content-Encoding") == "gzip": | |||||
stream = StringIO(result) | |||||
gzipper = GzipFile(fileobj=stream) | |||||
try: | |||||
result = gzipper.read() | |||||
except IOError: | |||||
pass | |||||
return result | |||||
def _select_search_engine(self): | |||||
"""Return a function that can be called to do web searches. | |||||
The "function" is a functools.partial object that takes one argument, a | |||||
query, and returns a list of URLs, ranked by importance. The underlying | |||||
logic depends on the 'engine' argument; for example, if 'engine' is | |||||
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. | |||||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||||
required package or module, like oauth2 for "Yahoo! BOSS". | |||||
""" | |||||
engine, credentials = self._site._search_config | |||||
if engine == "Yahoo! BOSS": | |||||
if not oauth: | |||||
e = "The package 'oauth2' could not be imported" | |||||
raise UnsupportedSearchEngineError(e) | |||||
searcher = self._yahoo_boss_query | |||||
else: | |||||
raise UnknownSearchEngineError(engine) | |||||
return partial(searcher, credentials) | |||||
def _yahoo_boss_query(self, cred, query): | |||||
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. | |||||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||||
determined by Yahoo). Raises SearchQueryError() on errors. | |||||
""" | |||||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||||
query = quote_plus(query.join('"', '"')) | |||||
params = {"q": query, "style": "raw", "format": "json"} | |||||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||||
consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) | |||||
client = oauth.Client(consumer) | |||||
headers, body = client.request(url, "GET") | |||||
if headers["status"] != "200": | |||||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||||
raise SearchQueryError(e.format(headers["status"], body)) | |||||
try: | |||||
res = loads(body) | |||||
except ValueError: | |||||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||||
raise SearchQueryError(e) | |||||
try: | |||||
results = res["bossresponse"]["web"]["results"] | |||||
except KeyError: | |||||
return [] | |||||
return [result["url"] for result in results] | |||||
def _copyvio_strip_html(self, html): | |||||
""" | |||||
STUB | |||||
""" | |||||
return html | |||||
def _copyvio_strip_article(self, content): | |||||
"""Clean the page's raw text by removing templates and formatting. | |||||
Returns the page's text with all HTML and wikicode formatting removed, | |||||
including templates, tables, references, and the Bibliography/ | |||||
References/Sources/See also section(s). It retains punctuation | |||||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||||
quotes) and original capitalization, but not brackets (square and | |||||
angular), abnormal spacing, nor anything else. HTML entities are | |||||
replaced by their unicode equivalents. | |||||
STUB | |||||
""" | |||||
return content | |||||
def _copyvio_chunk_article(self, content, max_chunks): | |||||
""" | |||||
STUB | |||||
""" | |||||
return [content] | |||||
def _copyvio_compare_content(self, article, url): | |||||
""" | |||||
DOCSTRING NEEDED | |||||
""" | |||||
html = self._open_url_ignoring_errors(url) | |||||
if not html: | |||||
return 0 | |||||
source = _MarkovChain(self._copyvio_strip_html(html)) | |||||
delta = _MarkovChainIntersection(article, source) | |||||
return float(delta.size()) / article.size(), (source, delta) | |||||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||||
interquery_sleep=1, force=False): | |||||
"""Check the page for copyright violations. | |||||
Returns a _CopyvioCheckResult object with four useful attributes: | |||||
"violation", "confidence", "url", and "queries". "confidence" is a | |||||
number between 0 and 1; if it is less than "min_confidence", we could | |||||
not find any indication of a violation (so "violation" will be False | |||||
and "url" may or may not be None), otherwise it indicates the relative | |||||
faith in our results, "violation" will be True, and "url" will be the | |||||
place the article is suspected of being copied from. "queries" is the | |||||
number of queries used to determine the results. | |||||
"max_queries" is self-explanatory; we will never make more than this | |||||
number of queries in a given check. If it's less than 0, we will not | |||||
limit our number of queries. | |||||
"interquery_sleep" is the minimum amount of time we will sleep between | |||||
search engine queries, in seconds. | |||||
"force" is simply passed to page.get() - it has the same behavior there | |||||
as it does here. | |||||
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, | |||||
SearchQueryError, ...) on errors. | |||||
""" | |||||
search = self._select_search_engine() | |||||
handled_urls = [] | |||||
best_confidence = 0 | |||||
best_match = None | |||||
num_queries = 0 | |||||
empty = _MarkovChain("") | |||||
best_chains = (empty, _MarkovChainIntersection(empty, empty)) | |||||
content = self.get(force) | |||||
clean = self._copyvio_strip_article(content) | |||||
chunks = self._copyvio_chunk_article(clean, max_queries) | |||||
article_chain = _MarkovChain(clean) | |||||
last_query = time() | |||||
if article_chain.size() < 20: # Auto-fail very small articles | |||||
return _CopyvioCheckResult(False, best_confidence, best_match, | |||||
num_queries, article_chain, best_chains) | |||||
while (chunks and best_confidence < min_confidence and | |||||
(max_queries < 0 or num_queries < max_queries)): | |||||
urls = search(chunks.pop(0)) | |||||
urls = [url for url in urls if url not in handled_urls] | |||||
for url in urls: | |||||
handled_urls.append(url) | |||||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||||
if conf > best_confidence: | |||||
best_confidence = conf | |||||
best_match = url | |||||
best_chains = chains | |||||
num_queries += 1 | |||||
diff = time() - last_query | |||||
if diff < interquery_sleep: | |||||
sleep(interquery_sleep - diff) | |||||
last_query = time() | |||||
if best_confidence >= min_confidence: # violation? | |||||
v = True | |||||
else: | |||||
v = False | |||||
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, | |||||
article_chain, best_chains) | |||||
def copyvio_compare(self, url, min_confidence=0.5, force=False): | |||||
"""Check the page like copyvio_check(), but against a specific URL. | |||||
This is essentially a reduced version of the above - a copyivo | |||||
comparison is made using Markov chains and the result is returned in a | |||||
_CopyvioCheckResult object - without using a search engine, as the | |||||
suspected "violated" URL is supplied from the start. | |||||
Its primary use is to generate a result when the URL is retrieved from | |||||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||||
search is done, the resulting URL is stored in a cache for 24 hours so | |||||
future checks against that page will not require another set of | |||||
time-and-money-consuming search engine queries. However, the comparison | |||||
itself (which includes the article's and the source's content) cannot | |||||
be stored for data retention reasons, so a fresh comparison is made | |||||
using this function. | |||||
Since no searching is done, neither UnknownSearchEngineError nor | |||||
SearchQueryError will be raised. | |||||
""" | |||||
content = self.get(force) | |||||
clean = self._copyvio_strip_article(content) | |||||
article_chain = _MarkovChain(clean) | |||||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||||
if confidence >= min_confidence: | |||||
is_violation = True | |||||
else: | |||||
is_violation = False | |||||
return _CopyvioCheckResult(is_violation, confidence, url, 0, | |||||
article_chain, chains) |
@@ -0,0 +1,229 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from gzip import GzipFile | |||||
from StringIO import StringIO | |||||
from time import sleep, time | |||||
from urllib2 import build_opener, URLError | |||||
try: | |||||
import oauth2 as oauth | |||||
except ImportError: | |||||
oauth = None | |||||
from earwigbot import exceptions | |||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||||
__all__ = ["CopyvioMixIn"] | |||||
class CopyvioMixIn(object): | |||||
""" | |||||
**EarwigBot: Wiki Toolset: Copyright Violation MixIn** | |||||
This is a mixin that provides two public methods, :py:meth:`copyvio_check` | |||||
and :py:meth:`copyvio_compare`. The former checks the page for copyright | |||||
violations using a search engine API, and the latter compares the page | |||||
against a given URL. Credentials for the search engine API are stored in | |||||
the :py:class:`~earwigbot.wiki.site.Site`'s config. | |||||
""" | |||||
def __init__(self, site): | |||||
self._search_config = site._search_config | |||||
self._exclusions_db = self._search_config["exclusions_db"] | |||||
self._opener = build_opener() | |||||
self._opener.addheaders = site._opener.addheaders | |||||
def _open_url_ignoring_errors(self, url): | |||||
"""Open a URL using self._opener and return its content, or None. | |||||
Will decompress the content if the headers contain "gzip" as its | |||||
content encoding, and will return None if URLError is raised while | |||||
opening the URL. IOErrors while gunzipping a compressed response are | |||||
ignored, and the original content is returned. | |||||
""" | |||||
try: | |||||
response = self._opener.open(url) | |||||
except URLError: | |||||
return None | |||||
result = response.read() | |||||
if response.headers.get("Content-Encoding") == "gzip": | |||||
stream = StringIO(result) | |||||
gzipper = GzipFile(fileobj=stream) | |||||
try: | |||||
result = gzipper.read() | |||||
except IOError: | |||||
pass | |||||
return result | |||||
def _select_search_engine(self): | |||||
"""Return a function that can be called to do web searches. | |||||
The function takes one argument, a search query, and returns a list of | |||||
URLs, ranked by importance. The underlying logic depends on the | |||||
*engine* argument within our config; for example, if *engine* is | |||||
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. | |||||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||||
required package or module, like oauth2 for "Yahoo! BOSS". | |||||
""" | |||||
engine = self._search_config["engine"] | |||||
credentials = self._search_config["credentials"] | |||||
if engine == "Yahoo! BOSS": | |||||
if not oauth: | |||||
e = "The package 'oauth2' could not be imported" | |||||
raise exceptions.UnsupportedSearchEngineError(e) | |||||
return YahooBOSSSearchEngine(credentials) | |||||
raise exceptions.UnknownSearchEngineError(engine) | |||||
def _copyvio_compare_content(self, article, url): | |||||
"""Return a number comparing an article and a URL. | |||||
The *article* is a Markov chain, whereas the *url* is just a string | |||||
that we'll try to open and read ourselves. | |||||
""" | |||||
html = self._open_url_ignoring_errors(url) | |||||
if not html:
    empty = MarkovChain("")
    return 0, (empty, MarkovChainIntersection(empty, empty))
source = MarkovChain(HTMLTextParser(html).strip()) | |||||
delta = MarkovChainIntersection(article, source) | |||||
return float(delta.size()) / article.size(), (source, delta) | |||||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||||
interquery_sleep=1): | |||||
"""Check the page for copyright violations. | |||||
Returns a | |||||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||||
with information on the results of the check. | |||||
*max_queries* is self-explanatory; we will never make more than this | |||||
number of queries in a given check. If it's lower than 0, we will not | |||||
limit the number of queries. | |||||
*interquery_sleep* is the minimum amount of time we will sleep between | |||||
search engine queries, in seconds. | |||||
Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses | |||||
(:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, | |||||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | |||||
""" | |||||
searcher = self._select_search_engine() | |||||
self._exclusions_db.sync(self.site.name) | |||||
handled_urls = [] | |||||
best_confidence = 0 | |||||
best_match = None | |||||
num_queries = 0 | |||||
empty = MarkovChain("") | |||||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | |||||
parser = ArticleTextParser(self.get()) | |||||
clean = parser.strip() | |||||
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) | |||||
article_chain = MarkovChain(clean) | |||||
last_query = time() | |||||
if article_chain.size() < 20: # Auto-fail very small articles | |||||
return CopyvioCheckResult(False, best_confidence, best_match, | |||||
num_queries, article_chain, best_chains) | |||||
while (chunks and best_confidence < min_confidence and | |||||
(max_queries < 0 or num_queries < max_queries)): | |||||
chunk = chunks.pop(0) | |||||
log = u"[[{0}]] -> querying {1} for {2!r}" | |||||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||||
urls = searcher.search(chunk) | |||||
urls = [url for url in urls if url not in handled_urls] | |||||
for url in urls: | |||||
handled_urls.append(url) | |||||
if self._exclusions_db.check(self.site.name, url): | |||||
continue | |||||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||||
if conf > best_confidence: | |||||
best_confidence = conf | |||||
best_match = url | |||||
best_chains = chains | |||||
num_queries += 1 | |||||
diff = time() - last_query | |||||
if diff < interquery_sleep: | |||||
sleep(interquery_sleep - diff) | |||||
last_query = time() | |||||
if best_confidence >= min_confidence: | |||||
is_violation = True | |||||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" | |||||
self._logger.debug(log.format(self.title, best_confidence, | |||||
best_match, num_queries)) | |||||
else: | |||||
is_violation = False | |||||
log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" | |||||
self._logger.debug(log.format(self.title, best_confidence, | |||||
num_queries)) | |||||
return CopyvioCheckResult(is_violation, best_confidence, best_match, | |||||
num_queries, article_chain, best_chains) | |||||
def copyvio_compare(self, url, min_confidence=0.5): | |||||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||||
This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a | |||||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||||
but without using a search engine, since the suspected "violated" URL | |||||
is supplied from the start. | |||||
Its primary use is to generate a result when the URL is retrieved from | |||||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||||
search is done, the resulting URL is stored in a cache for 24 hours so | |||||
future checks against that page will not require another set of | |||||
time-and-money-consuming search engine queries. However, the comparison | |||||
itself (which includes the article's and the source's content) cannot | |||||
be stored for data retention reasons, so a fresh comparison is made | |||||
using this function. | |||||
Since no searching is done, neither | |||||
:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor | |||||
:py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. | |||||
""" | |||||
content = self.get() | |||||
clean = ArticleTextParser(content).strip() | |||||
article_chain = MarkovChain(clean) | |||||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||||
if confidence >= min_confidence: | |||||
is_violation = True | |||||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" | |||||
self._logger.debug(log.format(self.title, confidence, url)) | |||||
else: | |||||
is_violation = False | |||||
log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" | |||||
self._logger.debug(log.format(self.title, confidence, url)) | |||||
return CopyvioCheckResult(is_violation, confidence, url, 0, | |||||
article_chain, chains) |
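For context, a rough usage sketch of the new mixin as exposed through
:py:class:`Page` (the working directory and page title are placeholders, and
this assumes search credentials and an exclusions database are already set up
in the site's config)::

    from earwigbot.bot import Bot

    bot = Bot("path/to/working/dir")         # directory holding the bot's config
    page = bot.wiki.get_site().get_page("Some article")

    result = page.copyvio_check(min_confidence=0.5, max_queries=10)
    print "violation? ", result.violation
    print "confidence:", result.confidence
    print "best match:", result.url
    print "queries used:", result.queries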
@@ -0,0 +1,164 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
import re | |||||
import sqlite3 as sqlite | |||||
from threading import Lock | |||||
from time import time | |||||
from earwigbot import exceptions | |||||
__all__ = ["ExclusionsDB"] | |||||
default_sources = { | |||||
"enwiki": [ | |||||
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", | |||||
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", | |||||
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", | |||||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" | |||||
] | |||||
} | |||||
class ExclusionsDB(object): | |||||
""" | |||||
**EarwigBot: Wiki Toolset: Exclusions Database Manager** | |||||
Controls the :file:`.exclusions.db` file, which stores URLs excluded from | |||||
copyright violation checks on account of being known mirrors, for example. | |||||
""" | |||||
def __init__(self, sitesdb, dbfile, logger): | |||||
self._sitesdb = sitesdb | |||||
self._dbfile = dbfile | |||||
self._logger = logger | |||||
self._db_access_lock = Lock() | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the ExclusionsDB.""" | |||||
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})" | |||||
return res.format(self._sitesdb, self._dbfile, self._logger) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the ExclusionsDB.""" | |||||
return "<ExclusionsDB at {0}>".format(self._dbfile) | |||||
def _create(self): | |||||
"""Initialize the exclusions database with its necessary tables.""" | |||||
script = """ | |||||
CREATE TABLE sources (source_sitename, source_page); | |||||
CREATE TABLE updates (update_sitename, update_time); | |||||
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | |||||
""" | |||||
query = "INSERT INTO sources VALUES (?, ?);" | |||||
sources = [] | |||||
for sitename, pages in default_sources.iteritems(): | |||||
[sources.append((sitename, page)) for page in pages] | |||||
with sqlite.connect(self._dbfile) as conn: | |||||
conn.executescript(script) | |||||
conn.executemany(query, sources) | |||||
def _load_source(self, site, source): | |||||
"""Load from a specific source and return a set of URLs.""" | |||||
urls = set() | |||||
try: | |||||
data = site.get_page(source).get() | |||||
except exceptions.PageNotFoundError: | |||||
return urls | |||||
regexes = [ | |||||
"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>", | |||||
"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" | |||||
] | |||||
for regex in regexes: | |||||
[urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] | |||||
return urls | |||||
def _update(self, sitename): | |||||
"""Update the database from listed sources in the index.""" | |||||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" | |||||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||||
query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" | |||||
query4 = "INSERT INTO exclusions VALUES (?, ?);" | |||||
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" | |||||
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" | |||||
query7 = "INSERT INTO updates VALUES (?, ?);" | |||||
site = self._sitesdb.get_site(sitename) | |||||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||||
urls = set() | |||||
for (source,) in conn.execute(query1, (sitename,)): | |||||
urls |= self._load_source(site, source) | |||||
for (url,) in conn.execute(query2, (sitename,)): | |||||
if url in urls: | |||||
urls.remove(url) | |||||
else: | |||||
conn.execute(query3, (sitename, url)) | |||||
conn.executemany(query4, [(sitename, url) for url in urls]) | |||||
if conn.execute(query5, (sitename,)).fetchone():
conn.execute(query6, (time(), sitename)) | |||||
else: | |||||
conn.execute(query7, (sitename, time())) | |||||
def _get_last_update(self, sitename): | |||||
"""Return the UNIX timestamp of the last time the db was updated.""" | |||||
query = "SELECT update_time FROM updates WHERE update_sitename = ?;" | |||||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||||
try: | |||||
result = conn.execute(query, (sitename,)).fetchone() | |||||
except sqlite.OperationalError: | |||||
self._create() | |||||
return 0 | |||||
return result[0] if result else 0 | |||||
def sync(self, sitename): | |||||
"""Update the database if it hasn't been updated in the past month. | |||||
This only updates the exclusions database for the *sitename* site. | |||||
""" | |||||
max_staleness = 60 * 60 * 24 * 30 | |||||
time_since_update = int(time() - self._get_last_update(sitename))
if time_since_update > max_staleness: | |||||
log = u"Updating stale database: {0} (last updated {1} seconds ago)" | |||||
self._logger.info(log.format(sitename, time_since_update)) | |||||
self._update(sitename) | |||||
else: | |||||
log = u"Database for {0} is still fresh (last updated {1} seconds ago)" | |||||
self._logger.debug(log.format(sitename, time_since_update)) | |||||
def check(self, sitename, url): | |||||
"""Check whether a given URL is in the exclusions database. | |||||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | |||||
""" | |||||
normalized = re.sub("https?://", "", url.lower()) | |||||
query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||||
for row in conn.execute(query, (sitename,)): | |||||
if normalized.startswith(row[0]): | |||||
log = u"Exclusion detected in {0} for {1}" | |||||
self._logger.debug(log.format(sitename, url)) | |||||
return True | |||||
log = u"No exclusions in {0} for {1}".format(sitename, url) | |||||
self._logger.debug(log) | |||||
return False |
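The mixin expects an :py:class:`ExclusionsDB` instance under
``site._search_config["exclusions_db"]`` (see its constructor above); within
:py:meth:`copyvio_check` it is used roughly like this (``exclusions_db`` and
``candidate_urls`` are hypothetical names here)::

    exclusions_db.sync("enwiki")             # re-pull sources if > 30 days old
    for url in candidate_urls:
        if exclusions_db.check("enwiki", url):
            continue                         # known mirror/fork; don't compare
        # ...otherwise, run the Markov-chain comparison against this URL...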
@@ -0,0 +1,87 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from collections import defaultdict | |||||
from re import sub, UNICODE | |||||
__all__ = ["MarkovChain", "MarkovChainIntersection"] | |||||
class MarkovChain(object): | |||||
"""Implements a basic ngram Markov chain of words.""" | |||||
START = -1 | |||||
END = -2 | |||||
degree = 3 # 2 for bigrams, 3 for trigrams, etc. | |||||
def __init__(self, text): | |||||
self.text = text | |||||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||||
padding = self.degree - 1 | |||||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||||
for i in range(len(words) - self.degree + 1): | |||||
last = i + self.degree - 1 | |||||
self.chain[tuple(words[i:last])][words[last]] += 1
def __repr__(self): | |||||
"""Return the canonical string representation of the MarkovChain.""" | |||||
return "MarkovChain(text={0!r})".format(self.text) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the MarkovChain.""" | |||||
return "<MarkovChain of size {0}>".format(self.size()) | |||||
def size(self): | |||||
"""Return the size of the Markov chain: the total number of nodes.""" | |||||
count = 0 | |||||
for node in self.chain.itervalues(): | |||||
for hits in node.itervalues(): | |||||
count += hits | |||||
return count | |||||
class MarkovChainIntersection(MarkovChain): | |||||
"""Implements the intersection of two chains (i.e., their shared nodes).""" | |||||
def __init__(self, mc1, mc2): | |||||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||||
self.mc1, self.mc2 = mc1, mc2 | |||||
c1 = mc1.chain | |||||
c2 = mc2.chain | |||||
for word, nodes1 in c1.iteritems(): | |||||
if word in c2: | |||||
nodes2 = c2[word] | |||||
for node, count1 in nodes1.iteritems(): | |||||
if node in nodes2: | |||||
count2 = nodes2[node] | |||||
self.chain[word][node] = min(count1, count2) | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the intersection.""" | |||||
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" | |||||
return res.format(self.mc1, self.mc2) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the intersection.""" | |||||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||||
return res.format(self.size(), self.mc1, self.mc2) |
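The confidence figure used by the copyvio check is simply the size of the
intersection divided by the size of the article's chain. A toy example with
the classes above::

    from earwigbot.wiki.copyvios.markov import (MarkovChain,
                                                MarkovChainIntersection)

    article = MarkovChain(u"the quick brown fox jumps over the lazy dog")
    source = MarkovChain(u"the quick brown fox naps beside the lazy dog")
    delta = MarkovChainIntersection(article, source)
    confidence = float(delta.size()) / article.size()  # shared n-grams / article n-grams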
@@ -0,0 +1,148 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from os import path | |||||
try:
    import bs4
    from bs4 import BeautifulSoup
except ImportError:
    bs4 = BeautifulSoup = None
try: | |||||
import mwparserfromhell | |||||
except ImportError: | |||||
mwparserfromhell = None | |||||
try: | |||||
import nltk | |||||
except ImportError: | |||||
nltk = None | |||||
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] | |||||
class BaseTextParser(object): | |||||
"""Base class for a parser that handles text.""" | |||||
def __init__(self, text): | |||||
self.text = text | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the text parser.""" | |||||
return "{0}(text={1!r})".format(self.__class__.__name__, self.text) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the text parser.""" | |||||
name = self.__class__.__name__ | |||||
return "<{0} of text with size {1}>".format(name, len(text)) | |||||
class ArticleTextParser(BaseTextParser): | |||||
"""A parser that can strip and chunk wikicode article text.""" | |||||
def strip(self): | |||||
"""Clean the page's raw text by removing templates and formatting. | |||||
Return the page's text with all HTML and wikicode formatting removed, | |||||
including templates, tables, and references. It retains punctuation | |||||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||||
quotes), original capitalization, and so forth. HTML entities are | |||||
replaced by their unicode equivalents. | |||||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | |||||
""" | |||||
wikicode = mwparserfromhell.parse(self.text) | |||||
self.clean = wikicode.strip_code(normalize=True) | |||||
return self.clean | |||||
def chunk(self, nltk_dir, max_chunks, max_query=256): | |||||
"""Convert the clean article text into a list of web-searchable chunks. | |||||
No more than *max_chunks* chunks will be returned. Each chunk will only
be a sentence or two long at most (no more than *max_query* characters).
The idea is to return a sample of the article text rather than the
whole, so we'll pick and choose from different parts of it, especially
if the article is large and *max_chunks* is low, so we don't end up
searching for just the first paragraph.
This is implemented using :py:mod:`nltk` (http://nltk.org/). A base | |||||
directory (*nltk_dir*) is required to store nltk's punctuation | |||||
database. This is typically located in the bot's working directory. | |||||
""" | |||||
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") | |||||
try: | |||||
tokenizer = nltk.data.load("file:" + datafile) | |||||
except LookupError: | |||||
nltk.download("punkt", nltk_dir) | |||||
tokenizer = nltk.data.load("file:" + datafile) | |||||
sentences = [] | |||||
for sentence in tokenizer.tokenize(self.clean): | |||||
if len(sentence) > max_query: | |||||
words = sentence.split() | |||||
while len(" ".join(words)) > max_query: | |||||
words.pop() | |||||
sentence = " ".join(words) | |||||
sentences.append(sentence) | |||||
if max_chunks >= len(sentences): | |||||
return sentences | |||||
chunks = [] | |||||
while len(chunks) < max_chunks: | |||||
if len(chunks) % 5 == 0: | |||||
chunk = sentences.pop(0) # Pop from beginning | |||||
elif len(chunks) % 5 == 1: | |||||
chunk = sentences.pop() # Pop from end | |||||
elif len(chunks) % 5 == 2: | |||||
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 | |||||
elif len(chunks) % 5 == 3: | |||||
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 | |||||
else: | |||||
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 | |||||
chunks.append(chunk) | |||||
return chunks | |||||
class HTMLTextParser(BaseTextParser): | |||||
"""A parser that can extract the text from an HTML document.""" | |||||
hidden_tags = [ | |||||
"script", "style" | |||||
] | |||||
def strip(self): | |||||
"""Return the actual text contained within an HTML document. | |||||
Implemented using :py:mod:`BeautifulSoup <bs4>` | |||||
(http://www.crummy.com/software/BeautifulSoup/). | |||||
""" | |||||
try: | |||||
soup = BeautifulSoup(self.text, "lxml").body | |||||
except ValueError: | |||||
soup = BeautifulSoup(self.text).body | |||||
is_comment = lambda text: isinstance(text, bs4.element.Comment) | |||||
[comment.extract() for comment in soup.find_all(text=is_comment)] | |||||
for tag in self.hidden_tags: | |||||
[element.extract() for element in soup.find_all(tag)] | |||||
return "\n".join(soup.stripped_strings) |
@@ -0,0 +1,60 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
__all__ = ["CopyvioCheckResult"] | |||||
class CopyvioCheckResult(object): | |||||
""" | |||||
**EarwigBot: Wiki Toolset: Copyvio Check Result** | |||||
A class holding information about the results of a copyvio check. | |||||
*Attributes:* | |||||
- :py:attr:`violation`: ``True`` if this is a violation, else ``False`` | |||||
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy | |||||
- :py:attr:`url`: the URL of the violated page | |||||
- :py:attr:`queries`: the number of queries used to reach a result | |||||
- :py:attr:`article_chain`: the MarkovChain of the article text | |||||
- :py:attr:`source_chain`: the MarkovChain of the violated page text | |||||
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two | |||||
""" | |||||
def __init__(self, violation, confidence, url, queries, article, chains): | |||||
self.violation = violation | |||||
self.confidence = confidence | |||||
self.url = url | |||||
self.queries = queries | |||||
self.article_chain = article | |||||
self.source_chain = chains[0] | |||||
self.delta_chain = chains[1] | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the result.""" | |||||
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||||
return res.format(self.violation, self.confidence, self.url, | |||||
self.queries) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the result.""" | |||||
res = "<CopyvioCheckResult ({0} with {1} conf)>" | |||||
return res.format(self.violation, self.confidence) |
@@ -0,0 +1,94 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from json import loads | |||||
from urllib import quote_plus, urlencode | |||||
try: | |||||
import oauth2 as oauth | |||||
except ImportError: | |||||
oauth = None | |||||
from earwigbot.exceptions import SearchQueryError | |||||
__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] | |||||
class BaseSearchEngine(object): | |||||
"""Base class for a simple search engine interface.""" | |||||
name = "Base" | |||||
def __init__(self, cred): | |||||
"""Store credentials *cred* for searching later on.""" | |||||
self.cred = cred | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the search engine.""" | |||||
return "{0}()".format(self.__class__.__name__) | |||||
def __str__(self): | |||||
"""Return a nice string representation of the search engine.""" | |||||
return "<{0}>".format(self.__class__.__name__) | |||||
def search(self, query): | |||||
"""Use this engine to search for *query*. | |||||
Not implemented in this base class; overridden in subclasses. | |||||
""" | |||||
raise NotImplementedError() | |||||
class YahooBOSSSearchEngine(BaseSearchEngine): | |||||
"""A search engine interface with Yahoo! BOSS.""" | |||||
name = "Yahoo! BOSS" | |||||
def search(self, query): | |||||
"""Do a Yahoo! BOSS web search for *query*. | |||||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||||
determined by Yahoo). Raises | |||||
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||||
""" | |||||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||||
query = '"' + query.encode("utf8") + '"'
params = {"q": query, "type": "html,text", "format": "json"} | |||||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||||
consumer = oauth.Consumer(key=self.cred["key"], | |||||
secret=self.cred["secret"]) | |||||
client = oauth.Client(consumer) | |||||
headers, body = client.request(url, "GET") | |||||
if headers["status"] != "200": | |||||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||||
raise SearchQueryError(e.format(headers["status"], body)) | |||||
try: | |||||
res = loads(body) | |||||
except ValueError: | |||||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||||
raise SearchQueryError(e) | |||||
try: | |||||
results = res["bossresponse"]["web"]["results"] | |||||
except KeyError: | |||||
return [] | |||||
return [result["url"] for result in results] |
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 from hashlib import md5
+from logging import getLogger, NullHandler
 import re
 from time import gmtime, strftime
 from urllib import quote
@@ -31,11 +32,11 @@ except ImportError:
     mwparserfromhell = None
 
 from earwigbot import exceptions
-from earwigbot.wiki.copyright import CopyrightMixIn
+from earwigbot.wiki.copyvios import CopyvioMixIn
 
 __all__ = ["Page"]
 
-class Page(CopyrightMixIn):
+class Page(CopyvioMixIn):
     """
     **EarwigBot: Wiki Toolset: Page**
 
@@ -81,7 +82,8 @@ class Page(CopyrightMixIn):
     PAGE_MISSING = 2
     PAGE_EXISTS = 3
 
-    def __init__(self, site, title, follow_redirects=False, pageid=None):
+    def __init__(self, site, title, follow_redirects=False, pageid=None,
+                 logger=None):
         """Constructor for new Page instances.
 
         Takes four arguments: a Site object, the Page's title (or pagename),
@@ -100,6 +102,14 @@ class Page(CopyrightMixIn):
         self._follow_redirects = self._keep_following = follow_redirects
         self._pageid = pageid
 
+        # Set up our internal logger:
+        if logger:
+            self._logger = logger
+        else: # Just set up a null logger to eat up our messages:
+            self._logger = getLogger("earwigbot.wiki")
+            self._logger.addHandler(NullHandler())
+
+        # Attributes to be loaded through the API:
         self._exists = self.PAGE_UNKNOWN
         self._is_redirect = None
         self._lastrevid = None
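A short sketch of the new ``logger`` keyword in action; the logger name is illustrative, and ``site`` stands for an existing :py:class:`~earwigbot.wiki.site.Site` instance obtained elsewhere. Without the argument, messages fall into the ``NullHandler`` set up above::

    import logging

    from earwigbot.wiki.page import Page

    log = logging.getLogger("mybot.wiki")  # Any configured logger will do
    page = Page(site, u"Main Page", logger=log)  # Page messages now reach "mybot.wiki"

In practice the ``Site.get_page()`` change further down passes the site's own child logger automatically, so most callers never touch this parameter.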
@@ -92,7 +92,7 @@ class Site(object):
                  namespaces=None, login=(None, None), cookiejar=None,
                  user_agent=None, use_https=False, assert_edit=None,
                  maxlag=None, wait_between_queries=3, logger=None,
-                 search_config=(None, None)):
+                 search_config=None):
         """Constructor for new Site instances.
 
         This probably isn't necessary to call yourself unless you're building a
@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
 
         sqllag = self._sql_info_cache["replag"]
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                     self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
 
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
             return [self.SERVICE_API, self.SERVICE_SQL]
 
@@ -789,8 +789,9 @@ class Site(object):
         prefix = title.split(":", 1)[0]
         if prefix != title: # Avoid a page that is simply "Category"
             if prefix in prefixes:
-                return Category(self, title, follow_redirects, pageid)
-        return Page(self, title, follow_redirects, pageid)
+                return Category(self, title, follow_redirects, pageid,
+                                self._logger)
+        return Page(self, title, follow_redirects, pageid, self._logger)
 
     def get_category(self, catname, follow_redirects=False, pageid=None):
         """Return a :py:class:`Category` object for the given category name.
@@ -802,7 +803,7 @@ class Site(object):
         catname = self._unicodeify(catname)
         prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
         pagename = u':'.join((prefix, catname))
-        return Category(self, pagename, follow_redirects, pageid)
+        return Category(self, pagename, follow_redirects, pageid, self._logger)
 
     def get_user(self, username=None):
         """Return a :py:class:`User` object for the given username.
@@ -815,7 +816,7 @@ class Site(object):
             username = self._unicodeify(username)
         else:
             username = self._get_username()
-        return User(self, username)
+        return User(self, username, self._logger)
 
     def delegate(self, services, args=None, kwargs=None):
         """Delegate a task to either the API or SQL depending on conditions.
@@ -29,6 +29,7 @@ import sqlite3 as sqlite
 
 from earwigbot import __version__
 from earwigbot.exceptions import SiteNotFoundError
+from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
 from earwigbot.wiki.site import Site
 
 __all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
         """Set up the manager with an attribute for the base Bot object."""
         self.config = bot.config
         self._logger = bot.logger.getChild("wiki")
 
         self._sites = {} # Internal site cache
         self._sitesdb = path.join(bot.config.root_dir, "sites.db")
         self._cookie_file = path.join(bot.config.root_dir, ".cookies")
         self._cookiejar = None
+
+        excl_db = path.join(bot.config.root_dir, "exclusions.db")
+        excl_logger = self._logger.getChild("exclusionsdb")
+        self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)
 
     def __repr__(self):
         """Return the canonical string representation of the SitesDB."""
         res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -192,6 +198,17 @@ class SitesDB(object):
             user_agent = user_agent.replace("$1", __version__)
             user_agent = user_agent.replace("$2", python_version())
 
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
+
+        if not sql:
+            sql = config.wiki.get("sql", {})
+            for key, value in sql.iteritems():
+                if "$1" in value:
+                    sql[key] = value.replace("$1", name)
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -332,13 +349,12 @@ class SitesDB(object):
         the script path (meaning the API is located at
         ``"{base_url}{script_path}/api.php"`` ->
         ``"//{lang}.{project}.org/w/api.php"``), so this is the default. If
-        your wiki is different, provide the script_path as an argument. The
-        only other argument to :py:class:`~earwigbot.wiki.site.Site` that we
-        can't get from config files or by querying the wiki itself is SQL
-        connection info, so provide a dict of kwargs as *sql* and Site will
-        pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing
-        you to make queries with :py:meth:`site.sql_query
-        <earwigbot.wiki.site.Site.sql_query>`.
+        your wiki is different, provide the script_path as an argument. SQL
+        connection settings are guessed automatically using config's template
+        value. If this is wrong or not specified, provide a dict of kwargs as
+        *sql* and Site will pass it to :py:func:`oursql.connect(**sql)
+        <oursql.connect>`, allowing you to make queries with
+        :py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
 
         Returns ``True`` if the site was added successfully or ``False`` if the
         site is already in our sitesdb (this can be done purposefully to update
@@ -359,15 +375,31 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
+        logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 
+        if user_agent:
+            user_agent = user_agent.replace("$1", __version__)
+            user_agent = user_agent.replace("$2", python_version())
+
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
+
+        if not sql:
+            sql = config.wiki.get("sql", {})
+            for key, value in sql.iteritems():
+                if "$1" in value:
+                    sql[key] = value.replace("$1", name)
+
         # Create a Site object to log in and load the other attributes:
         site = Site(base_url=base_url, script_path=script_path, sql=sql,
                     login=login, cookiejar=cookiejar, user_agent=user_agent,
                     use_https=use_https, assert_edit=assert_edit,
                     maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)
 
         self._add_site_to_sitesdb(site)
         self._sites[site.name] = site
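To make the ``"$1"`` substitution above concrete, here is a sketch of what a templated SQL block might look like before and after ``add_site()`` fills in the site name. The key names are ordinary :py:func:`oursql.connect` keyword arguments chosen for illustration, not values required by EarwigBot::

    # Hypothetical config.wiki["sql"] template; "$1" stands for the site name:
    sql = {"host": "$1.labsdb", "db": "$1_p"}

    name = "enwiki"
    for key, value in sql.iteritems():
        if "$1" in value:
            sql[key] = value.replace("$1", name)

    # sql == {"host": "enwiki.labsdb", "db": "enwiki_p"}; the Site object will
    # eventually hand these straight to oursql.connect(**sql).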
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+from logging import getLogger, NullHandler
 from time import gmtime, strptime
 
 from earwigbot.exceptions import UserNotFoundError
@@ -60,7 +61,7 @@ class User(object):
     talkpage
     """
 
-    def __init__(self, site, name):
+    def __init__(self, site, name, logger=None):
        """Constructor for new User instances.
 
         Takes two arguments, a Site object (necessary for doing API queries),
@@ -76,6 +77,13 @@ class User(object):
         self._site = site
         self._name = name
 
+        # Set up our internal logger:
+        if logger:
+            self._logger = logger
+        else: # Just set up a null logger to eat up our messages:
+            self._logger = getLogger("earwigbot.wiki")
+            self._logger.addHandler(NullHandler())
+
     def __repr__(self):
         """Return the canonical string representation of the User."""
         return "User(name={0!r}, site={1!r})".format(self._name, self._site)
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages
 
 from earwigbot import __version__
 
+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+dependencies = [
+    "GitPython >= 0.3.2.RC1", # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10", # Parsing config files
+    "beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4", # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1", # Parsing wikicode for manipulation
+    "nltk >= 2.0.2", # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3", # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios
+    "py-bcrypt >= 0.2", # Hashing the bot key in the config file
+    "pycrypto >= 2.5", # Storing bot passwords and keys in the config file
+    "pytz >= 2012c", # Handling timezones for the !time IRC command
+]
+
 with open("README.rst") as fp:
     long_docs = fp.read()
@@ -32,15 +51,7 @@ setup(
     name = "earwigbot",
     packages = find_packages(exclude=("tests",)),
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
-    install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git
-                        "PyYAML >= 3.10", # Config parsing
-                        "mwparserfromhell >= 0.1", # Wikicode parsing
-                        "oursql >= 0.9.3", # Talking with MediaWiki databases
-                        "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search
-                        "py-bcrypt >= 0.2", # Password hashing in config
-                        "pycrypto >= 2.5", # Storing bot passwords and keys
-                        "pytz >= 2012c", # Timezone handling
-                       ],
+    install_requires = dependencies,
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",