From 24f7eabb7711fcc3b4b65046136c7a04e8f34259 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 26 Dec 2011 14:04:23 -0500 Subject: [PATCH] Some more work on copyvio detection code Also removed the hardcoded version in user-agent strings. --- earwigbot/commands/ctcp.py | 6 ++- earwigbot/tasks/afc_copyvios.py | 17 ++++--- earwigbot/wiki/constants.py | 3 +- earwigbot/wiki/copyright.py | 108 ++++++++++++++++++++++++++++++---------- earwigbot/wiki/functions.py | 4 +- earwigbot/wiki/page.py | 1 + 6 files changed, 103 insertions(+), 36 deletions(-) diff --git a/earwigbot/commands/ctcp.py b/earwigbot/commands/ctcp.py index 81073a8..41541c7 100644 --- a/earwigbot/commands/ctcp.py +++ b/earwigbot/commands/ctcp.py @@ -23,6 +23,7 @@ import platform import time +import earwigbot from earwigbot.classes import BaseCommand from earwigbot.config import config @@ -61,7 +62,8 @@ class Command(BaseCommand): self.connection.notice(target, "\x01TIME {0}\x01".format(ts)) elif command == "VERSION": - default = "EarwigBot - 0.1-dev - Python/$1 https://github.com/earwig/earwigbot" + default = "EarwigBot - $1 - Python/$2 https://github.com/earwig/earwigbot" vers = config.irc.get("version", default) - vers = vers.replace("$1", platform.python_version()) + vers = vers.replace("$1", earwigbot.__version__) + vers = vers.replace("$2", platform.python_version()) self.connection.notice(target, "\x01VERSION {0}\x01".format(vers)) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index c90043e..85a4895 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -89,22 +89,25 @@ class Task(BaseTask): return self.logger.info("Checking [[{0}]]".format(title)) - content = page.get() result = page.copyvio_check(self.engine, self.credentials, self.min_confidence, self.max_queries) - if result.url: - url = result.url + url = result.url + confidence = "{0}%".format(round(result.confidence * 100, 2)) + + if result.violation: content = page.get() - template = "\{\{{0}|url={1}\}\}".format(self.template, url) + template = "\{\{{0}|url={1}|confidence={2}\}\}" + template = template.format(self.template, url, confidence) newtext = "\n".join((template, content)) if "{url}" in self.summary: page.edit(newtext, self.summary.format(url=url)) else: page.edit(newtext, self.summary) - msg = "Found violation: [[{0}]] -> {1}" - self.logger.warn(msg.format(title, url)) + msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" + self.logger.warn(msg.format(title, url, confidence)) else: - self.logger.debug("No violations detected") + msg = "No violations detected (best: {1} at {2} confidence)" + self.logger.debug(msg.format(url, confidence)) self.log_processed(pageid) diff --git a/earwigbot/wiki/constants.py b/earwigbot/wiki/constants.py index 85e8118..eb094d6 100644 --- a/earwigbot/wiki/constants.py +++ b/earwigbot/wiki/constants.py @@ -31,8 +31,9 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const """ # Default User Agent when making API queries: +from earwigbot import __version__ as _v from platform import python_version as _p -USER_AGENT = "EarwigBot/0.1-dev (Python/{0}; https://github.com/earwig/earwigbot)".format(_p()) +USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) # Default namespace IDs: NS_MAIN = 0 diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py index 0a86a9d..65be2b9 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyright.py @@ -20,9 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from functools import partial +from gzip import GzipFile from json import loads +from StringIO import StringIO from time import sleep, time from urllib import quote_plus, urlencode +from urllib2 import build_opener, URLError try: import oauth2 as oauth @@ -32,14 +36,15 @@ except ImportError: from earwigbot.wiki.exceptions import * class CopyvioCheckResult(object): - def __init__(self, confidence, url, queries): + def __init__(self, violation, confidence, url, queries): + self.violation = violation self.confidence = confidence self.url = url self.queries = queries def __repr__(self): - r = "CopyvioCheckResult(confidence={0!r}, url={1!r}, queries={2|r})" - return r.format(self.confidence, self.url, self.queries) + r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + return r.format(self.violation, self.confidence, self.url, self.queries) class CopyrightMixin(object): @@ -50,7 +55,57 @@ class CopyrightMixin(object): checks the page for copyright violations using a search engine API. The API keys must be provided to the method as arguments. """ - def _yahoo_boss_query(self, query, cred): + def __init__(self): + self._opener = build_opener() + self._opener.addheaders = self._site._opener.addheaders + + def _open_url_ignoring_errors(self, url): + """Open a URL using self._opener and return its content, or None. + + Will decompress the content if the headers contain "gzip" as its + content encoding, and will return None if URLError is raised while + opening the URL. IOErrors while gunzipping a compressed response are + ignored, and the original content is returned. + """ + try: + response = self._opener.open(url) + except URLError: + return None + result = response.read() + + if response.headers.get("Content-Encoding") == "gzip": + stream = StringIO(result) + gzipper = GzipFile(fileobj=stream) + try: + result = gzipper.read() + except IOError: + pass + + return result + + def _select_search_engine(self, engine, credentials): + """Return a function that can be called to do web searches. + + The "function" is a functools.partial object that takes one argument, a + query, and returns a list of URLs, ranked by importance. The underlying + logic depends on the 'engine' argument; for example, if 'engine' is + "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. + + Raises UnknownSearchEngineError if 'engine' is not known to us, and + UnsupportedSearchEngineError if we are missing a required package or + module, like oauth2 for "Yahoo! BOSS". + """ + if engine == "Yahoo! BOSS": + if not oauth: + e = "The package 'oauth2' could not be imported" + raise UnsupportedSearchEngineError(e) + searcher = self._yahoo_boss_query + else: + raise UnknownSearchEngineError(engine) + + return partial(searcher, credentials) + + def _yahoo_boss_query(self, cred, query): """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. Returns a list of URLs, no more than fifty, ranked by relevance (as @@ -84,21 +139,27 @@ class CopyrightMixin(object): def _copyvio_strip_content(self, content): return content - def _copyvio_explode_content(self, content): - return content + def _copyvio_chunk_content(self, content): + return [content] def _copyvio_compare_content(self, content, url): - return 0 + html = self._open_url_ignoring_errors(url) + if not html: + return 0 + + confidence = 0 + return confidence - def copyvio_check(self, engine, credentials, min_confidence=0.5, + def copyvio_check(self, engine, credentials, min_confidence=0.75, max_queries=-1, interquery_sleep=1, force=False): """Check the page for copyright violations. - Returns a CopyvioCheckResult object, with three useful attributes: - "confidence", "url", and "queries". "confidence" is a number between - 0 and 1; if it is less than min_confidence, we could not find any - indication of a violation (so "url" will be None), otherwise it - indicates the relative faith in our results, and "url" will be the + Returns a CopyvioCheckResult object, with four useful attributes: + "violation", "confidence", "url", and "queries". "confidence" is a + number between 0 and 1; if it is less than "min_confidence", we could + not find any indication of a violation (so "violation" will be False + and "url" may or may not be None), otherwise it indicates the relative + faith in our results, "violation" will be True, and "url" will be the place the article is suspected of being copied from. "queries" is the number of queries used to determine the results. @@ -115,26 +176,19 @@ class CopyrightMixin(object): Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, SearchQueryError, ...) on errors. """ - if engine == "Yahoo! BOSS": - if not oauth: - e = "The package 'oauth2' could not be imported" - raise UnsupportedSearchEngineError(e) - querier = self._yahoo_boss_query - else: - raise UnknownSearchEngineError(engine) - + search = self._select_search_engine(engine, credentials) handled_urls = [] best_confidence = 0 best_match = None num_queries = 0 content = self.get(force) clean = self._copyvio_strip_content(content) - fragments = self._copyvio_explode_content(clean) + chunks = self._copyvio_chunk_content(clean) last_query = time() - while (fragments and best_confidence < min_confidence and + while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = querier(fragments.pop(0), credentials) + urls = search(chunks.pop(0)) urls = [url for url in urls if url not in handled_urls] for url in urls: confidence = self._copyvio_compare_content(content, url) @@ -147,4 +201,8 @@ class CopyrightMixin(object): sleep(interquery_sleep - diff) last_query = time() - return CopyvioCheckResult(best_confidence, best_match, num_queries) + if best_confidence >= min_confidence: # violation? + vi = True + else: + vi = False + return CopyvioCheckResult(vi, best_confidence, best_match, num_queries) diff --git a/earwigbot/wiki/functions.py b/earwigbot/wiki/functions.py index 16f6f3c..9e05862 100644 --- a/earwigbot/wiki/functions.py +++ b/earwigbot/wiki/functions.py @@ -37,6 +37,7 @@ from os import chmod, path import platform import stat +import earwigbot from earwigbot.config import config from earwigbot.wiki.exceptions import SiteNotFoundError from earwigbot.wiki.site import Site @@ -111,7 +112,8 @@ def _get_site_object_from_dict(name, d): maxlag = config.wiki.get("maxlag") if user_agent: - user_agent = user_agent.replace("$1", platform.python_version()) + user_agent = user_agent.replace("$1", earwigbot.__version__) + user_agent = user_agent.replace("$2", platform.python_version()) for key, value in namespaces.items(): # Convert string keys to integers del namespaces[key] diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 5b359e1..1b2c06a 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -69,6 +69,7 @@ class Page(CopyrightMixin): __init__ will not do any API queries, but it will use basic namespace logic to determine our namespace ID and if we are a talkpage. """ + super(Page, self).__init__() self._site = site self._title = title.strip() self._follow_redirects = self._keep_following = follow_redirects