Some more work on copyvio detection code

Also removed the hardcoded version in user-agent strings.
13 years ago · 24f7eabb77
--- a/earwigbot/commands/ctcp.py
+++ b/earwigbot/commands/ctcp.py
@@ -23,6 +23,7 @@
 import platform
 import time
 import earwigbot
 from earwigbot.classes import BaseCommand
 from earwigbot.config import config
@@ -61,7 +62,8 @@ class Command(BaseCommand):
            self.connection.notice(target, "\x01TIME {0}\x01".format(ts))
        elif command == "VERSION":
            default = "EarwigBot - 0.1-dev - Python/$1 https://github.com/earwig/earwigbot"
            default = "EarwigBot - $1 - Python/$2 https://github.com/earwig/earwigbot"
            vers = config.irc.get("version", default)
            vers = vers.replace("$1", platform.python_version())
            vers = vers.replace("$1", earwigbot.__version__)
            vers = vers.replace("$2", platform.python_version())
            self.connection.notice(target, "\x01VERSION {0}\x01".format(vers))
--- a/earwigbot/tasks/afc_copyvios.py
+++ b/earwigbot/tasks/afc_copyvios.py
@@ -89,22 +89,25 @@ class Task(BaseTask):
            return
        self.logger.info("Checking [[{0}]]".format(title))
        content = page.get() 
        result = page.copyvio_check(self.engine, self.credentials,
                                    self.min_confidence, self.max_queries)
        if result.url:
            url = result.url
        url = result.url
        confidence = "{0}%".format(round(result.confidence * 100, 2))
        if result.violation:
            content = page.get()
            template = "\{\{{0}|url={1}\}\}".format(self.template, url)
            template = "\{\{{0}|url={1}|confidence={2}\}\}"
            template = template.format(self.template, url, confidence)
            newtext = "\n".join((template, content))
            if "{url}" in self.summary:
                page.edit(newtext, self.summary.format(url=url))
            else:
                page.edit(newtext, self.summary)
            msg = "Found violation: [[{0}]] -> {1}"
            self.logger.warn(msg.format(title, url))
            msg = "Found violation: [[{0}]] -> {1} ({2} confidence)"
            self.logger.warn(msg.format(title, url, confidence))
        else:
            self.logger.debug("No violations detected")
            msg = "No violations detected (best: {1} at {2} confidence)"
            self.logger.debug(msg.format(url, confidence))
        self.log_processed(pageid)
--- a/earwigbot/wiki/constants.py
+++ b/earwigbot/wiki/constants.py
@@ -31,8 +31,9 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const
 """
 # Default User Agent when making API queries. The version-aware form embeds
 # the earwigbot package version (_v) followed by the running interpreter's
 # version as returned by platform.python_version() (_p).
 from earwigbot import __version__ as _v
 from platform import python_version as _p
 # NOTE(review): the assignment on the next line is immediately overwritten by
 # the one after it, so only the version-aware string takes effect -- confirm
 # whether the hardcoded "0.1-dev" line is meant to remain.
 USER_AGENT = "EarwigBot/0.1-dev (Python/{0}; https://github.com/earwig/earwigbot)".format(_p())
 USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p())
 # Default namespace IDs:
 NS_MAIN = 0
--- a/earwigbot/wiki/copyright.py
+++ b/earwigbot/wiki/copyright.py
@@ -20,9 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 from functools import partial
 from gzip import GzipFile
 from json import loads
 from StringIO import StringIO
 from time import sleep, time
 from urllib import quote_plus, urlencode
 from urllib2 import build_opener, URLError
 try:
    import oauth2 as oauth
@@ -32,14 +36,15 @@ except ImportError:
 from earwigbot.wiki.exceptions import *
 class CopyvioCheckResult(object):
    """Hold the outcome of a copyright violation check.

    The four constructor arguments are stored unchanged as attributes:
    "violation" (whether the check concluded the page is a violation),
    "confidence" (the relative faith in that result), "url" (the place the
    page is suspected of being copied from, if any), and "queries" (the
    number of search queries used to reach the result).
    """

    def __init__(self, violation, confidence, url, queries):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries

    def __repr__(self):
        """Return a constructor-style representation of the result."""
        # Every field uses the "!r" conversion. The previous "{3|r}" was not
        # a valid replacement field and made str.format() fail.
        r = ("CopyvioCheckResult(violation={0!r}, confidence={1!r}, "
             "url={2!r}, queries={3!r})")
        return r.format(self.violation, self.confidence, self.url,
                        self.queries)
 class CopyrightMixin(object):
@@ -50,7 +55,57 @@ class CopyrightMixin(object):
    checks the page for copyright violations using a search engine API. The
    API keys must be provided to the method as arguments.
    """
    def _yahoo_boss_query(self, query, cred):
    def __init__(self):
        """Create the URL opener used to download pages for copyvio checks.

        The opener comes from build_opener() and is given the same
        "addheaders" object as the opener belonging to self._site.
        """
        self._opener = build_opener()
        # NOTE(review): this reads self._site, so that attribute must already
        # be set when __init__ runs. Page.__init__ appears to call
        # super().__init__() before assigning self._site -- confirm the order.
        # The headers object is shared with the site's opener, not copied.
        self._opener.addheaders = self._site._opener.addheaders
    def _open_url_ignoring_errors(self, url):
        """Open a URL using self._opener and return its content, or None.
        Will decompress the content if the headers contain "gzip" as its
        content encoding, and will return None if URLError is raised while
        opening the URL. IOErrors while gunzipping a compressed response are
        ignored, and the original content is returned.
        """
        try:
            response = self._opener.open(url)
        except URLError:
            return None
        result = response.read()
        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            try:
                result = gzipper.read()
            except IOError:
                pass
        return result
    def _select_search_engine(self, engine, credentials):
        """Build the callable used to run web searches for a copyvio check.

        The result is a functools.partial that wraps the query method for
        'engine' with 'credentials' already bound as its first argument, so
        it only needs to be called with the query itself. Currently the only
        recognized engine is "Yahoo! BOSS", which is served by
        self._yahoo_boss_query.

        Raises UnknownSearchEngineError if 'engine' is not recognized, and
        UnsupportedSearchEngineError if "Yahoo! BOSS" is requested but the
        'oauth2' package is unavailable.
        """
        if engine != "Yahoo! BOSS":
            raise UnknownSearchEngineError(engine)
        if not oauth:
            raise UnsupportedSearchEngineError(
                "The package 'oauth2' could not be imported")
        return partial(self._yahoo_boss_query, credentials)
    def _yahoo_boss_query(self, cred, query):
        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.
        Returns a list of URLs, no more than fifty, ranked by relevance (as
@@ -84,21 +139,27 @@ class CopyrightMixin(object):
    def _copyvio_strip_content(self, content):
        """Return 'content' unchanged.

        NOTE(review): presumably a placeholder for cleaning up the page text
        before it is searched for -- confirm the intended behavior.
        """
        return content
    def _copyvio_explode_content(self, content):
        """Return 'content' unchanged.

        NOTE(review): presumably a placeholder for splitting the text into
        search fragments -- confirm whether this method is still used.
        """
        return content
    def _copyvio_chunk_content(self, content):
        """Return a one-item list holding 'content'.

        NOTE(review): presumably a placeholder for breaking the text into
        several searchable chunks -- confirm the intended behavior.
        """
        return [content]
    def _copyvio_compare_content(self, content, url):
        """Compare 'content' against the page at 'url'; return a confidence.

        Every return statement in this method currently yields 0.
        """
        # NOTE(review): as written, this first return runs unconditionally, so
        # the download below is never reached. It looks like a leftover from
        # an earlier stub -- confirm whether it should be removed.
        return 0
        html = self._open_url_ignoring_errors(url)
        if not html:
            return 0  # Nothing usable was downloaded (None or empty content)
        confidence = 0
        return confidence
    def copyvio_check(self, engine, credentials, min_confidence=0.5,
    def copyvio_check(self, engine, credentials, min_confidence=0.75,
                      max_queries=-1, interquery_sleep=1, force=False):
        """Check the page for copyright violations.
        Returns a CopyvioCheckResult object, with three useful attributes:
        "confidence", "url", and "queries". "confidence" is a number between
        0 and 1; if it is less than min_confidence, we could not find any
        indication of a violation (so "url" will be None), otherwise it
        indicates the relative faith in our results, and "url" will be the
        Returns a CopyvioCheckResult object, with four useful attributes:
        "violation", "confidence", "url", and "queries". "confidence" is a
        number between 0 and 1; if it is less than "min_confidence", we could
        not find any indication of a violation (so "violation" will be False
        and "url" may or may not be None), otherwise it indicates the relative
        faith in our results, "violation" will be True, and "url" will be the
        place the article is suspected of being copied from. "queries" is the
        number of queries used to determine the results.
@@ -115,26 +176,19 @@ class CopyrightMixin(object):
        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
        SearchQueryError, ...) on errors.
        """
        if engine == "Yahoo! BOSS":
            if not oauth:
                e = "The package 'oauth2' could not be imported"
                raise UnsupportedSearchEngineError(e)
            querier = self._yahoo_boss_query
        else:
            raise UnknownSearchEngineError(engine)
        search = self._select_search_engine(engine, credentials)
        handled_urls = []
        best_confidence = 0
        best_match = None
        num_queries = 0
        content = self.get(force)
        clean = self._copyvio_strip_content(content)
        fragments = self._copyvio_explode_content(clean)
        chunks = self._copyvio_chunk_content(clean)
        last_query = time()
        while (fragments and best_confidence < min_confidence and
        while (chunks and best_confidence < min_confidence and
               (max_queries < 0 or num_queries < max_queries)):
            urls = querier(fragments.pop(0), credentials)
            urls = search(chunks.pop(0))
            urls = [url for url in urls if url not in handled_urls]
            for url in urls:
                confidence = self._copyvio_compare_content(content, url)
@@ -147,4 +201,8 @@ class CopyrightMixin(object):
                sleep(interquery_sleep - diff)
            last_query = time()
        return CopyvioCheckResult(best_confidence, best_match, num_queries)
        if best_confidence >= min_confidence:  # violation?
            vi = True
        else:
            vi = False
        return CopyvioCheckResult(vi, best_confidence, best_match, num_queries)
--- a/earwigbot/wiki/functions.py
+++ b/earwigbot/wiki/functions.py
@@ -37,6 +37,7 @@ from os import chmod, path
 import platform
 import stat
 import earwigbot
 from earwigbot.config import config
 from earwigbot.wiki.exceptions import SiteNotFoundError
 from earwigbot.wiki.site import Site
@@ -111,7 +112,8 @@ def _get_site_object_from_dict(name, d):
    maxlag = config.wiki.get("maxlag")
    if user_agent:
        user_agent = user_agent.replace("$1", platform.python_version())
        user_agent = user_agent.replace("$1", earwigbot.__version__)
        user_agent = user_agent.replace("$2", platform.python_version())
    for key, value in namespaces.items():  # Convert string keys to integers
        del namespaces[key]
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -69,6 +69,7 @@ class Page(CopyrightMixin):
        __init__ will not do any API queries, but it will use basic namespace
        logic to determine our namespace ID and if we are a talkpage.
        """
        super(Page, self).__init__()
        self._site = site
        self._title = title.strip()
        self._follow_redirects = self._keep_following = follow_redirects