# -*- coding: utf-8  -*-
#
# ben/earwigbot (mirror of https://github.com/earwig/earwigbot)
#
# Copyright (C) 2009-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import namedtuple
from gzip import GzipFile
from Queue import Empty, Queue
from socket import timeout
from StringIO import StringIO
from threading import Lock, Semaphore, Thread
from time import sleep, time
from urllib2 import build_opener, URLError

from earwigbot import exceptions, importer
from earwigbot.wiki.copyvios.markov import (
    EMPTY, EMPTY_INTERSECTION, MarkovChain, MarkovChainIntersection)
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine

# Optional third-party packages, obtained through the project's importer.
# Code elsewhere in this file guards their first use with
# ``except ImportError``, i.e. a missing package only surfaces when one of its
# attributes is actually accessed.
oauth = importer.new("oauth2")
tldextract = importer.new("tldextract")

__all__ = ["CopyvioMixIn"]

# Best match found so far during a check: the source URL, its confidence
# (a float), and a ``(source chain, intersection chain)`` pair.
_WorkingResult = namedtuple("_WorkingResult", ["url", "confidence", "chains"])

class _CopyvioWorkspace(object):
    """Manages a single copyvio check distributed across threads.

    URLs handed to :meth:`enqueue` are grouped by domain. Each domain gets its
    own :class:`_CopyvioWorker`, run in a daemon thread and fed through that
    worker's queue. Workers report every fetched source back via
    :meth:`compare`, which keeps the best match seen so far in :attr:`best`.
    """

    def __init__(self, article, min_confidence, until, logger, headers,
                 url_timeout=5, max_concurrent_requests=6):
        """Create an empty workspace; no threads are started yet.

        *article* is the chain built from the article text. *min_confidence*
        is the confidence at which the check stops early. *until* is the
        deadline handed to each worker (a timestamp comparable with
        ``time()``, or ``None`` for no deadline). *logger* is the parent
        logger; a ``"copyvios"`` child is used. *headers* and *url_timeout*
        are passed on to each worker, and *max_concurrent_requests* bounds
        how many workers may be fetching a URL at the same time.
        """
        self.best = _WorkingResult(None, 0.0, (EMPTY, EMPTY_INTERSECTION))
        self.request_semaphore = Semaphore(max_concurrent_requests)

        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._handled_urls = set()  # Set, not list: O(1) duplicate checks
        self._is_finished = False
        self._enqueue_lock = Lock()
        self._result_lock = Lock()

        self._workers = {}  # Domain key -> _CopyvioWorker
        self._threads = []  # Threads running the workers, joined in wait()
        self._worker_args = (self, until, headers, url_timeout)

    def _calculate_confidence(self, delta):
        """Return the confidence of a violation as a float between 0 and 1.

        This is the size of *delta* (the article/source intersection) divided
        by the size of the article. An empty article yields ``0.0`` rather
        than raising :exc:`ZeroDivisionError`.
        """
        article_size = self._article.size()
        if not article_size:
            return 0.0
        return float(delta.size()) / article_size

    def _finish_early(self):
        """Finish handling links prematurely (if we've hit min_confidence).

        Every worker queue is emptied and given an exit signal, and further
        calls to :meth:`enqueue` become no-ops. Calling this again once the
        workspace is finished does nothing.
        """
        with self._enqueue_lock:
            if self._is_finished:
                return
            self._logger.debug(
                "Confidence threshold met; clearing worker queues")
            for worker in self._workers.values():
                # Queue has no clear() method; empty the underlying deque
                # while holding the queue's own mutex.
                with worker.queue.mutex:
                    worker.queue.queue.clear()
                # put() acquires that same (non-reentrant) mutex, so it has to
                # be called only after the mutex has been released.
                worker.queue.put(None)
            self._is_finished = True

    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the worker queue.

        *exclude_check* is an optional exclusion function that takes a URL and
        returns ``True`` if we should skip it and ``False`` otherwise.

        URLs that were already seen are skipped. Each remaining URL goes to
        the worker for its domain; a new worker thread is started the first
        time a domain is encountered. Nothing is enqueued once the workspace
        has finished early.
        """
        for url in urls:
            with self._enqueue_lock:
                if self._is_finished:
                    break
                if url in self._handled_urls:
                    continue
                self._handled_urls.add(url)
                if exclude_check and exclude_check(url):
                    continue

                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                # Unicode template: formatting a byte-string template with a
                # non-ASCII unicode URL raises UnicodeEncodeError on Python 2.
                logmsg = u"enqueue(): {0} {1} -> {2}"
                if key in self._workers:
                    self._logger.debug(logmsg.format("PUT", key, url))
                    self._workers[key].queue.put(url)
                else:
                    self._logger.debug(logmsg.format("NEW", key, url))
                    worker = _CopyvioWorker(*self._worker_args)
                    worker.queue.put(url)
                    thread = Thread(target=worker.run)
                    thread.name = "cvworker-" + key.encode("utf8")
                    thread.daemon = True
                    thread.start()
                    self._workers[key] = worker
                    self._threads.append(thread)

    def wait(self):
        """Wait for the workers to finish handling the queue.

        An exit signal is appended to every worker queue, then each worker
        thread is joined.
        """
        self._logger.debug("Waiting on {0} workers".format(len(self._workers)))
        for worker in self._workers.values():
            worker.queue.put(None)  # Exit signal to workers
        # Join the threads themselves; the worker objects have no join().
        for thread in self._threads:
            thread.join()

    def compare(self, url, source):
        """Compare a source to the article, and update the working result.

        *source* is the chain built from the text fetched from *url*. If the
        resulting confidence beats the current best, :attr:`best` is replaced;
        if it also reaches the minimum confidence, the check finishes early.
        """
        delta = MarkovChainIntersection(self._article, source)
        confidence = self._calculate_confidence(delta)
        self._logger.debug(u"compare(): {0} -> {1}".format(url, confidence))
        with self._result_lock:
            if confidence > self.best.confidence:
                self.best = _WorkingResult(url, confidence, (source, delta))
                if confidence >= self._min_confidence:
                    self._finish_early()


class _CopyvioWorker(object):
    """A multithreaded URL opener/parser instance.

    A worker owns a queue of URLs. Its :meth:`run` method, meant to be the
    target of a thread, fetches each queued URL and hands the resulting text
    to the workspace for comparison against the article.
    """

    def __init__(self, workspace, until, headers, url_timeout):
        """Create a worker with an empty queue.

        *workspace* receives the comparison results and supplies the
        semaphore limiting concurrent requests. *until* is the deadline (a
        timestamp comparable with ``time()``) after which the worker stops,
        or a false value for no deadline. *headers* are installed on the URL
        opener, and *url_timeout* is the timeout given to each request.
        """
        self.queue = Queue()

        self._workspace = workspace
        self._until = until
        self._opener = build_opener()
        self._opener.addheaders = headers
        self._url_timeout = url_timeout

    def _open_url(self, url):
        """Open a URL and return its parsed content, or None.

        First, we will decompress the content if the headers contain "gzip" as
        its content encoding. Then, we will return the content stripped using
        an HTML parser if the headers indicate it is HTML, or return the
        content directly if it is plain text. If we don't understand the
        content type, we'll return None.

        Header values are compared case-insensitively and ignoring surrounding
        whitespace.

        If a URLError or socket timeout was raised while opening or reading
        the URL, or an IOError was raised while decompressing, None will be
        returned. The response is always closed once it has been read.
        """
        with self._workspace.request_semaphore:
            try:
                response = self._opener.open(url, timeout=self._url_timeout)
            except (URLError, timeout):
                return None
            try:
                result = response.read()
            except (URLError, timeout):
                return None
            finally:
                # Release the connection whether or not the read succeeded.
                response.close()

        encoding = response.headers.get("Content-Encoding") or ""
        if encoding.strip().lower() == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            try:
                result = gzipper.read()
            except IOError:
                return None

        ctype_full = response.headers.get("Content-Type", "text/plain")
        # Drop any parameters (e.g. "; charset=utf-8") and normalize, so that
        # "Text/HTML" or "text/html ;charset=..." is still recognized.
        ctype = ctype_full.split(";", 1)[0].strip().lower()
        if ctype in ["text/html", "application/xhtml+xml"]:
            return HTMLTextParser(result).strip()
        elif ctype == "text/plain":
            return result.strip()
        else:
            return None

    def run(self):
        """Main entry point for the worker.

        We will keep fetching URLs from the queue and handling them until
        either we run out of time, or we get an exit signal that the queue is
        now empty.
        """
        while True:
            if self._until:
                max_time = self._until - time()
                if max_time <= 0:
                    return
                try:
                    # Never block on the queue past the deadline.
                    url = self.queue.get(timeout=max_time)
                except Empty:
                    return
            else:
                url = self.queue.get()
            if url is None:  # Exit signal
                return
            text = self._open_url(url.encode("utf8"))
            if text:
                self._workspace.compare(url, MarkovChain(text))


class CopyvioMixIn(object):
    """
    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**

    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
    and :py:meth:`copyvio_compare`. The former checks the page for copyright
    violations using a search engine API, and the latter compares the page
    against a given URL. Credentials for the search engine API are stored in
    the :py:class:`~earwigbot.wiki.site.Site`'s config.
    """

    def __init__(self, site):
        """Copy the search config and request headers from *site*."""
        self._search_config = site._search_config
        self._exclusions_db = self._search_config.get("exclusions_db")
        self._addheaders = site._opener.addheaders

    def _get_search_engine(self):
        """Return a function that can be called to do web searches.

        The function takes one argument, a search query, and returns a list of
        URLs, ranked by importance. The underlying logic depends on the
        *engine* argument within our config; for example, if *engine* is
        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

        Raises UnknownSearchEngineError if the 'engine' listed in our config is
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine = self._search_config["engine"]
        credentials = self._search_config["credentials"]

        if engine == "Yahoo! BOSS":
            try:
                oauth.__version__  # Force-load the lazy module
            except ImportError:
                e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
                raise exceptions.UnsupportedSearchEngineError(e)
            opener = build_opener()
            opener.addheaders = self._addheaders
            return YahooBOSSSearchEngine(credentials, opener)

        raise exceptions.UnknownSearchEngineError(engine)

    def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1):
        """Check the page for copyright violations.

        Returns a :class:`.CopyvioCheckResult` object with information on the
        results of the check.

        *min_confidence* is the minimum amount of confidence we must have in
        the similarity between a source text and the article in order for us to
        consider it a suspected violation. This is a number between 0 and 1.

        *max_queries* is self-explanatory; we will never make more than this
        number of queries in a given check.

        *max_time* can be set to prevent copyvio checks from taking longer than
        a set amount of time (generally around a minute), which can be useful
        if checks are called through a web server with timeouts. We will stop
        checking new URLs as soon as this limit is reached. A value of zero or
        less means no limit.

        Raises :exc:`.CopyvioCheckError` or subclasses
        (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
        errors.
        """
        # Unicode template, matching the query log below: a byte-string
        # template fails on Python 2 when formatted with a non-ASCII title.
        log = u"Starting copyvio check for [[{0}]]"
        self._logger.info(log.format(self.title))
        start_time = time()
        until = (start_time + max_time) if max_time > 0 else None
        searcher = self._get_search_engine()
        parser = ArticleTextParser(self.get())
        article = MarkovChain(parser.strip())
        workspace = _CopyvioWorkspace(article, min_confidence, until,
                                      self._logger, self._addheaders)
        if self._exclusions_db:
            self._exclusions_db.sync(self.site.name)
            exclude = lambda u: self._exclusions_db.check(self.site.name, u)
        else:
            exclude = None

        if article.size() < 20:  # Auto-fail very small articles
            result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
                                        workspace.best.chains)
            self._logger.info(result.get_log_message(self.title))
            return result

        # Links already present in the article are checked first, before any
        # search engine query is made.
        workspace.enqueue(parser.get_links(), exclude)
        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
        num_queries = 0
        for chunk in chunks:
            if workspace.best.confidence >= min_confidence:
                break
            log = u"[[{0}]] -> querying {1} for {2!r}"
            self._logger.debug(log.format(self.title, searcher.name, chunk))
            workspace.enqueue(searcher.search(chunk), exclude)
            num_queries += 1
            sleep(1)

        workspace.wait()
        result = CopyvioCheckResult(
            workspace.best.confidence >= min_confidence,
            workspace.best.confidence, workspace.best.url, num_queries,
            time() - start_time, article, workspace.best.chains)
        self._logger.info(result.get_log_message(self.title))
        return result

    def copyvio_compare(self, url, min_confidence=0.5, max_time=30):
        """Check the page like :py:meth:`copyvio_check` against a specific URL.

        This is essentially a reduced version of :meth:`copyvio_check` - a
        copyvio comparison is made using Markov chains and the result is
        returned in a :class:`.CopyvioCheckResult` object - but without using a
        search engine, since the suspected "violated" URL is supplied from the
        start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Tool Labs site. After a
        search is done, the resulting URL is stored in a cache for 72 hours so
        future checks against that page will not require another set of
        time-and-money-consuming search engine queries. However, the comparison
        itself (which includes the article's and the source's content) cannot
        be stored for data retention reasons, so a fresh comparison is made
        using this function.

        *max_time* limits how long the comparison may take and, when positive,
        is also used as the timeout for fetching the URL. A value of zero or
        less means no overall limit; the workspace's default URL timeout is
        used in that case.

        Since no searching is done, neither :exc:`.UnknownSearchEngineError`
        nor :exc:`.SearchQueryError` will be raised.
        """
        log = u"Starting copyvio compare for [[{0}]] against {1}"
        self._logger.info(log.format(self.title, url))
        start_time = time()
        until = (start_time + max_time) if max_time > 0 else None
        article = MarkovChain(ArticleTextParser(self.get()).strip())
        if max_time > 0:
            workspace = _CopyvioWorkspace(article, min_confidence, until,
                                          self._logger, self._addheaders,
                                          max_time)
        else:
            # A non-positive max_time is not a usable socket timeout, so let
            # the workspace fall back on its default URL timeout.
            workspace = _CopyvioWorkspace(article, min_confidence, until,
                                          self._logger, self._addheaders)
        workspace.enqueue([url])
        workspace.wait()
        url, conf, chains = workspace.best
        result = CopyvioCheckResult(conf >= min_confidence, conf, url, 0,
                                    time() - start_time, article, chains)
        self._logger.info(result.get_log_message(self.title))
        return result