# ben/copyvios
# Mirror of https://github.com/earwig/copyvios


			
				
					
						
						
							
							# -*- coding: utf-8  -*-

from datetime import datetime, timedelta
from hashlib import sha256
from logging import getLogger
import re
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
from .sites import get_site
from .turnitin import search_turnitin

# Names exported by ``from <this module> import *``.
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

# Confidence thresholds. T_SUSPECT is the ``min_confidence`` passed to every
# copyvio check/comparison in this module. T_POSSIBLE is not referenced here;
# it is exported for use elsewhere (presumably a lower "possible violation"
# cutoff -- confirm against the callers).
T_POSSIBLE = 0.4
T_SUSPECT = 0.75

# Logger used to report failures when reading from or writing to the cache.
_LOGGER = getLogger("copyvios.checker")

def _coerce_bool(val):
    return val and val not in ("0", "false")

def do_check(query=None):
    """Normalise *query*, and run the copyvio check if it is complete.

    When *query* is falsy a fresh ``Query()`` is created. The ``lang``,
    ``project`` and ``oldid`` fields are cleaned up in place:

    - ``lang`` is stripped and lower-cased; the cleaned value is also kept in
      ``orig_lang``. A value of the form ``"code::name"`` is split into
      ``lang`` and ``name``.
    - ``project`` is stripped and lower-cased.
    - ``oldid`` is stripped and loses any leading zeros.

    ``query.submitted`` is truthy only when a project, a language, and either
    a title or an oldid were given. In that case the site is looked up and,
    if one is found, the results are gathered onto *query*.

    Returns the (possibly newly created) query object.
    """
    query = query or Query()

    if query.lang:
        normalized = query.lang.strip().lower()
        query.lang = query.orig_lang = normalized
        if "::" in normalized:
            query.lang, query.name = normalized.split("::", 1)
    if query.project:
        query.project = query.project.strip().lower()
    if query.oldid:
        query.oldid = query.oldid.strip().lstrip("0")

    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if not query.submitted:
        return query

    query.site = get_site(query)
    if query.site:
        # The "noredirect" flag turns off redirect following.
        _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query

def _get_results(query, follow=True):
    if query.oldid:
        if not re.match(r"^\d+$", query.oldid):
            query.error = "bad oldid"
            return
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return
        if page.is_redirect and follow:
            try:
                query.title = page.get_redirect_target()
            except exceptions.RedirectError:
                pass  # Something's wrong. Continue checking the original page.
            else:
                query.redirected_from = page
                _get_results(query, follow=False)
                return

    if not query.action:
        query.action = "compare" if query.url else "search"
    if query.action == "search":
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 1 if query.turnitin in ("1", "true") else 0
        if not use_engine and not use_links and not use_turnitin:
            query.error = "no search method"
            return

        # Handle the turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(page.title, query.lang)

        # Handle the copyvio check
        _perform_check(query, page, use_engine, use_links)
    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False
    else:
        query.error = "bad action"

def _get_page_by_revid(site, revid):
    """Return a page object loaded with the content of revision *revid*.

    Queries the API of *site* for the revision's page info, content and
    timestamp. Returns ``None`` if the API call fails or if the response
    lacks the page title, the main-slot content, or the revision timestamp.
    """
    try:
        res = site.api_query(
            action="query", prop="info|revisions", revids=revid,
            rvprop="content|timestamp", inprop="protection|url",
            rvslots="main")
        page_data = list(res["query"]["pages"].values())[0]
        title = page_data["title"]
        revision = page_data["revisions"][0]
        # These lookups are only existence checks; the values are unused.
        revision["slots"]["main"]["*"]
        revision["timestamp"]
    except (exceptions.APIError, KeyError, IndexError):
        return None

    # EarwigBot has no notion of old revisions, so feed the raw API response
    # straight into the page's private loaders (a somewhat dirty hack).
    page = site.get_page(title)
    page._load_attributes(res)
    page._load_content(res)
    return page

def _perform_check(query, page, use_engine, use_links):
    """Run a copyvio search for *page*, trying the result cache first.

    *use_engine* and *use_links* select the search methods and are also
    folded into the cache key via the ``mode`` string, so results obtained
    with different methods are cached separately.

    Unless ``query.nocache`` is set, a cached result is looked up first. If
    none is available, a fresh check is run and the outcome is written back
    to the cache. Database errors on either side are logged and otherwise
    ignored, so a broken cache never prevents a check.

    The result is stored in ``query.result``. On a search failure,
    ``query.error`` is set to "search error" and the exception is stored in
    ``query.exception``.
    """
    conn = get_db()
    sql_error = get_sql_error()
    mode = "{0}:{1}:".format(use_engine, use_links)
    # Interpret the flag once so that the cache lookup and the live check
    # agree: "0" and "false" must NOT count as a request to disable skipping.
    noskip = _coerce_bool(query.noskip)

    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(page, conn, mode, noskip)
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")

    if not query.result:
        try:
            query.result = page.copyvio_check(
                min_confidence=T_SUSPECT, max_queries=8, max_time=45,
                no_searches=not use_engine, no_links=not use_links,
                short_circuit=not noskip)
        except exceptions.SearchQueryError as exc:
            query.error = "search error"
            query.exception = exc
            return
        query.result.cached = False
        try:
            _cache_result(page, query.result, conn, mode)
        except sql_error:
            _LOGGER.exception("Failed to cache results")

def _get_cached_results(page, conn, mode, noskip):
    """Return a cached check result for *page*, or ``None`` if unusable.

    The cache key is the SHA-256 digest of *mode* followed by the page text.
    ``None`` is returned when:

    - there is no cache entry for that key;
    - the entry is flagged as a possible miss and *noskip* is set;
    - the entry is more than three days old;
    - the first stored source is marked as skipped;
    - re-comparing the page against the first stored source gives a
      confidence that differs from the stored one by 0.0001 or more;
    - *noskip* is set and any remaining stored source was skipped.

    Otherwise the returned result carries the stored sources and statistics,
    and is marked with ``cached``, ``cache_time`` and ``cache_age``.
    """
    entry_sql = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    sources_sql = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    digest = sha256(mode + page.get().encode("utf8")).digest()
    cache_id = buffer(digest)

    cursor = conn.cursor()
    cursor.execute(entry_sql, (cache_id,))
    entries = cursor.fetchall()
    if not entries:
        return None
    cache_time, queries, check_time, possible_miss = entries[0]
    if noskip and possible_miss:
        return None
    # The timestamp is either already a datetime or a numeric timestamp.
    if not isinstance(cache_time, datetime):
        cache_time = datetime.utcfromtimestamp(cache_time)
    if datetime.utcnow() - cache_time > timedelta(days=3):
        return None
    cursor.execute(sources_sql, (cache_id,))
    rows = cursor.fetchall()

    if not rows:  # TODO: do something less hacky for this edge case
        # No sources were stored: rebuild an empty result from the article.
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        empty = CopyvioCheckResult(False, [], queries, check_time,
                                   article_chain, possible_miss)
        return _mark_as_cached(empty, cache_time)

    first_url, first_confidence, first_skipped, _ = rows.pop(0)
    if first_skipped:  # Should be impossible: data must be bad; run a new check
        return None
    # Re-run the comparison against the first stored source and make sure it
    # still yields the stored confidence; otherwise the entry is rejected.
    result = page.copyvio_compare(first_url, min_confidence=T_SUSPECT,
                                  max_time=30)
    if abs(result.confidence - first_confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in rows:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)
    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    return _mark_as_cached(result, cache_time)


def _mark_as_cached(result, cache_time):
    """Stamp *result* with cache metadata for *cache_time* and return it."""
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result

def _format_date(cache_time):
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")

def _cache_result(page, result, conn, mode):
    """Write *result* to the cache tables under the key for *page* and *mode*.

    The cache key is the SHA-256 digest of *mode* followed by the page text.
    Using one cursor, this deletes any existing entry for the key, deletes
    entries older than three days, inserts the new entry's statistics, and
    inserts one ``cache_data`` row per source (URLs truncated to 1024
    characters).

    NOTE(review): ``cache_time`` is not supplied on insert, so the column
    presumably has a default of the current time -- confirm against the
    schema. Commit behaviour depends on ``get_cursor``, which is defined
    elsewhere.
    """
    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
                         sqlite="STRFTIME('%s', 'now', '-3 days')")
    delete_entry = "DELETE FROM cache WHERE cache_id = ?"
    purge_expired = "DELETE FROM cache WHERE cache_time < %s" % expiry
    insert_entry = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                   cache_possible_miss) VALUES (?, ?, ?, ?)"""
    insert_source = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                        cdata_confidence, cdata_skipped,
                                        cdata_excluded) VALUES (?, ?, ?, ?, ?)"""

    digest = sha256(mode + page.get().encode("utf8")).digest()
    cache_id = buffer(digest)

    source_rows = []
    for src in result.sources:
        source_rows.append((cache_id, src.url[:1024], src.confidence,
                            src.skipped, src.excluded))
    entry_row = (cache_id, result.queries, result.time, result.possible_miss)

    with get_cursor(conn) as cursor:
        cursor.execute(delete_entry, (cache_id,))
        cursor.execute(purge_expired)
        cursor.execute(insert_entry, entry_row)
        cursor.executemany(insert_source, source_rows)