- # -*- coding: utf-8 -*-
- from datetime import datetime, timedelta
- from hashlib import sha256
- from logging import getLogger
- import re
- from urlparse import urlparse
- from earwigbot import exceptions
- from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
- from earwigbot.wiki.copyvios.parsers import ArticleTextParser
- from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
- from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
- from .sites import get_site
- from .turnitin import search_turnitin
# Names exported by ``from <module> import *``.
__all__ = [
    "do_check",
    "T_POSSIBLE",
    "T_SUSPECT",
]

# Confidence thresholds. T_SUSPECT is the ``min_confidence`` handed to the
# copyvio check/compare calls in this module; T_POSSIBLE is only exported here.
T_POSSIBLE = 0.4
T_SUSPECT = 0.75

# Module logger; used to report failures reading from or writing to the cache.
_LOGGER = getLogger("copyvios.checker")
- def _coerce_bool(val):
- return val and val not in ("0", "false")
def do_check(query=None):
    """Normalize *query*, resolve its site, and run the requested check.

    A fresh ``Query()`` is created when *query* is falsy. ``lang`` and
    ``project`` are stripped and lowercased; ``oldid`` is stripped and loses
    its leading zeros. A ``lang`` of the form ``"xx::name"`` is split into
    ``lang`` and ``name``, while ``orig_lang`` keeps the unsplit value.

    The query counts as submitted only when it has a project, a language and
    either a title or an oldid. Only then is the site looked up and, if one
    is found, the results are computed. The (mutated) query is returned in
    every case.
    """
    if not query:
        query = Query()

    if query.lang:
        normalized = query.lang.strip().lower()
        query.lang = normalized
        query.orig_lang = normalized
        if "::" in normalized:
            lang_part, _, name_part = normalized.partition("::")
            query.lang = lang_part
            query.name = name_part

    if query.project:
        query.project = query.project.strip().lower()

    if query.oldid:
        query.oldid = query.oldid.strip().lstrip("0")

    query.submitted = (
        query.project and query.lang and (query.title or query.oldid))
    if not query.submitted:
        return query

    query.site = get_site(query)
    if query.site:
        follow_redirects = not _coerce_bool(query.noredirect)
        _get_results(query, follow=follow_redirects)
    return query
- def _get_results(query, follow=True):
- if query.oldid:
- if not re.match(r"^\d+$", query.oldid):
- query.error = "bad oldid"
- return
- page = query.page = _get_page_by_revid(query.site, query.oldid)
- if not page:
- return
- else:
- page = query.page = query.site.get_page(query.title)
- try:
- page.get() # Make sure that the page exists before we check it!
- except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
- return
- if page.is_redirect and follow:
- try:
- query.title = page.get_redirect_target()
- except exceptions.RedirectError:
- pass # Something's wrong. Continue checking the original page.
- else:
- query.redirected_from = page
- _get_results(query, follow=False)
- return
- if not query.action:
- query.action = "compare" if query.url else "search"
- if query.action == "search":
- use_engine = 0 if query.use_engine in ("0", "false") else 1
- use_links = 0 if query.use_links in ("0", "false") else 1
- use_turnitin = 1 if query.turnitin in ("1", "true") else 0
- if not use_engine and not use_links and not use_turnitin:
- query.error = "no search method"
- return
- # Handle the turnitin check
- if use_turnitin:
- query.turnitin_result = search_turnitin(page.title, query.lang)
- # Handle the copyvio check
- _perform_check(query, page, use_engine, use_links)
- elif query.action == "compare":
- if not query.url:
- query.error = "no URL"
- return
- scheme = urlparse(query.url).scheme
- if not scheme and query.url[0] not in ":/":
- query.url = "http://" + query.url
- elif scheme not in ["http", "https"]:
- query.error = "bad URI"
- return
- result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
- max_time=30)
- if result.best.chains[0] is EMPTY:
- query.error = "timeout" if result.time > 30 else "no data"
- return
- query.result = result
- query.result.cached = False
- else:
- query.error = "bad action"
- def _get_page_by_revid(site, revid):
- try:
- res = site.api_query(action="query", prop="info|revisions", revids=revid,
- rvprop="content|timestamp", inprop="protection|url",
- rvslots="main")
- page_data = res["query"]["pages"].values()[0]
- title = page_data["title"]
- # Only need to check that these exist:
- revision = page_data["revisions"][0]
- revision["slots"]["main"]["*"]
- revision["timestamp"]
- except (exceptions.APIError, KeyError, IndexError):
- return None
- page = site.get_page(title)
- # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
- # dirty hack to make this work:
- page._load_attributes(res)
- page._load_content(res)
- return page
def _perform_check(query, page, use_engine, use_links):
    """Fill ``query.result`` from the cache or from a new copyvio check.

    *use_engine* and *use_links* select the search methods; together they
    form the "mode" prefix of the cache key. Unless ``query.nocache`` is set,
    the cache is consulted first. When no usable cached result exists, a new
    check is run and its result is written back to the cache.

    Database errors while reading or writing the cache are logged and
    otherwise ignored. A search failure sets ``query.error`` to
    ``"search error"`` and stores the exception in ``query.exception``.
    """
    conn = get_db()
    sql_error = get_sql_error()
    mode = "{0}:{1}:".format(use_engine, use_links)
    # Interpret the flag once so the cache lookup and the new check agree:
    # a raw "0" or "false" string is truthy and would otherwise disable
    # short-circuiting even though the cache treats it as "off".
    noskip = _coerce_bool(query.noskip)

    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(page, conn, mode, noskip)
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")

    if query.result:
        return

    try:
        query.result = page.copyvio_check(
            min_confidence=T_SUSPECT, max_queries=8, max_time=45,
            no_searches=not use_engine, no_links=not use_links,
            short_circuit=not noskip)
    except exceptions.SearchQueryError as exc:
        query.error = "search error"
        query.exception = exc
        return
    query.result.cached = False
    try:
        _cache_result(page, query.result, conn, mode)
    except sql_error:
        _LOGGER.exception("Failed to cache results")
def _get_cached_results(page, conn, mode, noskip):
    """Return a cached check result for *page*, or ``None`` if none is usable.

    The cache key is the SHA-256 digest of *mode* plus the UTF-8 encoded page
    text. ``None`` is returned when:

    * there is no cache entry for that key;
    * the entry is flagged as a possible miss and *noskip* is set;
    * the entry is more than three days old;
    * the first cached source is marked as skipped, or comparing the page
      against it again gives a confidence differing by 0.0001 or more;
    * *noskip* is set and any remaining cached source is marked as skipped.

    A returned result has ``cached``, ``cache_time`` and ``cache_age`` set.
    """
    select_entry = """SELECT cache_time, cache_queries, cache_process_time,
                             cache_possible_miss
                      FROM cache
                      WHERE cache_id = ?"""
    select_sources = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                        FROM cache_data
                        WHERE cdata_cache_id = ?"""
    digest = sha256(mode + page.get().encode("utf8")).digest()
    cache_id = buffer(digest)

    def mark_cached(res, when):
        # Stamp the cache metadata shared by both successful return paths.
        res.cached = True
        res.cache_time = when.strftime("%b %d, %Y %H:%M:%S UTC")
        res.cache_age = _format_date(when)
        return res

    cursor = conn.cursor()
    cursor.execute(select_entry, (cache_id,))
    entries = cursor.fetchall()
    if not entries:
        return None

    cache_time, queries, check_time, possible_miss = entries[0]
    if possible_miss and noskip:
        return None
    if not isinstance(cache_time, datetime):
        # Stored as a numeric timestamp rather than a datetime.
        cache_time = datetime.utcfromtimestamp(cache_time)
    if datetime.utcnow() - cache_time > timedelta(days=3):
        return None

    cursor.execute(select_sources, (cache_id,))
    rows = cursor.fetchall()

    if not rows:  # TODO: do something less hacky for this edge case
        # No cached sources: build an empty result around the article's chain.
        chain = MarkovChain(ArticleTextParser(page.get()).strip())
        empty = CopyvioCheckResult(
            False, [], queries, check_time, chain, possible_miss)
        return mark_cached(empty, cache_time)

    first_url, first_confidence, first_skipped, _ = rows[0]
    if first_skipped:
        # Should be impossible: data must be bad; run a new check.
        return None
    result = page.copyvio_compare(
        first_url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - first_confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in rows[1:]:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    return mark_cached(result, cache_time)
- def _format_date(cache_time):
- formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
- diff = datetime.utcnow() - cache_time
- total_seconds = diff.days * 86400 + diff.seconds
- if total_seconds > 3600:
- return formatter(total_seconds / 3600, "hour")
- if total_seconds > 60:
- return formatter(total_seconds / 60, "minute")
- return formatter(total_seconds, "second")
def _cache_result(page, result, conn, mode):
    """Store *result* in the cache under the key for *page* and *mode*.

    The key is the SHA-256 digest of *mode* plus the UTF-8 encoded page text.
    Within a single ``get_cursor`` block this removes any existing ``cache``
    row for the key, removes ``cache`` rows older than three days, inserts
    the new ``cache`` row, and inserts one ``cache_data`` row per source
    (URLs truncated to 1024 characters).

    NOTE(review): ``cache_data`` rows are never deleted explicitly here --
    presumably the schema takes care of that; confirm.
    """
    # The "three days ago" expression differs per SQL dialect.
    cutoff_expr = sql_dialect(
        mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
        sqlite="STRFTIME('%s', 'now', '-3 days')")
    delete_entry = "DELETE FROM cache WHERE cache_id = ?"
    delete_expired = "DELETE FROM cache WHERE cache_time < %s" % cutoff_expr
    insert_entry = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                         cache_possible_miss) VALUES (?, ?, ?, ?)"""
    insert_source = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                               cdata_confidence, cdata_skipped,
                                               cdata_excluded) VALUES (?, ?, ?, ?, ?)"""

    digest = sha256(mode + page.get().encode("utf8")).digest()
    cache_id = buffer(digest)

    source_rows = []
    for src in result.sources:
        source_rows.append((cache_id, src.url[:1024], src.confidence,
                            src.skipped, src.excluded))

    with get_cursor(conn) as cursor:
        cursor.execute(delete_entry, (cache_id,))
        cursor.execute(delete_expired)
        entry_row = (cache_id, result.queries, result.time,
                     result.possible_miss)
        cursor.execute(insert_entry, entry_row)
        cursor.executemany(insert_source, source_rows)