diff --git a/copyvios/checker.py b/copyvios/checker.py
index 7c5f5f3..91452d9 100644
--- a/copyvios/checker.py
+++ b/copyvios/checker.py
@@ -5,7 +5,9 @@
 from hashlib import sha256
 from urlparse import urlparse
 
 from earwigbot import exceptions
-from earwigbot.wiki.copyvios.markov import EMPTY
+from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
+from earwigbot.wiki.copyvios.parsers import ArticleTextParser
+from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 
 from .misc import Query, get_cache_db
 from .sites import get_site, get_sites
@@ -63,7 +65,7 @@
         return
     mode = "{0}:{1}:".format(use_engine, use_links)
     if not query.nocache:
-        query.result = _get_cached_results(page, conn, query, mode)
+        query.result = _get_cached_results(page, conn, mode)
     if not query.result:
         query.result = page.copyvio_check(
             min_confidence=T_SUSPECT, max_queries=10, max_time=45,
@@ -80,10 +82,13 @@
         elif scheme not in ["http", "https"]:
             query.error = "bad URI"
             return
-        result = _do_copyvio_compare(query, page, query.url)
-        if result:
-            query.result = result
-            query.result.cached = False
+        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
+                                      max_time=30)
+        if result.best.chains[0] is EMPTY:
+            query.error = "timeout" if result.time > 30 else "no data"
+            return
+        query.result = result
+        query.result.cached = False
     else:
         query.error = "bad action"
@@ -105,36 +110,53 @@
         page._load_content(res)
     return page
 
-def _get_cached_results(page, conn, query, mode):
+def _get_cached_results(page, conn, mode):
     query1 = """DELETE FROM cache
                 WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
-    query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
+    query2 = """SELECT cache_time, cache_queries, cache_process_time
                 FROM cache
-                WHERE cache_id = ? AND cache_hash = ?"""
-    shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
+                WHERE cache_id = ?"""
+    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped
+                FROM cache_data
+                WHERE cdata_cache_id = ?"""
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
     with conn.cursor() as cursor:
         cursor.execute(query1)
-        cursor.execute(query2, (page.pageid, shahash))
+        cursor.execute(query2, (cache_id,))
         results = cursor.fetchall()
         if not results:
             return None
+        cache_time, queries, check_time = results[0]
+        cursor.execute(query3, (cache_id,))
+        data = cursor.fetchall()
 
-    url, cache_time, num_queries, original_time = results[0]
-    result = _do_copyvio_compare(query, page, url)
-    if result:
+    if not data:  # TODO: do something less hacky for this edge case
+        artchain = MarkovChain(ArticleTextParser(page.get()).strip())
+        result = CopyvioCheckResult(False, [], queries, check_time, artchain)
         result.cached = True
-        result.queries = num_queries
-        result.original_time = original_time
         result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
         result.cache_age = _format_date(cache_time)
-    return result
+        return result
 
-def _do_copyvio_compare(query, page, url):
+    url, confidence, skipped = data.pop(0)
+    if skipped:  # Should be impossible: data must be bad; run a new check
+        return None
     result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
-    if not url or result.source_chain is not EMPTY:
-        return result
-    query.error = "timeout" if result.time > 30 else "no data"
+    if result.confidence != confidence:
+        return None
+
+    for url, confidence, skipped in data:
+        source = CopyvioSource(None, url)
+        source.confidence = confidence
+        source.skipped = skipped
+        result.sources.append(source)
+    result.queries = queries
+    result.time = check_time
+    result.cached = True
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = _format_date(cache_time)
+    return result
 
 def _format_date(cache_time):
     diff = datetime.utcnow() - cache_time
@@ -145,13 +167,15 @@
     return "{0} seconds".format(diff.seconds)
 
 def _cache_result(page, result, conn, mode):
-    query = """INSERT INTO cache
-               VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
-               ON DUPLICATE KEY UPDATE
-               cache_url = ?, cache_time = CURRENT_TIMESTAMP,
-               cache_queries = ?, cache_process_time = ?"""
-    shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
-    args = (page.pageid, shahash, result.url, result.queries, result.time,
-            result.url, result.queries, result.time)
+    query1 = "DELETE FROM cache WHERE cache_id = ?"
+    query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
+    query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
+    data = [(cache_id, source.url, source.confidence, source.skipped)
+            for source in result.sources]
    with conn.cursor() as cursor:
-        cursor.execute(query, args)
+        cursor.execute("START TRANSACTION")
+        cursor.execute(query1, (cache_id,))
+        cursor.execute(query2, (cache_id, result.queries, result.time))
+        cursor.executemany(query3, data)
+        cursor.execute("COMMIT")
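Note: the storage model changes here. Instead of one row per page keyed by
(pageid, SHA-256 hex hash), the cache is now keyed by the raw SHA-256 digest of
mode + article text, and per-source results move to a new cache_data table so
that a single check can cache several sources. The DDL is not part of this
diff, so the following is only a sketch of the schema the positional INSERTs
imply, written the same way checker.py embeds its query strings; the column
names come from the SELECTs above, but every type and size is an assumption:

    # Sketch only -- not from this diff. Column order must match
    # "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)" and
    # "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)".
    CREATE_CACHE = """CREATE TABLE cache (
        cache_id           BINARY(32) NOT NULL PRIMARY KEY,  -- sha256(...).digest()
        cache_time         TIMESTAMP  NOT NULL DEFAULT CURRENT_TIMESTAMP,
        cache_queries      INT        NOT NULL,
        cache_process_time FLOAT      NOT NULL)"""

    CREATE_CACHE_DATA = """CREATE TABLE cache_data (
        cdata_id         INT          NOT NULL AUTO_INCREMENT PRIMARY KEY,
        cdata_cache_id   BINARY(32)   NOT NULL,  -- references cache.cache_id
        cdata_url        VARCHAR(512) NOT NULL,
        cdata_confidence FLOAT        NOT NULL,
        cdata_skipped    BOOLEAN      NOT NULL)"""

The DELETE + INSERT pair in _cache_result runs inside an explicit transaction,
so a concurrent reader never sees a cache row without its cache_data rows.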
diff --git a/copyvios/highlighter.py b/copyvios/highlighter.py
index 791c64b..ca53357 100644
--- a/copyvios/highlighter.py
+++ b/copyvios/highlighter.py
@@ -2,14 +2,17 @@
 
 from re import sub, UNICODE
 
+from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
 from markupsafe import escape
 
 __all__ = ["highlight_delta"]
 
-def highlight_delta(context, chain, delta):
+def highlight_delta(context, chain, delta=None):
     degree = chain.degree - 1
     highlights = [False] * degree
     block = [chain.START] * degree
+    if not delta:
+        delta = EMPTY_INTERSECTION
     for word in chain.text.split() + ([chain.END] * degree):
         word = _strip_word(chain, word)
         tblock = tuple(block)
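Note: making delta optional matters because of the checker change above. A
cached result rebuilt with no sources has result.best == None, so the template
has no delta chain to pass; previously that would have raised an
AttributeError inside highlight_delta(). Any falsy delta now falls back to
EMPTY_INTERSECTION, whose chain contains no n-grams, so every block lookup
misses and the text renders with zero highlights. A minimal sketch of the
fallback, assuming earwigbot's MarkovChain API (resolve_delta is a
hypothetical helper, not part of this diff):

    from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION

    def resolve_delta(delta=None):
        # The same normalization highlight_delta() now applies internally.
        return delta if delta else EMPTY_INTERSECTION

    assert resolve_delta() is EMPTY_INTERSECTION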
+ query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)" + query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)" + cache_id = sha256(mode + page.get().encode("utf8")).digest() + data = [(cache_id, source.url, source.confidence, source.skipped) + for source in result.sources] with conn.cursor() as cursor: - cursor.execute(query, args) + cursor.execute("START TRANSACTION") + cursor.execute(query1, (cache_id,)) + cursor.execute(query2, (cache_id, result.queries, result.time)) + cursor.executemany(query3, data) + cursor.execute("COMMIT") diff --git a/copyvios/highlighter.py b/copyvios/highlighter.py index 791c64b..ca53357 100644 --- a/copyvios/highlighter.py +++ b/copyvios/highlighter.py @@ -2,14 +2,17 @@ from re import sub, UNICODE +from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION from markupsafe import escape __all__ = ["highlight_delta"] -def highlight_delta(context, chain, delta): +def highlight_delta(context, chain, delta=None): degree = chain.degree - 1 highlights = [False] * degree block = [chain.START] * degree + if not delta: + delta = EMPTY_INTERSECTION for word in chain.text.split() + ([chain.END] * degree): word = _strip_word(chain, word) tblock = tuple(block) diff --git a/templates/index.mako b/templates/index.mako index af468fe..2165f94 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -178,15 +178,7 @@
     <li>Redirected from ${query.redirected_from.title | h}. Check the original page.</li>
 % endif
 % if result.cached:
-    <li>
-        Results are <abbr title="To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the &quot;violated&quot; source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.">cached</abbr> from ${result.cache_age} ago.
-        Retrieved in ${round(result.time, 3)} seconds (originally generated in
-        % if result.queries:
-            ${round(result.original_time, 3)}s using ${result.queries} queries).
-        % else:
-            ${round(result.original_time, 3)}s).
-        % endif
-        Bypass the cache.</li>
+    <li>Results are <abbr title="To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URLs of the checked sources, but neither their content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.">cached</abbr> from ${result.cache_age} ago. Originally generated in ${round(result.time, 3)} seconds using ${result.queries} queries. Bypass the cache.</li>
 % else:
     <li>Results generated in ${round(result.time, 3)} seconds using ${result.queries} queries.</li>
 % endif
@@ -194,8 +186,8 @@
         <td>Article:</td>
-        <td>${highlight_delta(result.article_chain, result.delta_chain)}</td>
+        <td>${highlight_delta(result.article_chain, result.best.chains[1] if result.best else None)}</td>
     </tr>
     <tr>
         <td>Source:</td>
-        <td>${highlight_delta(result.source_chain, result.delta_chain)}</td>
+        <td>${highlight_delta(result.best.chains[0], result.best.chains[1]) if result.best else ""}</td>
     </tr>
 % endif
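Note: for the template changes, result.best is the highest-confidence
CopyvioSource (None when no sources were checked), and source.chains is the
(source_chain, delta_chain) pair, which is why the cells index chains[0] and
chains[1]. A sketch of the same logic in Python, assuming only the attributes
this diff actually uses; comparison_cells is a hypothetical helper, and
context is whatever Mako normally passes to highlight_delta:

    def comparison_cells(context, result):
        """Return (article_html, source_html) as the table above renders them."""
        if result.best:
            source_chain, delta = result.best.chains
            return (highlight_delta(context, result.article_chain, delta),
                    highlight_delta(context, source_chain, delta))
        # No sources (e.g. the cached edge case in checker.py): highlight
        # nothing in the article and leave the source cell empty.
        return highlight_delta(context, result.article_chain, None), ""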