diff --git a/pages/copyvios.mako b/pages/copyvios.mako
index 4f2b540..153022f 100644
--- a/pages/copyvios.mako
+++ b/pages/copyvios.mako
@@ -1,10 +1,165 @@
<%!
+ from collections import defaultdict
+ from datetime import datetime
+ from hashlib import sha256
+ from itertools import count
+ from os.path import expanduser
+ from re import sub, UNICODE
from sys import path
+ from time import time
from urlparse import parse_qs
+ import oursql
+
path.insert(0, "../earwigbot")
import earwigbot
+
+ def get_results(lang, project, title, query):
+ earwigbot.config.config.load("config.ts-earwigbot.json")
+ try:
+ site = earwigbot.wiki.get_site(lang=lang, project=project)
+ except earwigbot.wiki.SiteNotFoundError:
+ return None, None
+ page = site.get_page(title)
+ conn = open_sql_connection()
+ if not query.get("nocache"):
+ result = get_cached_results(page, conn)
+ if query.get("nocache") or not result:
+ result = get_fresh_results(page, conn)
+ return page, result
+
+ def open_sql_connection():
+ conn_args = earwigbot.config.config.wiki["_toolserverSQLCache"]
+ conn_args["read_default_file"] = expanduser("~/.my.cnf")
+ return oursql.connect(**conn_args)
+
+ def get_cached_results(page, conn):
+ query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 1 DAY)"
+ query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
+ pageid = page.pageid()
+ hash = sha256(page.get()).hexdigest()
+ t_start = time()
+
+ with conn.cursor() as cursor:
+ cursor.execute(query1)
+ cursor.execute(query2, (pageid, hash))
+ results = cursor.fetchall()
+ if not results:
+ return None
+
+ url, cache_time, num_queries, original_tdiff = results[0]
+ result = page.copyvio_compare(url, min_confidence=0.5)
+ result.cached = True
+ result.queries = num_queries
+ result.tdiff = time() - t_start
+ result.original_tdiff = original_tdiff
+ result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+ result.cache_age = format_date(cache_time)
+ return result
+
+ def format_date(cache_time):
+ diff = datetime.utcnow() - cache_time
+ if diff.seconds > 3600:
+ return "{0} hours".format(diff.seconds / 3600)
+ if diff.seconds > 60:
+ return "{0} minutes".format(diff.seconds / 60)
+ return "{0} seconds".format(diff.seconds)
+
+ def get_fresh_results(page, conn):
+ t_start = time()
+ result = page.copyvio_check(min_confidence=0.5, max_queries=10)
+ result.cached = False
+ result.tdiff = time() - t_start
+ cache_result(page, result, conn)
+ return result
+
+ def cache_result(page, result, conn):
+ pageid = page.pageid()
+ hash = sha256(page.get()).hexdigest()
+ query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
+ query2 = "DELETE FROM cache WHERE cache_id = ?"
+ query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
+ with conn.cursor() as cursor:
+ cursor.execute(query1, (pageid,))
+ if cursor.fetchall():
+ cursor.execute(query2, (pageid,))
+ cursor.execute(query3, (pageid, hash, result.url, result.queries,
+ result.tdiff))
+
+ def highlight_delta(chain, delta):
+ processed = []
+ prev = chain.START
+ i = 0
+ all_words = chain.text.split()
+ paragraphs = chain.text.split("\n")
+ for paragraph in paragraphs:
+ processed_words = []
+ words = paragraph.split(" ")
+ for word, i in zip(words, count(i)):
+ try:
+ next = strip_word(all_words[i+1])
+ except IndexError:
+ next = chain.END
+ sword = strip_word(word)
+ before = prev in delta.chain and sword in delta.chain[prev]
+ after = sword in delta.chain and next in delta.chain[sword]
+ is_first = i == 0
+ is_last = i + 1 == len(all_words)
+ res = highlight_word(word, before, after, is_first, is_last)
+ processed_words.append(res)
+ prev = sword
+ processed.append(u" ".join(processed_words))
+ i += 1
+ return u"
".join(processed)
+
+ def highlight_word(word, before, after, is_first, is_last):
+ if before and after:
+ # Word is in the middle of a highlighted block, so don't change
+ # anything unless this is the first word (force block to start) or
+ # the last word (force block to end):
+ res = word
+ if is_first:
+ res = u'' + res
+ if is_last:
+ res += u''
+ elif before:
+ # Word is the last in a highlighted block, so fade it out and then
+ # end the block; force open a block before the word if this is the
+ # first word:
+ res = fade_word(word, u"out") + u""
+ if is_first:
+ res = u'' + res
+ elif after:
+ # Word is the first in a highlighted block, so start the block and
+ # then fade it in; force close the block after the word if this is
+ # the last word:
+ res = u'' + fade_word(word, u"in")
+ if is_last:
+ res += u""
+ else:
+ # Word is completely outside of a highlighted block, so do nothing:
+ res = word
+ return res
+
+ def fade_word(word, dir):
+ if len(word) <= 4:
+ return u'{1}'.format(dir, word)
+ if dir == u"out":
+ return u'{0}{1}'.format(word[:-4], word[-4:])
+ return u'{0}{1}'.format(word[:4], word[4:])
+
+ def strip_word(word):
+ return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
+
+ def urlstrip(url):
+ if url.startswith("http://"):
+ url = url[7:]
+ if url.startswith("www."):
+ url = url[4:]
+ if url.endswith("/"):
+ url = url[:-1]
+ return url
%>\
<%
query = parse_qs(environ["QUERY_STRING"])
@@ -15,15 +170,9 @@
except (KeyError, IndexError):
page = None
else:
- earwigbot.config.config.load("config.ts-earwigbot.json")
- try:
- site = earwigbot.wiki.get_site(lang=lang, project=project)
- except earwigbot.wiki.SiteNotFoundError:
- page = None
- else:
- page = site.get_page(title)
+ page, result = get_results(lang, project, title, query)
%>\
-<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector'"/>
+<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>
Retrieved from cache in ${round(result.tdiff, 3)} seconds (originally generated in ${round(result.original_tdiff, 3)}s using ${result.queries} queries; ${round(result.original_tdiff - result.tdiff, 3)}s saved).
+ % else:
+
Retrieved from cache in ${round(result.tdiff, 3)} seconds (originally generated in ${round(result.original_tdiff, 3)}s; ${round(result.original_tdiff - result.tdiff, 3)}s saved).