<%!
from collections import defaultdict
from datetime import datetime
from hashlib import sha256
from itertools import count
from os.path import expanduser
from re import sub, UNICODE
from sys import path
from time import time
from urlparse import parse_qs
from earwigbot import exceptions
from earwigbot.bot import Bot
import oursql
def get_results(lang, project, title, query):
    bot = Bot(".earwigbot")
    try:
        site = bot.wiki.get_site(lang=lang, project=project)
    except exceptions.SiteNotFoundError:
        try:
            site = bot.wiki.add_site(lang=lang, project=project)
        except exceptions.APIError:
            return None, None
    page = site.get_page(title)
    conn = open_sql_connection(bot)
    if query.get("nocache"):
        result = get_fresh_results(page, conn)
    else:
        result = get_cached_results(page, conn)
        if not result:
            result = get_fresh_results(page, conn)
    return page, result
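# Illustrative request (a sketch; the parameter names come from the parse_qs
# handling at the bottom of this file, the path is hypothetical):
#   copyvios?lang=en&project=wikipedia&title=Example&nocache=1
# "nocache" is optional; when set, the cache lookup above is skipped and a
# fresh copyvio check is run.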
def open_sql_connection(bot):
    conn_args = bot.config.wiki["_toolserverSQLCache"]
    if ("read_default_file" not in conn_args and "user" not in conn_args
            and "passwd" not in conn_args):
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)
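# Sketch of the config this function expects. Only the keys tested above are
# certain; "host" and "db" are hypothetical examples of the extra arguments
# passed straight through to oursql.connect():
#
#   config.wiki["_toolserverSQLCache"] = {
#       "host": "sql.example.org",
#       "db": "u_example_cache",
#       # "user"/"passwd" omitted, so ~/.my.cnf is read instead (see above)
#   }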
def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 1 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get().encode("utf8")).hexdigest()
    t_start = time()
    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
    if not results:
        return None
    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result
def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    total_seconds = int(diff.total_seconds())
    if total_seconds > 3600:
        return "{0} hours".format(total_seconds / 3600)
    if total_seconds > 60:
        return "{0} minutes".format(total_seconds / 60)
    return "{0} seconds".format(total_seconds)
def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result
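# copyvio_check() is EarwigBot's live check against a search engine;
# max_queries=10 caps the number of search queries spent on a single page
# (presumably to protect the shared API quota).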
def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get().encode("utf8")).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))
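# The three queries above imply a cache table roughly like this (reconstructed
# from the column order, not copied from the real schema):
#
#   CREATE TABLE cache (
#       cache_id           INT PRIMARY KEY,  -- page ID
#       cache_hash         CHAR(64),         -- SHA-256 of the page text
#       cache_url          VARCHAR(512),     -- best-match source URL
#       cache_time         TIMESTAMP,        -- set via CURRENT_TIMESTAMP
#       cache_queries      INT,              -- search queries used
#       cache_process_time FLOAT             -- original check duration, secs
#   );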
def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for word, i in zip(words, count(i)):
            try:
                next_word = strip_word(all_words[i + 1])
            except IndexError:
                next_word = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)      # Block for after
            before = block in delta.chain and sword in delta.chain[block]
            after = alock in delta.chain and next_word in delta.chain[alock]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)
def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u'</span>'
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u'</span>'
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res
def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
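# Expected results of the span markup above (hypothetical inputs):
#   fade_word(u"copyright", u"in")  -> u'<span class="cv-hl-in">copy</span>right'
#   fade_word(u"copyright", u"out") -> u'copyr<span class="cv-hl-out">ight</span>'
# Only the four characters at the block boundary get the fading style.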
def strip_word(word):
    return sub(r"[^\w\s-]", "", word.lower(), flags=UNICODE)
def urlstrip(url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
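# Examples, following directly from the checks above:
#   urlstrip("http://www.example.com/")  -> "example.com"
#   urlstrip("https://example.com/path") -> "example.com/path"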
%>\
<%
query = parse_qs(environ["QUERY_STRING"])
try:
    lang = query["lang"][0]
    project = query["project"][0]
    title = query["title"][0]
except (KeyError, IndexError):
    page = result = None
else:
    page, result = get_results(lang, project, title, query)
%>\
<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>
% if result.queries:
Retrieved from cache in ${round(result.tdiff, 3)} seconds (originally generated in ${round(result.original_tdiff, 3)}s using ${result.queries} queries; ${round(result.original_tdiff - result.tdiff, 3)}s saved).
% else:
Retrieved from cache in ${round(result.tdiff, 3)} seconds (originally generated in ${round(result.original_tdiff, 3)}s; ${round(result.original_tdiff - result.tdiff, 3)}s saved).