From 7d92f7a76b7c8022715d8129d94cc95d588e2e27 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Feb 2012 18:01:38 -0500
Subject: [PATCH] Copyvio detection interface is (mostly) done.

---
 pages/copyvios.mako     | 219 +++++++++++++++++++++++++++++++++++++++++++++---
 static/css/copyvios.css |  88 +++++++++++++++++++
 static/js/copyvios.js   |  12 +++
 3 files changed, 306 insertions(+), 13 deletions(-)
 create mode 100644 static/css/copyvios.css
 create mode 100644 static/js/copyvios.js

diff --git a/pages/copyvios.mako b/pages/copyvios.mako
index 4f2b540..153022f 100644
--- a/pages/copyvios.mako
+++ b/pages/copyvios.mako
@@ -1,10 +1,165 @@
 <%!
+    from collections import defaultdict
+    from datetime import datetime
+    from hashlib import sha256
+    from itertools import count
+    from os.path import expanduser
+    from re import sub, UNICODE
     from sys import path
+    from time import time
     from urlparse import parse_qs
 
+    import oursql
+
     path.insert(0, "../earwigbot")
     import earwigbot
+
+    def get_results(lang, project, title, query):
+        earwigbot.config.config.load("config.ts-earwigbot.json")
+        try:
+            site = earwigbot.wiki.get_site(lang=lang, project=project)
+        except earwigbot.wiki.SiteNotFoundError:
+            return None, None
+        page = site.get_page(title)
+        conn = open_sql_connection()
+        if not query.get("nocache"):
+            result = get_cached_results(page, conn)
+        if query.get("nocache") or not result:
+            result = get_fresh_results(page, conn)
+        return page, result
+
+    def open_sql_connection():
+        conn_args = earwigbot.config.config.wiki["_toolserverSQLCache"]
+        conn_args["read_default_file"] = expanduser("~/.my.cnf")
+        return oursql.connect(**conn_args)
+
+    def get_cached_results(page, conn):
+        query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 1 DAY)"
+        query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
+        pageid = page.pageid()
+        hash = sha256(page.get()).hexdigest()
+        t_start = time()
+
+        with conn.cursor() as cursor:
+            cursor.execute(query1)
+            cursor.execute(query2, (pageid, hash))
+            results = cursor.fetchall()
+            if not results:
+                return None
+
+        url, cache_time, num_queries, original_tdiff = results[0]
+        result = page.copyvio_compare(url, min_confidence=0.5)
+        result.cached = True
+        result.queries = num_queries
+        result.tdiff = time() - t_start
+        result.original_tdiff = original_tdiff
+        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+        result.cache_age = format_date(cache_time)
+        return result
+
+    def format_date(cache_time):
+        diff = datetime.utcnow() - cache_time
+        if diff.seconds > 3600:
+            return "{0} hours".format(diff.seconds / 3600)
+        if diff.seconds > 60:
+            return "{0} minutes".format(diff.seconds / 60)
+        return "{0} seconds".format(diff.seconds)
+
+    def get_fresh_results(page, conn):
+        t_start = time()
+        result = page.copyvio_check(min_confidence=0.5, max_queries=10)
+        result.cached = False
+        result.tdiff = time() - t_start
+        cache_result(page, result, conn)
+        return result
+
+    def cache_result(page, result, conn):
+        pageid = page.pageid()
+        hash = sha256(page.get()).hexdigest()
+        query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
+        query2 = "DELETE FROM cache WHERE cache_id = ?"
+ query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)" + with conn.cursor() as cursor: + cursor.execute(query1, (pageid,)) + if cursor.fetchall(): + cursor.execute(query2, (pageid,)) + cursor.execute(query3, (pageid, hash, result.url, result.queries, + result.tdiff)) + + def highlight_delta(chain, delta): + processed = [] + prev = chain.START + i = 0 + all_words = chain.text.split() + paragraphs = chain.text.split("\n") + for paragraph in paragraphs: + processed_words = [] + words = paragraph.split(" ") + for word, i in zip(words, count(i)): + try: + next = strip_word(all_words[i+1]) + except IndexError: + next = chain.END + sword = strip_word(word) + before = prev in delta.chain and sword in delta.chain[prev] + after = sword in delta.chain and next in delta.chain[sword] + is_first = i == 0 + is_last = i + 1 == len(all_words) + res = highlight_word(word, before, after, is_first, is_last) + processed_words.append(res) + prev = sword + processed.append(u" ".join(processed_words)) + i += 1 + return u"

".join(processed) + + def highlight_word(word, before, after, is_first, is_last): + if before and after: + # Word is in the middle of a highlighted block, so don't change + # anything unless this is the first word (force block to start) or + # the last word (force block to end): + res = word + if is_first: + res = u'' + res + if is_last: + res += u'' + elif before: + # Word is the last in a highlighted block, so fade it out and then + # end the block; force open a block before the word if this is the + # first word: + res = fade_word(word, u"out") + u"" + if is_first: + res = u'' + res + elif after: + # Word is the first in a highlighted block, so start the block and + # then fade it in; force close the block after the word if this is + # the last word: + res = u'' + fade_word(word, u"in") + if is_last: + res += u"" + else: + # Word is completely outside of a highlighted block, so do nothing: + res = word + return res + + def fade_word(word, dir): + if len(word) <= 4: + return u'{1}'.format(dir, word) + if dir == u"out": + return u'{0}{1}'.format(word[:-4], word[-4:]) + return u'{0}{1}'.format(word[:4], word[4:]) + + def strip_word(word): + return sub("[^\w\s-]", "", word.lower(), flags=UNICODE) + + def urlstrip(url): + if url.startswith("http://"): + url = url[7:] + if url.startswith("www."): + url = url[4:] + if url.endswith("/"): + url = url[:-1] + return url %>\ <% query = parse_qs(environ["QUERY_STRING"]) @@ -15,15 +170,9 @@ except (KeyError, IndexError): page = None else: - earwigbot.config.config.load("config.ts-earwigbot.json") - try: - site = earwigbot.wiki.get_site(lang=lang, project=project) - except earwigbot.wiki.SiteNotFoundError: - page = None - else: - page = site.get_page(title) + page, result = get_results(lang, project, title, query) %>\ -<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector'"/> +<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>

Copyvio Detector

This tool attempts to detect copyright violations in Wikipedia articles.

@@ -42,20 +191,64 @@
             <td>Page title:</td>
             % if page:
-
+
             % else:
             % endif
+        % if query.get("nocache") or page:
+        <tr>
+            <td>Bypass cache:</td>
+            % if query.get("nocache"):
+            <td><input type="checkbox" name="nocache" value="1" checked="checked" /></td>
+            % else:
+            <td><input type="checkbox" name="nocache" value="1" /></td>
+            % endif
+        </tr>
+        % endif
 % if page:
-    <div>
-        <h2>Result for ${page.title()}:</h2>
-        <p>Watch this space!</p>
-    </div>
+
+
+    % if result.violation:
+    <div id="cv-result-yes">${page.title() | h} is a suspected violation of ${result.url | urlstrip}.</div>
+    % else:
+    <div id="cv-result-no">No violations detected in ${page.title() | h}.</div>
+    % endif
+
+
+
% endif <%include file="/support/footer.mako" args="environ=environ"/> diff --git a/static/css/copyvios.css b/static/css/copyvios.css new file mode 100644 index 0000000..573e2f3 --- /dev/null +++ b/static/css/copyvios.css @@ -0,0 +1,88 @@ +div#cv-result-yes { + padding: 0 10px 0 10px; + margin: 0 5px 10px 5px; + background-color: #FEE; + border: 1px solid #F77; +} + +div#cv-result-no { + padding: 0 10px 0 10px; + margin: 0 5px 10px 5px; + background-color: #EFE; + border: 1px solid #7F7; +} + +div#cv-result-detail { + padding: 0 10px 0 10px; + margin: 10px 10px 20px 10px; + background-color: #F3F3F3; + border: 1px solid #AAA; +} + +table#cv-chain-table { + width: 100%; + border-spacing: 10px; +} + +h2#cv-result-header { + margin-bottom: 0; +} + +ul#cv-result-list { + margin-top: 0.5em; +} + +ul#cv-result-detail-list { + margin-bottom: 0; +} + +a#cv-cached { + position: relative; +} + +a#cv-cached span { + display: none; + position: absolute; + top: 20px; + left: -50px; + width: 500px; + padding: 5px; + z-index: 1; + background: #F3F3F3; + border: 1px solid #AAA; + color: black; +} + +a:active#cv-cached { color: #040; text-decoration: none; } +a:hover#cv-cached { text-decoration: none; } +a:hover#cv-cached span { display: block; } + +div.cv-chain-detail { + padding: 0 10px 0 10px; + background-color: #FFF; + border: 1px solid #BBB; +} + +span.cv-hl { + background: #FAA; +} + +span.cv-hl-in { + background: #FCC; + background: -webkit-gradient(linear, 0% 0%, 100% 0%, from(#FFF), to(#FAA)); /* Chrome, Safari 4+ */ + background: -webkit-linear-gradient(left, #FFF, #FAA); /* Chrome 10+, Safari 5+ */ + background: -moz-linear-gradient(left, #FFF, #FAA); /* Firefox 3.6+ */ + background: -ms-linear-gradient(left, #FFF, #FAA); /* IE 10+ */ + background: -o-linear-gradient(left, #FFF, #FAA); /* Opera 11.10+ */ + background: linear-gradient(left, #FFF, #FAA); +} + +span.cv-hl-out { + background: #FCC; + background: -webkit-gradient(linear, 0% 0%, 100% 0%, from(#FAA), to(#FFF)); /* Chrome, Safari 4+ */ + background: -webkit-linear-gradient(left, #FAA, #FFF); /* Chrome 10+, Safari 5+ */ + background: -moz-linear-gradient(left, #FAA, #FFF); /* Firefox 3.6+ */ + background: -ms-linear-gradient(left, #FAA, #FFF); /* IE 10+ */ + background: -o-linear-gradient(left, #FAA, #FFF); /* Opera 11.10+ */ + background: linear-gradient(left, #FAA, #FFF); +} diff --git a/static/js/copyvios.js b/static/js/copyvios.js new file mode 100644 index 0000000..30a3ed2 --- /dev/null +++ b/static/js/copyvios.js @@ -0,0 +1,12 @@ +function copyvio_toggle_details(details) { + link = document.getElementById("cv-result-detail-link"); + details = document.getElementById("cv-result-detail"); + + if (link.innerHTML == "Show details:") { + details.style.display = "block"; + link.innerHTML = "Hide details:"; + } else { + details.style.display = "none"; + link.innerHTML = "Show details:"; + } +}