Browse Source

Slightly modified interface; give more options (closes #17).

pull/24/head
Ben Kurtovic 10 years ago
parent
commit
710447a6bf
6 changed files with 153 additions and 82 deletions
  1. +26
    -14
      copyvios/checker.py
  2. +13
    -0
      static/script.js
  3. +34
    -14
      static/style.css
  4. +78
    -53
      templates/index.mako
  5. +1
    -1
      templates/settings.mako
  6. +1
    -0
      templates/support/header.mako

+ 26
- 14
copyvios/checker.py View File

@@ -25,7 +25,8 @@ def do_check():
query.project = query.project.lower() query.project = query.project.lower()


query.all_langs, query.all_projects = get_sites() query.all_langs, query.all_projects = get_sites()
if query.project and query.lang and (query.title or query.oldid):
query.submitted = query.project and query.lang and (query.title or query.oldid)
if query.submitted:
query.site = get_site(query) query.site = get_site(query)
if query.site: if query.site:
_get_results(query, follow=query.noredirect is None) _get_results(query, follow=query.noredirect is None)
@@ -51,7 +52,25 @@ def _get_results(query, follow=True):
query.redirected_from = page query.redirected_from = page
return _get_results(query, follow=False) return _get_results(query, follow=False)


if query.url:
if not query.action:
query.action = "compare" if query.url else "search"
if query.action == "search":
conn = get_cache_db()
use_engine = 1 if query.use_engine else 0
use_links = 1 if query.use_links else 0
mode = "{0}:{1}:".format(use_engine, use_links)
if not query.nocache:
query.result = _get_cached_results(page, conn, query, mode)
if not query.result:
query.result = page.copyvio_check(
min_confidence=T_SUSPECT, max_queries=10, max_time=45,
no_searches=not use_engine, no_links=not use_links)
query.result.cached = False
_cache_result(page, query.result, conn, mode)
elif query.action == "compare":
if not query.url:
query.error = "no URL"
return
scheme = urlparse(query.url).scheme scheme = urlparse(query.url).scheme
if not scheme and query.url[0] not in ":/": if not scheme and query.url[0] not in ":/":
query.url = "http://" + query.url query.url = "http://" + query.url
@@ -63,14 +82,7 @@ def _get_results(query, follow=True):
query.result = result query.result = result
query.result.cached = False query.result.cached = False
else: else:
conn = get_cache_db()
if not query.nocache:
query.result = _get_cached_results(page, conn, query)
if not query.result:
query.result = page.copyvio_check(
min_confidence=T_SUSPECT, max_queries=10, max_time=45)
query.result.cached = False
_cache_result(page, query.result, conn)
query.error = "bad action"


def _get_page_by_revid(site, revid): def _get_page_by_revid(site, revid):
res = site.api_query(action="query", prop="info|revisions", revids=revid, res = site.api_query(action="query", prop="info|revisions", revids=revid,
@@ -90,13 +102,13 @@ def _get_page_by_revid(site, revid):
page._load_content(res) page._load_content(res)
return page return page


def _get_cached_results(page, conn, query):
def _get_cached_results(page, conn, query, mode):
query1 = """DELETE FROM cache query1 = """DELETE FROM cache
WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)""" WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
FROM cache FROM cache
WHERE cache_id = ? AND cache_hash = ?""" WHERE cache_id = ? AND cache_hash = ?"""
shahash = sha256(page.get().encode("utf8")).hexdigest()
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()


with conn.cursor() as cursor: with conn.cursor() as cursor:
cursor.execute(query1) cursor.execute(query1)
@@ -129,13 +141,13 @@ def _format_date(cache_time):
return "{0} minutes".format(diff.seconds / 60) return "{0} minutes".format(diff.seconds / 60)
return "{0} seconds".format(diff.seconds) return "{0} seconds".format(diff.seconds)


def _cache_result(page, result, conn):
def _cache_result(page, result, conn, mode):
query = """INSERT INTO cache query = """INSERT INTO cache
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?) VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
cache_url = ?, cache_time = CURRENT_TIMESTAMP, cache_url = ?, cache_time = CURRENT_TIMESTAMP,
cache_queries = ?, cache_process_time = ?""" cache_queries = ?, cache_process_time = ?"""
shahash = sha256(page.get().encode("utf8")).hexdigest()
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
args = (page.pageid, shahash, result.url, result.queries, result.time, args = (page.pageid, shahash, result.url, result.queries, result.time,
result.url, result.queries, result.time) result.url, result.queries, result.time)
with conn.cursor() as cursor: with conn.cursor() as cursor:


+ 13
- 0
static/script.js View File

@@ -87,3 +87,16 @@ function set_cookie(name, value, days) {
function delete_cookie(name) { function delete_cookie(name) {
set_cookie(name, "", -1); set_cookie(name, "", -1);
} }

$(document).ready(function() {
    // Enable the inputs that belong to the currently selected action and
    // disable the other action's inputs. The state is derived from the radio
    // buttons themselves, so a page rendered with "URL comparison" already
    // checked keeps its URL box enabled (a disabled input is not submitted
    // with the form). If neither radio is checked, search mode is assumed.
    function syncActionInputs() {
        var comparing = $("#action-compare").prop("checked");
        $("#cv-cb-engine").prop("disabled", comparing);
        $("#cv-cb-links").prop("disabled", comparing);
        $("#url-box").prop("disabled", !comparing);
    }

    $("#action-search, #action-compare").change(syncActionInputs);
    // Apply the initial state once on load instead of forcing search mode.
    syncActionInputs();
});

+ 34
- 14
static/style.css View File

@@ -18,15 +18,15 @@ h2 {
div#header { div#header {
font-size: 2.5em; font-size: 2.5em;
font-weight: bold; font-weight: bold;
margin: 30px 60px 30px 60px;
padding: 10px 15px 10px 15px;
margin: 20px 60px;
padding: 10px 15px;
border: 1px solid #777; border: 1px solid #777;
background-color: #FFF; background-color: #FFF;
} }


div#container { div#container {
line-height: 1.25; line-height: 1.25;
margin: 0 60px 75px 60px;
margin: 0 60px 65px 60px;
padding: 5px 15px 15px 15px; padding: 5px 15px 15px 15px;
border: 1px solid #777; border: 1px solid #777;
background-color: #FFF; background-color: #FFF;
@@ -50,8 +50,8 @@ div#info-box {
} }


div#cv-result { div#cv-result {
padding: 5px 10px 0 10px;
margin: 0 5px 10px 5px;
padding: 10px;
margin: 15px 5px 10px 5px;
} }


table#heading { table#heading {
@@ -62,6 +62,11 @@ table#cv-form {
width: 750px; width: 750px;
} }


/* Inner table nested inside the form's "Action" row. */
table#cv-form-inner {
width: 100%;
border-spacing: 0;
}

table#cv-chain-table { table#cv-chain-table {
width: 100%; width: 100%;
margin-bottom: 10px; margin-bottom: 10px;
@@ -89,13 +94,24 @@ td#cv-col4 {
width: 15%; width: 15%;
} }


/* Column widths of the inner action table (radio button, label, options).
   The three percentages add up to exactly 100%. */
td#cv-inner-col1 {
    width: 4%;
}

td#cv-inner-col2 {
    width: 22%;
}

td#cv-inner-col3 {
    width: 74%;
}

h2#cv-result-header { h2#cv-result-header {
margin-top: 0.2em;
margin-bottom: 0;
margin: 0;
} }


ul#cv-result-list { ul#cv-result-list {
margin-top: 0.5em;
margin: 0.5em 0;
} }


a#cv-cached { a#cv-cached {
@@ -115,12 +131,6 @@ a#cv-cached span {
color: black; color: black;
} }


div.divider {
border-bottom: 1px solid #AAA;
margin-top: 15px;
margin-bottom: 15px;
}

div.green-box { div.green-box {
background-color: #EFE; background-color: #EFE;
border: 1px solid #7F7; border: 1px solid #7F7;
@@ -155,6 +165,16 @@ input.cv-text {
width: 100%; width: 100%;
} }


/* "Use search engine" checkbox: no left margin, 5px gap before its label. */
input#cv-cb-engine {
margin-left: 0;
margin-right: 5px;
}

/* "Use links in page" checkbox: 20px away from the preceding label,
   5px gap before its own label. */
input#cv-cb-links {
margin-left: 20px;
margin-right: 5px;
}

span.cv-hl { span.cv-hl {
background: #FAA; background: #FAA;
} }


+ 78
- 53
templates/index.mako View File

@@ -5,8 +5,16 @@
<%include file="/support/header.mako" args="title='Earwig\'s Copyvio Detector'"/> <%include file="/support/header.mako" args="title='Earwig\'s Copyvio Detector'"/>
<%namespace module="copyvios.highlighter" import="highlight_delta"/>\ <%namespace module="copyvios.highlighter" import="highlight_delta"/>\
<%namespace module="copyvios.misc" import="httpsfix, urlstrip"/>\ <%namespace module="copyvios.misc" import="httpsfix, urlstrip"/>\
% if query.project and query.lang and (query.title or query.oldid):
% if query.error == "bad URI":
% if query.submitted:
% if query.error == "bad action":
<div id="info-box" class="red-box">
<p>Unknown action: <b><span class="mono">${query.action | h}</span></b>.</p>
</div>
% elif query.error == "no URL":
<div id="info-box" class="red-box">
<p>URL comparison mode requires a URL to be entered. Enter one in the text box below, or choose copyvio search mode to look for content similar to the article elsewhere on the web.</p>
</div>
% elif query.error == "bad URI":
<div id="info-box" class="red-box"> <div id="info-box" class="red-box">
<p>Unsupported URI scheme: <a href="${query.url | h}">${query.url | h}</a>.</p> <p>Unsupported URI scheme: <a href="${query.url | h}">${query.url | h}</a>.</p>
</div> </div>
@@ -32,7 +40,7 @@
</div> </div>
% endif % endif
%endif %endif
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. Simply give the title of the page or ID of the revision you want to check and hit Submit. The tool will search for similar content elsewhere on the web using <a href="//info.yahoo.com/legal/us/yahoo/boss/pricing/">Yahoo! BOSS</a> and then display a report if a match is found. If you give a URL, it will skip the search engine step and directly display a report comparing the article to that particular webpage, like the <a href="//toolserver.org/~dcoetzee/duplicationdetector/">Duplication Detector</a>.</p>
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. Simply give the title of the page or ID of the revision you want to check and hit Submit. The tool will search for similar content elsewhere on the web using <a href="//developer.yahoo.com/boss/search/">Yahoo! BOSS</a> and then display a report if a match is found. If you give a specific URL, it will skip the search engine step and directly display a report comparing the article to that particular webpage, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p>
<p>Running a full check can take up to 45 seconds if other websites are slow. Please be patient. If you get a timeout, wait a moment and refresh the page.</p> <p>Running a full check can take up to 45 seconds if other websites are slow. Please be patient. If you get a timeout, wait a moment and refresh the page.</p>
<p>Specific websites can be skipped (for example, if their content is in the public domain) by being added to the <a href="//en.wikipedia.org/wiki/User:EarwigBot/Copyvios/Exclusions">excluded URL list</a>.</p> <p>Specific websites can be skipped (for example, if their content is in the public domain) by being added to the <a href="//en.wikipedia.org/wiki/User:EarwigBot/Copyvios/Exclusions">excluded URL list</a>.</p>
<form action="${request.script_root}" method="get"> <form action="${request.script_root}" method="get">
@@ -40,7 +48,7 @@
<tr> <tr>
<td>Site:</td> <td>Site:</td>
<td colspan="3"> <td colspan="3">
<span class="mono">http://</span>
<span class="mono">https://</span>
<select name="lang"> <select name="lang">
<% selected_lang = query.orig_lang if query.orig_lang else g.cookies["CopyviosDefaultLang"].value if "CopyviosDefaultLang" in g.cookies else g.bot.wiki.get_site().lang %>\ <% selected_lang = query.orig_lang if query.orig_lang else g.cookies["CopyviosDefaultLang"].value if "CopyviosDefaultLang" in g.cookies else g.bot.wiki.get_site().lang %>\
% for code, name in query.all_langs: % for code, name in query.all_langs:
@@ -84,24 +92,42 @@
</td> </td>
</tr> </tr>
<tr> <tr>
<td>URL&nbsp;(optional):</td>
<td>Action:</td>
<td colspan="3"> <td colspan="3">
% if query.url:
<input class="cv-text" type="text" name="url" value="${query.url | h}" />
% else:
<input class="cv-text" type="text" name="url" />
% endif
<table id="cv-form-inner">
<tr>
<td id="cv-inner-col1">
<input id="action-search" type="radio" name="action" value="search" ${'checked="checked"' if (query.action == "search" or not query.action) else ""} />
</td>
<td id="cv-inner-col2"><label for="action-search">Copyvio&nbsp;search:</label></td>
<td id="cv-inner-col3">
    ## Each option is pre-checked while the form has not been submitted yet;
    ## afterwards it follows query.use_engine / query.use_links.
    <input id="cv-cb-engine" type="checkbox" name="use_engine" value="1" ${'checked="checked"' if (query.use_engine or not query.submitted) else ""} />
    <label for="cv-cb-engine">Use&nbsp;search&nbsp;engine</label>
    <input id="cv-cb-links" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links or not query.submitted) else ""} />
    <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
</td>
</tr>
<tr>
<td>
<input id="action-compare" type="radio" name="action" value="compare" ${'checked="checked"' if query.action == "compare" else ""} />
</td>
<td><label for="action-compare">URL&nbsp;comparison:</label></td>
<td>
<input id="url-box" class="cv-text" type="text" name="url"
% if query.url:
value="${query.url | h}"
% endif
/>
</td>
</tr>
</table>
</td> </td>
</tr> </tr>
% if query.nocache or (result and result.cached): % if query.nocache or (result and result.cached):
<tr> <tr>
<td>Bypass&nbsp;cache:</td>
<td><label for="cb-nocache">Bypass&nbsp;cache:</label></td>
<td colspan="3"> <td colspan="3">
% if query.nocache:
<input type="checkbox" name="nocache" value="1" checked="checked" />
% else:
<input type="checkbox" name="nocache" value="1" />
% endif
<input id="cb-nocache" type="checkbox" name="nocache" value="1" ${'checked="checked"' if query.nocache else ""} />
</td> </td>
</tr> </tr>
% endif % endif
@@ -114,7 +140,6 @@
</form> </form>
% if result: % if result:
<% hide_comparison = "CopyviosHideComparison" in g.cookies and g.cookies["CopyviosHideComparison"].value == "True" %> <% hide_comparison = "CopyviosHideComparison" in g.cookies and g.cookies["CopyviosHideComparison"].value == "True" %>
<div class="divider"></div>
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
<h2 id="cv-result-header"> <h2 id="cv-result-header">
% if result.confidence >= T_POSSIBLE: % if result.confidence >= T_POSSIBLE:
@@ -131,44 +156,44 @@
% endif % endif
% endif % endif
</h2> </h2>
<ul id="cv-result-list">
% if result.confidence < T_POSSIBLE and not query.url:
% if result.url:
<li>Best match: <a href="${result.url | h}">${result.url | urlstrip, h}</a>.</li>
% else:
<li>No matches found.</li>
% endif
% endif
</div>
<ul id="cv-result-list">
% if result.confidence < T_POSSIBLE and query.action == "search":
% if result.url: % if result.url:
<li><b><span class="mono">${round(result.confidence * 100, 1)}%</span></b> confidence of a violation.</li>
% endif
% if query.redirected_from:
<li>Redirected from <a href="${query.redirected_from.url}">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&amp;noredirect=1">Check the original page.</a></li>
% endif
% if result.cached:
<li>
Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Retrieved in <span class="mono">${round(result.time, 3)}</span> seconds (originally generated in
% if result.queries:
<span class="mono">${round(result.original_time, 3)}</span>s using <span class="mono">${result.queries}</span> queries).
% else:
<span class="mono">${round(result.original_time, 3)}</span>s).
% endif
<a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a>
</li>
<li>Best match: <a href="${result.url | h}">${result.url | urlstrip, h}</a>.</li>
% else: % else:
<li>Results generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries.</li>
% endif
% if result.queries:
<li><i>Fun fact:</i> The Wikimedia Foundation paid Yahoo! Inc. <a href="http://info.yahoo.com/legal/us/yahoo/search/bosspricing/details.html">$${result.queries * 0.0008} USD</a> for these results.</li>
<li>No matches found.</li>
% endif % endif
<li><a id="cv-chain-link" href="#cv-chain-table" onclick="copyvio_toggle_details()">${"Show" if hide_comparison else "Hide"} comparison:</a></li>
</ul>
<table id="cv-chain-table" style="display: ${'none' if hide_comparison else 'table'};">
<tr>
<td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
<td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
</tr>
</table>
</div>
% endif
% if result.url:
<li><b><span class="mono">${round(result.confidence * 100, 1)}%</span></b> confidence of a violation.</li>
% endif
% if query.redirected_from:
<li>Redirected from <a href="${query.redirected_from.url}">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&amp;noredirect=1">Check the original page.</a></li>
% endif
% if result.cached:
<li>
Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Retrieved in <span class="mono">${round(result.time, 3)}</span> seconds (originally generated in
% if result.queries:
<span class="mono">${round(result.original_time, 3)}</span>s using <span class="mono">${result.queries}</span> queries).
% else:
<span class="mono">${round(result.original_time, 3)}</span>s).
% endif
<a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a>
</li>
% else:
<li>Results generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries.</li>
% endif
% if result.queries:
<li><i>Fun fact:</i> The Wikimedia Foundation paid Yahoo! Inc. <a href="http://info.yahoo.com/legal/us/yahoo/search/bosspricing/details.html">$${result.queries * 0.0008} USD</a> for these results.</li>
% endif
<li><a id="cv-chain-link" href="#cv-chain-table" onclick="copyvio_toggle_details()">${"Show" if hide_comparison else "Hide"} comparison:</a></li>
</ul>
<table id="cv-chain-table" style="display: ${'none' if hide_comparison else 'table'};">
<tr>
<td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
<td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
</tr>
</table>
% endif % endif
<%include file="/support/footer.mako"/> <%include file="/support/footer.mako"/>

+ 1
- 1
templates/settings.mako View File

@@ -14,7 +14,7 @@
<tr> <tr>
<td>Default site:</td> <td>Default site:</td>
<td> <td>
<span class="mono">http://</span>
<span class="mono">https://</span>
<select name="lang"> <select name="lang">
<% selected_lang = g.cookies["CopyviosDefaultLang"].value if "CopyviosDefaultLang" in g.cookies else default_lang %>\ <% selected_lang = g.cookies["CopyviosDefaultLang"].value if "CopyviosDefaultLang" in g.cookies else default_lang %>\
% for code, name in langs: % for code, name in langs:


+ 1
- 0
templates/support/header.mako View File

@@ -9,6 +9,7 @@
<meta charset="utf-8"> <meta charset="utf-8">
<title>${title}</title> <title>${title}</title>
<link rel="stylesheet" href="${request.script_root}/static/style.min.css" type="text/css" /> <link rel="stylesheet" href="${request.script_root}/static/style.min.css" type="text/css" />
<script src="//code.jquery.com/jquery-1.11.1.min.js" type="text/javascript"></script>
<script src="${request.script_root}/static/script.min.js" type="text/javascript"></script> <script src="${request.script_root}/static/script.min.js" type="text/javascript"></script>
</head> </head>
<% selected = g.cookies["CopyviosBackground"].value if "CopyviosBackground" in g.cookies else "list" %>\ <% selected = g.cookies["CopyviosBackground"].value if "CopyviosBackground" in g.cookies else "list" %>\


Loading…
Cancel
Save