@@ -1,21 +1,21 @@ | |||
This is a [copyright violation](https://en.wikipedia.org/wiki/WP:COPYVIO) | |||
detector running on [Wikimedia Labs](https://tools.wmflabs.org/copyvios). | |||
detector running on [Wikimedia Cloud Services](https://copyvios.toolforge.org/). | |||
It can search the web for content similar to a given article, and graphically | |||
compare an article to a specific URL. Some technical details are expanded upon | |||
[in a blog post](http://benkurtovic.com/2014/08/20/copyvio-detector.html). | |||
[in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html). | |||
Dependencies | |||
============ | |||
* [earwigbot](https://github.com/earwig/earwigbot) >= 0.1 | |||
* [flask](http://flask.pocoo.org/) >= 0.10.1 | |||
* [flask](https://flask.palletsprojects.com/) >= 0.10.1 | |||
* [flask-mako](https://pythonhosted.org/Flask-Mako/) >= 0.3 | |||
* [mako](http://www.makotemplates.org/) >= 0.7.2 | |||
* [mako](https://www.makotemplates.org/) >= 0.7.2 | |||
* [mwparserfromhell](https://github.com/earwig/mwparserfromhell) >= 0.3 | |||
* [oursql](http://packages.python.org/oursql/) >= 0.9.3.1 | |||
* [requests](http://python-requests.org/) >= 2.9.1 | |||
* [SQLAlchemy](http://sqlalchemy.org/) >= 0.9.6 | |||
* [oursql](https://pythonhosted.org/oursql/) >= 0.9.3.1 | |||
* [requests](https://requests.readthedocs.io/) >= 2.9.1 | |||
* [SQLAlchemy](https://www.sqlalchemy.org/) >= 0.9.6 | |||
* [apsw](https://github.com/rogerbinns/apsw) >= 3.26.0 | |||
* [uglifycss](https://github.com/fmarcia/UglifyCSS/) | |||
* [uglifyjs](https://github.com/mishoo/UglifyJS/) >= 1.3.3 | |||
@@ -25,7 +25,7 @@ Running | |||
- If using Toolforge, you should clone the repository to `~/www/python/src`, or | |||
otherwise symlink it to that directory. A | |||
[virtualenv](http://virtualenv.readthedocs.org/) should be created at | |||
[virtualenv](https://virtualenv.pypa.io/) should be created at | |||
`~/www/python/venv`. | |||
- Install all dependencies listed above. | |||
@@ -58,7 +58,7 @@ def setup_app(): | |||
def prepare_request(): | |||
g._db = None | |||
g.cookies = parse_cookies( | |||
request.script_root, request.environ.get("HTTP_COOKIE")) | |||
request.script_root or "/", request.environ.get("HTTP_COOKIE")) | |||
g.new_cookies = [] | |||
@app.after_request | |||
@@ -8,7 +8,7 @@ from .misc import parse_wiki_timestamp | |||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' | |||
TURNITIN_API_ENDPOINT = 'https://eranbot.toolforge.org/plagiabot/api.py' | |||
def search_turnitin(page_title, lang): | |||
""" Search the Plagiabot database for Turnitin reports for a page. | |||
@@ -2,7 +2,7 @@ | |||
"name" : "copyvios", | |||
"title" : "Copyvios", | |||
"description" : "Detects copyright violations in pages by searching for their contents online. Can also compare a page and a specific URL.", | |||
"url" : "https://tools.wmflabs.org/copyvios", | |||
"url" : "https://copyvios.toolforge.org/", | |||
"keywords" : "copyvios, copyright violations", | |||
"author" : "The Earwig", | |||
"repository" : "https://github.com/earwig/copyvios" | |||
@@ -40,9 +40,9 @@ | |||
% if help: | |||
<div id="help"> | |||
<h1>Copyvio Detector API</h1> | |||
<p>This is the first version of the <a href="//en.wikipedia.org/wiki/Application_programming_interface">API</a> for <a href="${request.script_root}">Earwig's Copyvio Detector</a>. It works, but some bugs might still need to be ironed out, so please <a href="https://github.com/earwig/copyvios/issues">report any</a> if you see them.</p> | |||
<p>This is the first version of the <a href="https://en.wikipedia.org/wiki/Application_programming_interface">API</a> for <a href="${request.script_root}/">Earwig's Copyvio Detector</a>. It works, but some bugs might still need to be ironed out, so please <a href="https://github.com/earwig/copyvios/issues">report any</a> if you see them.</p> | |||
<h2>Requests</h2> | |||
<p>The API responds to GET requests made to <span class="code">https://tools.wmflabs.org/copyvios/api.json</span>. Parameters are described in the tables below:</p> | |||
<p>The API responds to GET requests made to <span class="code">https://copyvios.toolforge.org/api.json</span>. Parameters are described in the tables below:</p> | |||
<table class="parameters"> | |||
<tr> | |||
<th colspan="4">Always</th> | |||
@@ -63,7 +63,7 @@ | |||
<td>format</td> | |||
<td><span class="code">json</span>, <span class="code">jsonfm</span></td> | |||
<td>No (default: <span class="code">json</span>)</td> | |||
<td>The default output format is <a href="http://json.org/">JSON</a>. <span class="code">jsonfm</span> mode produces the same output, but renders it as a formatted HTML document for debugging.</td> | |||
<td>The default output format is <a href="https://www.json.org/">JSON</a>. <span class="code">jsonfm</span> mode produces the same output, but renders it as a formatted HTML document for debugging.</td> | |||
</tr> | |||
<tr> | |||
<td>version</td> | |||
@@ -254,7 +254,7 @@ | |||
<h2>Etiquette</h2> | |||
<p>The tool uses the same workers to handle all requests, so making concurrent API calls is only going to slow you down. Most operations are not rate-limited, but full searches with <span class="code">use_engine=True</span> are globally limited to around a thousand per day. Be respectful!</p> | |||
<h2>Example</h2> | |||
<p><a class="no-color" href="https://tools.wmflabs.org/copyvios/api.json?version=1&action=search&project=wikipedia&lang=en&title=User:EarwigBot/Copyvios/Tests/2"><span class="code">https://tools.wmflabs.org/copyvios/api.json?<span class="param-key">version</span>=<span class="param-val">1</span>&<span class="param-key">action</span>=<span class="param-val">search</span>&<span class="param-key">project</span>=<span class="param-val">wikipedia</span>&<span class="param-key">lang</span>=<span class="param-val">en</span>&<span class="param-key">title</span>=<span class="param-val">User:EarwigBot/Copyvios/Tests/2</span></span></a></p> | |||
<p><a class="no-color" href="https://copyvios.toolforge.org/api.json?version=1&action=search&project=wikipedia&lang=en&title=User:EarwigBot/Copyvios/Tests/2"><span class="code">https://copyvios.toolforge.org/api.json?<span class="param-key">version</span>=<span class="param-val">1</span>&<span class="param-key">action</span>=<span class="param-val">search</span>&<span class="param-key">project</span>=<span class="param-val">wikipedia</span>&<span class="param-key">lang</span>=<span class="param-val">en</span>&<span class="param-key">title</span>=<span class="param-val">User:EarwigBot/Copyvios/Tests/2</span></span></a></p> | |||
<pre>{ | |||
"status": "ok", | |||
"meta": { | |||
@@ -35,11 +35,11 @@ | |||
</p></div> | |||
% elif not query.site: | |||
<div id="info-box" class="red-box"> | |||
<p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="//${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p> | |||
<p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="https://${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p> | |||
</div> | |||
% elif query.oldid and not result: | |||
<div id="info-box" class="red-box"> | |||
<p>The given revision ID doesn't seem to exist: <a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p> | |||
<p>The given revision ID doesn't seem to exist: <a href="https://${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p> | |||
</div> | |||
% elif query.title and not result: | |||
<div id="info-box" class="red-box"> | |||
@@ -47,10 +47,10 @@ | |||
</div> | |||
% endif | |||
%endif | |||
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In <i>search mode</i>, it will check for similar content elsewhere on the web using <a href="https://developers.google.com/custom-search/">Google</a>, external links present in the text of the page, or <a href="//en.wikipedia.org/wiki/Wikipedia:Turnitin">Turnitin</a> (provided by <a href="//en.wikipedia.org/wiki/User:EranBot">EranBot</a>), depending on which options are selected. In <i>comparison mode</i>, the tool will compare the article to a specific webpage without making additional searches, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p> | |||
<p>This tool attempts to detect <a href="https://en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In <i>search mode</i>, it will check for similar content elsewhere on the web using <a href="https://developers.google.com/custom-search/">Google</a>, external links present in the text of the page, or <a href="https://en.wikipedia.org/wiki/Wikipedia:Turnitin">Turnitin</a> (provided by <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>), depending on which options are selected. In <i>comparison mode</i>, the tool will compare the article to a specific webpage without making additional searches, like the <a href="https://dupdet.toolforge.org/">Duplication Detector</a>.</p> | |||
<p>Running a full check can take up to a minute if other websites are slow or if the tool is under heavy use. Please be patient. If you get a timeout, wait a moment and refresh the page.</p> | |||
<p>Be aware that other websites can copy from Wikipedia, so check the results carefully, especially for older or well-developed articles. Specific websites can be skipped by being added to the <a href="//en.wikipedia.org/wiki/User:EarwigBot/Copyvios/Exclusions">excluded URL list</a>.</p> | |||
<form id="cv-form" action="${request.script_root}" method="get"> | |||
<p>Be aware that other websites can copy from Wikipedia, so check the results carefully, especially for older or well-developed articles. Specific websites can be skipped by being added to the <a href="https://en.wikipedia.org/wiki/User:EarwigBot/Copyvios/Exclusions">excluded URL list</a>.</p> | |||
<form id="cv-form" action="${request.script_root}/" method="get"> | |||
<table id="cv-form-outer"> | |||
<tr> | |||
<td>Site:</td> | |||
@@ -165,7 +165,7 @@ | |||
% else: | |||
seconds. | |||
% endif | |||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | |||
<a href="${request.script_root | h}/?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | |||
</div> | |||
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | |||
@@ -179,11 +179,11 @@ | |||
<td> | |||
<a href="${query.page.url}">${query.page.title | h}</a> | |||
% if query.oldid: | |||
@<a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a> | |||
@<a href="https://${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a> | |||
% endif | |||
% if query.redirected_from: | |||
<br /> | |||
<span id="redirected-from">Redirected from <a href="//${query.site.domain | h}/w/index.php?title=${query.redirected_from.title | u}&redirect=no">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&noredirect=1">Check original.</a></span> | |||
<span id="redirected-from">Redirected from <a href="https://${query.site.domain | h}/w/index.php?title=${query.redirected_from.title | u}&redirect=no">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&noredirect=1">Check original.</a></span> | |||
% endif | |||
</td> | |||
<td> | |||
@@ -225,7 +225,7 @@ | |||
% if query.turnitin_result.reports: | |||
<table id="turnitin-table"><tbody> | |||
% for report in turnitin_result.reports: | |||
<tr><td class="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Report ${report.reportid}</a> for text added at <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> ${report.time_posted.strftime("%H:%M, %d %B %Y (UTC)")}</a>: | |||
<tr><td class="turnitin-table-cell"><a href="https://eranbot.toolforge.org/ithenticate.py?rid=${report.reportid}">Report ${report.reportid}</a> for text added at <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> ${report.time_posted.strftime("%H:%M, %d %B %Y (UTC)")}</a>: | |||
<ul> | |||
% for source in report.sources: | |||
<li>${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li> | |||
@@ -269,7 +269,7 @@ | |||
% endif | |||
</td> | |||
<td> | |||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=compare&url=${source.url | u}">Compare</a> | |||
<a href="${request.script_root | h}/?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=compare&url=${source.url | u}">Compare</a> | |||
</td> | |||
</tr> | |||
% endfor | |||
@@ -42,7 +42,7 @@ | |||
</tr> | |||
<% | |||
background_options = [ | |||
("list", 'Randomly select from <a href="http://commons.wikimedia.org/wiki/User:The_Earwig/POTD">a subset</a> of previous <a href="//commons.wikimedia.org/">Wikimedia Commons</a> <a href="//commons.wikimedia.org/wiki/Commons:Picture_of_the_day">Pictures of the Day</a> that work well as widescreen backgrounds, refreshed daily (default).'), | |||
("list", 'Randomly select from <a href="https://commons.wikimedia.org/wiki/User:The_Earwig/POTD">a subset</a> of previous <a href="https://commons.wikimedia.org/">Wikimedia Commons</a> <a href="https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day">Pictures of the Day</a> that work well as widescreen backgrounds, refreshed daily (default).'), | |||
("potd", 'Use the current Commons Picture of the Day, unfiltered. Certain POTDs may be unsuitable as backgrounds due to their aspect ratio or subject matter.'), | |||
("plain", "Use a plain background."), | |||
] | |||
@@ -4,13 +4,13 @@ | |||
%>\ | |||
</div> | |||
<div id="footer"> | |||
<p>Copyright © 2009–${datetime.now().year} <a href="//en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> • \ | |||
<p>Copyright © 2009–${datetime.now().year} <a href="https://en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> • \ | |||
<a href="${request.script_root}/api">API</a> • \ | |||
<a href="https://github.com/earwig/copyvios">Source Code</a> • \ | |||
% if ("CopyviosBackground" in g.cookies and g.cookies["CopyviosBackground"].value in ["potd", "list"]) or "CopyviosBackground" not in g.cookies: | |||
<a href="${g.descurl | h}">Background</a> • \ | |||
% endif | |||
<a href="http://validator.w3.org/check?uri=referer">Valid HTML5</a> | |||
<a href="https://validator.w3.org/check?uri=referer">Valid HTML5</a> | |||
</p> | |||
</div> | |||
</body> | |||
@@ -21,7 +21,7 @@ | |||
<div id="header"> | |||
<table id="heading"> | |||
<tr> | |||
<td id="head-home"><a id="a-home" href="${request.script_root}">Earwig's Copyvio Detector</a></td> | |||
<td id="head-home"><a id="a-home" href="${request.script_root}/">Earwig's Copyvio Detector</a></td> | |||
<td id="head-settings"><a id="a-settings" href="${request.script_root}/settings">Settings</a></td> | |||
</tr> | |||
</table> | |||