@@ -2,6 +2,7 @@ | |||||
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||
from functools import wraps | from functools import wraps | ||||
from json import dumps | |||||
from logging import DEBUG, INFO, getLogger | from logging import DEBUG, INFO, getLogger | ||||
from logging.handlers import TimedRotatingFileHandler | from logging.handlers import TimedRotatingFileHandler | ||||
from time import asctime | from time import asctime | ||||
@@ -9,10 +10,11 @@ from traceback import format_exc | |||||
from earwigbot.bot import Bot | from earwigbot.bot import Bot | ||||
from earwigbot.wiki.copyvios import globalize | from earwigbot.wiki.copyvios import globalize | ||||
from flask import Flask, g, request | |||||
from flask import Flask, g, make_response, request | |||||
from flask.ext.mako import MakoTemplates, render_template, TemplateError | from flask.ext.mako import MakoTemplates, render_template, TemplateError | ||||
from flup.server.fcgi import WSGIServer | from flup.server.fcgi import WSGIServer | ||||
from copyvios.api import format_api_error, handle_api_request | |||||
from copyvios.checker import do_check | from copyvios.checker import do_check | ||||
from copyvios.cookies import parse_cookies | from copyvios.cookies import parse_cookies | ||||
from copyvios.settings import process_settings | from copyvios.settings import process_settings | ||||
@@ -85,5 +87,26 @@ def settings(): | |||||
"default_lang": default.lang, "default_project": default.project} | "default_lang": default.lang, "default_project": default.project} | ||||
return render_template("settings.mako", **kwargs) | return render_template("settings.mako", **kwargs) | ||||
@app.route("/api.json")
def api():
    """Serve the API endpoint.

    With no query arguments at all, the API help page is rendered. Otherwise
    the ``format`` argument (default ``"json"``) selects the output mode:
    ``"json"`` returns the result as an ``application/json`` response and
    ``"jsonfm"`` renders it through the ``api.mako`` template. Any other
    value produces an ``unknown_format`` error in JSON form.
    """
    if not request.args:
        return render_template("api.mako", help=True)

    fmt = request.args.get("format", "json")
    if fmt not in ("json", "jsonfm"):
        errmsg = u"Unknown format: '{0}'".format(fmt)
        result = format_api_error("unknown_format", errmsg)
    else:
        # Top-level boundary: any failure inside the handler is reported to
        # the client as a structured error instead of a server error page.
        try:
            result = handle_api_request()
        except Exception as exc:
            result = format_api_error("unhandled_exception", exc)

    if fmt == "jsonfm":
        return render_template("api.mako", help=False, result=result)

    response = make_response(dumps(result))
    response.mimetype = "application/json"
    return response
if __name__ == '__main__': | if __name__ == '__main__': | ||||
WSGIServer(app).run() | WSGIServer(app).run() |
@@ -0,0 +1,103 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from .checker import do_check, T_POSSIBLE, T_SUSPECT | |||||
from .misc import Query | |||||
from .sites import get_sites | |||||
__all__ = ["format_api_error", "handle_api_request"] | |||||
# Maps each supported ``action`` value to the hook that handles it. The hook
# functions are defined further down in this module, so each entry is wrapped
# in a lambda to defer the name lookup until the hook is actually called;
# referencing the functions directly here would raise NameError while the
# module is being imported.
_HOOKS = {
    "compare": lambda query: _hook_check(query),
    "search": lambda query: _hook_check(query),
    "sites": lambda query: _hook_sites(query),
}
# Human-readable explanations for the error identifiers that a check can
# leave in ``query.error``; looked up by _hook_check when building the API
# error response.
_CHECK_ERRORS = dict([
    ("no search method",
     "Either 'use_engine' or 'use_links' must be true"),
    ("no URL",
     "The parameter 'url' is required for URL comparisons"),
    ("bad URI",
     "The given URI scheme is unsupported"),
    ("no data",
     "No text could be found in the given URL (note that only HTML and "
     "plain text pages are supported, and content generated by JavaScript "
     "or found inside iframes is ignored)"),
    ("timeout",
     "The given URL timed out before any data could be retrieved"),
    ("search error",
     "An error occurred while using the search engine; try reloading or "
     "setting 'use_engine' to 0"),
])
def _serialize_page(page):
    """Return a JSON-ready dict holding the page's ``title`` and ``url``."""
    return dict(title=page.title, url=page.url)
def _serialize_source(source, show_skip=True):
    """Return a JSON-ready dict describing one checked source.

    A falsy *source* yields a placeholder entry with no URL, zero confidence
    and a ``"none"`` violation level. Otherwise the violation level is
    derived from the source's confidence using the T_SUSPECT and T_POSSIBLE
    thresholds. When *show_skip* is true, the source's ``skipped`` flag is
    included as well.
    """
    if not source:
        return {"url": None, "confidence": 0.0, "violation": "none"}

    confidence = source.confidence
    serialized = {"url": source.url, "confidence": confidence}

    if confidence >= T_SUSPECT:
        serialized["violation"] = "suspected"
    elif confidence >= T_POSSIBLE:
        serialized["violation"] = "possible"
    else:
        serialized["violation"] = "none"

    if show_skip:
        serialized["skipped"] = source.skipped
    return serialized
def format_api_error(code, info):
    """Build the standard API error response.

    *code* is the error identifier. *info* is the explanation: an exception
    is rendered as ``"<ExceptionType>: <message>"``, a unicode string is
    encoded to UTF-8 bytes, and anything else is passed through unchanged.
    """
    if isinstance(info, BaseException):
        exc_name = type(info).__name__
        info = exc_name + ": " + str(info)
    elif isinstance(info, unicode):
        info = info.encode("utf8")

    error = {"code": code, "info": info}
    return {"status": "error", "error": error}
def handle_api_request():
    """Dispatch the current request to the hook matching its ``action``.

    The action is compared case-insensitively against the keys of _HOOKS; a
    missing or unrecognized action falls through to _hook_default. Returns
    whatever the selected hook returns.
    """
    query = Query()

    action = ""
    if query.action:
        action = query.action.lower()

    hook = _HOOKS.get(action, _hook_default)
    return hook(query)
def _hook_default(query):
    """Return an ``unknown_action`` error for an unrecognized action.

    This hook is also reached when the request carries no action at all, so
    a falsy ``query.action`` is reported as an empty action name rather than
    being dereferenced (calling ``.lower()`` on a missing value would raise
    AttributeError).
    """
    action = query.action.lower() if query.action else ""
    info = u"Unknown action: '{0}'".format(action)
    return format_api_error("unknown_action", info)
def _hook_check(query):
    """Run a copyvio check for *query* and serialize the outcome.

    Returns an API error dict when required parameters are missing, when the
    check reported an error, or when the site, revision or page could not be
    resolved. Otherwise returns a ``"status": "ok"`` dict with check
    metadata, the checked page, the best source and all sources.
    """
    do_check(query)

    if not query.submitted:
        info = ("The query parameters 'project', 'lang', and either 'title' "
                "or 'oldid' are required for checks")
        return format_api_error("missing_params", info)

    if query.error:
        # The error identifier doubles as the API error code, with spaces
        # replaced by underscores.
        info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
        return format_api_error(query.error.replace(" ", "_"), info)

    if not query.site:
        template = (u"The given site (project={0}, lang={1}) either doesn't "
                    u"exist, is closed, or is private")
        info = template.format(query.project, query.lang)
        return format_api_error("bad_site", info)

    if not query.result:
        if query.oldid:
            template = u"The given revision ID doesn't seem to exist: {0}"
            return format_api_error("bad_oldid", template.format(query.oldid))
        template = u"The given page doesn't seem to exist: {0}"
        return format_api_error("bad_title", template.format(query.page.title))

    result = query.result
    meta = {
        "time": result.time,
        "queries": result.queries,
        "cached": result.cached,
        "redirected": bool(query.redirected_from),
    }
    data = {"status": "ok", "meta": meta}
    data["page"] = _serialize_page(query.page)
    data["best"] = _serialize_source(result.best, show_skip=False)
    data["sources"] = [_serialize_source(src) for src in result.sources]

    # Optional fields are only present when they apply.
    if result.cached:
        meta["cache_time"] = result.cache_time
    if query.redirected_from:
        data["original_page"] = _serialize_page(query.redirected_from)
    return data
def _hook_sites(query):
    """Return the languages and projects reported by get_sites().

    *query* is accepted for signature parity with the other hooks and is not
    used.
    """
    sites = get_sites()
    langs, projects = sites
    return dict(status="ok", langs=langs, projects=projects)
@@ -17,8 +17,9 @@ __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | |||||
T_POSSIBLE = 0.4 | T_POSSIBLE = 0.4 | ||||
T_SUSPECT = 0.75 | T_SUSPECT = 0.75 | ||||
def do_check(): | |||||
query = Query() | |||||
def do_check(query=None): | |||||
if not query: | |||||
query = Query() | |||||
if query.lang: | if query.lang: | ||||
query.lang = query.orig_lang = query.lang.lower() | query.lang = query.orig_lang = query.lang.lower() | ||||
if "::" in query.lang: | if "::" in query.lang: | ||||
@@ -7,7 +7,7 @@ from markupsafe import escape | |||||
__all__ = ["highlight_delta"] | __all__ = ["highlight_delta"] | ||||
def highlight_delta(context, chain, delta=None): | |||||
def highlight_delta(context, chain, delta): | |||||
degree = chain.degree - 1 | degree = chain.degree - 1 | ||||
highlights = [False] * degree | highlights = [False] * degree | ||||
block = [chain.START] * degree | block = [chain.START] * degree | ||||
@@ -26,7 +26,7 @@ def highlight_delta(context, chain, delta=None): | |||||
i = degree | i = degree | ||||
numwords = len(chain.text.split()) | numwords = len(chain.text.split()) | ||||
processed = [] | |||||
result = [] | |||||
paragraphs = chain.text.split("\n") | paragraphs = chain.text.split("\n") | ||||
while paragraphs: | while paragraphs: | ||||
words = [] | words = [] | ||||
@@ -39,10 +39,10 @@ def highlight_delta(context, chain, delta=None): | |||||
words.append(_highlight_word(word, before, after, first, last)) | words.append(_highlight_word(word, before, after, first, last)) | ||||
else: | else: | ||||
words.append(unicode(escape(word))) | words.append(unicode(escape(word))) | ||||
processed.append(u" ".join(words)) | |||||
result.append(u" ".join(words)) | |||||
i += 1 | i += 1 | ||||
return u"<br /><br />".join(processed) | |||||
return u"<br /><br />".join(result) | |||||
def _get_next(paragraphs): | def _get_next(paragraphs): | ||||
paragraph = paragraphs.pop(0) | paragraph = paragraphs.pop(0) | ||||
@@ -0,0 +1,3 @@ | |||||
.code { | |||||
font-family: monospace; | |||||
} |
@@ -0,0 +1,21 @@ | |||||
<!DOCTYPE html> | |||||
<html lang="en"> | |||||
<head> | |||||
<meta charset="utf-8"> | |||||
<title>API - Earwig's Copyvio Detector</title> | |||||
<link rel="stylesheet" href="${request.script_root}/static/api.min.css" type="text/css" /> | |||||
</head> | |||||
<body> | |||||
% if help: | |||||
<div id="help"> | |||||
<p>This is the first version of the <a href="//en.wikipedia.org/wiki/Application_programming_interface">API</a> for <a href="${request.script_root}">Earwig's Copyvio Detector</a>. It works, but some bugs might still need to be ironed out, so please <a href="https://github.com/earwig/copyvios/issues">report any</a> if you see them.</p> | |||||
</div> | |||||
% endif | |||||
% if result: | |||||
<div id="result"> | |||||
<p>You are using <span class="code">jsonfm</span> output mode, which renders JSON data as a formatted HTML document. This is intended for testing and debugging only.</p> | |||||
<!-- walk tree --> | |||||
</div> | |||||
% endif | |||||
</body> | |||||
</html> |
@@ -30,14 +30,14 @@ | |||||
<div id="info-box" class="red-box"> | <div id="info-box" class="red-box"> | ||||
<p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="//${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p> | <p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="//${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p> | ||||
</div> | </div> | ||||
% elif query.title and not result: | |||||
<div id="info-box" class="red-box"> | |||||
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p> | |||||
</div> | |||||
% elif query.oldid and not result: | % elif query.oldid and not result: | ||||
<div id="info-box" class="red-box"> | <div id="info-box" class="red-box"> | ||||
<p>The given revision ID doesn't seem to exist: <a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p> | <p>The given revision ID doesn't seem to exist: <a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p> | ||||
</div> | </div> | ||||
% elif query.title and not result: | |||||
<div id="info-box" class="red-box"> | |||||
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p> | |||||
</div> | |||||
% endif | % endif | ||||
%endif | %endif | ||||
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In search mode, it will check for similar content elsewhere on the web using <a href="//developer.yahoo.com/boss/search/">Yahoo! BOSS</a> and/or external links present in the text of the page, depending on which options are selected. In comparison mode, the tool will skip the searching step and display a report comparing the article to the given webpage, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p> | <p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In search mode, it will check for similar content elsewhere on the web using <a href="//developer.yahoo.com/boss/search/">Yahoo! BOSS</a> and/or external links present in the text of the page, depending on which options are selected. In comparison mode, the tool will skip the searching step and display a report comparing the article to the given webpage, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p> | ||||
@@ -1,8 +1,9 @@ | |||||
<%! from flask import g %>\ | |||||
<%! from flask import g, request %>\ | |||||
</div> | </div> | ||||
<div id="footer"> | <div id="footer"> | ||||
<p>Copyright © 2009–2014 <a href="//en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> • \ | <p>Copyright © 2009–2014 <a href="//en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> • \ | ||||
<a href="https://github.com/earwig/copyvios">View Source</a> • \ | |||||
<a href="${request.script_root}/api.json">API</a> • \ | |||||
<a href="https://github.com/earwig/copyvios">Source Code</a> • \ | |||||
% if ("CopyviosBackground" in g.cookies and g.cookies["CopyviosBackground"].value in ["potd", "list"]) or "CopyviosBackground" not in g.cookies: | % if ("CopyviosBackground" in g.cookies and g.cookies["CopyviosBackground"].value in ["potd", "list"]) or "CopyviosBackground" not in g.cookies: | ||||
<a href="${g.descurl | h}">Background</a> • \ | <a href="${g.descurl | h}">Background</a> • \ | ||||
% endif | % endif | ||||