@@ -2,6 +2,7 @@ | |||
# -*- coding: utf-8 -*- | |||
from functools import wraps | |||
from json import dumps | |||
from logging import DEBUG, INFO, getLogger | |||
from logging.handlers import TimedRotatingFileHandler | |||
from time import asctime | |||
@@ -9,10 +10,11 @@ from traceback import format_exc | |||
from earwigbot.bot import Bot | |||
from earwigbot.wiki.copyvios import globalize | |||
from flask import Flask, g, request | |||
from flask import Flask, g, make_response, request | |||
from flask.ext.mako import MakoTemplates, render_template, TemplateError | |||
from flup.server.fcgi import WSGIServer | |||
from copyvios.api import format_api_error, handle_api_request | |||
from copyvios.checker import do_check | |||
from copyvios.cookies import parse_cookies | |||
from copyvios.settings import process_settings | |||
@@ -85,5 +87,26 @@ def settings(): | |||
"default_lang": default.lang, "default_project": default.project} | |||
return render_template("settings.mako", **kwargs) | |||
@app.route("/api.json")
def api():
    """Serve the API endpoint.

    With no query parameters the API help page is rendered. Otherwise the
    request is handed to ``handle_api_request()``; any exception it raises
    is converted into an ``unhandled_exception`` error payload. The
    ``format`` parameter (default ``"json"``) selects the output: ``json``
    returns the payload as an ``application/json`` response, ``jsonfm``
    renders it through ``api.mako``, and anything else yields an
    ``unknown_format`` error payload returned as JSON.
    """
    if not request.args:
        return render_template("api.mako", help=True)

    fmt = request.args.get("format", "json")
    if fmt not in ("json", "jsonfm"):
        errmsg = u"Unknown format: '{0}'".format(fmt)
        result = format_api_error("unknown_format", errmsg)
    else:
        try:
            result = handle_api_request()
        except Exception as exc:
            # Report the failure through the API instead of a server error.
            result = format_api_error("unhandled_exception", exc)

    if fmt == "jsonfm":
        return render_template("api.mako", help=False, result=result)

    resp = make_response(dumps(result))
    resp.mimetype = "application/json"
    return resp
if __name__ == "__main__":
    # Run the Flask application under flup's FastCGI WSGI server when this
    # module is executed as a script.
    server = WSGIServer(app)
    server.run()
@@ -0,0 +1,103 @@ | |||
# -*- coding: utf-8 -*- | |||
from .checker import do_check, T_POSSIBLE, T_SUSPECT | |||
from .misc import Query | |||
from .sites import get_sites | |||
__all__ = ["format_api_error", "handle_api_request"] | |||
# Dispatch table mapping the lowercased "action" query parameter to the hook
# that handles it. The hook functions are defined further down in this
# module, so each entry is wrapped in a lambda that looks the hook up when it
# is called rather than when this table is built; naming the functions
# directly here would raise a NameError as soon as the module is imported.
_HOOKS = {
    "compare": lambda query: _hook_check(query),
    "search": lambda query: _hook_check(query),
    "sites": lambda query: _hook_sites(query),
}
# Human-readable explanations for the error identifiers that a check can
# leave in ``query.error``; looked up by ``_hook_check`` when building an
# error response.
_CHECK_ERRORS = {
    "no search method": (
        "Either 'use_engine' or 'use_links' must be true"
    ),
    "no URL": (
        "The parameter 'url' is required for URL comparisons"
    ),
    "bad URI": (
        "The given URI scheme is unsupported"
    ),
    "no data": (
        "No text could be found in the given URL "
        "(note that only HTML and plain text pages are supported, "
        "and content generated by JavaScript or found inside iframes "
        "is ignored)"
    ),
    "timeout": (
        "The given URL timed out before any data could be retrieved"
    ),
    "search error": (
        "An error occurred while using the search engine; "
        "try reloading or setting 'use_engine' to 0"
    ),
}
def _serialize_page(page):
    """Return a dict with the ``title`` and ``url`` attributes of *page*."""
    return dict(title=page.title, url=page.url)
def _serialize_source(source, show_skip=True):
    """Convert a check source into a plain dict for the API response.

    A falsy *source* produces a placeholder entry with no URL, zero
    confidence and a violation level of ``"none"``. Otherwise the violation
    level is ``"suspected"`` when the source's confidence is at least
    ``T_SUSPECT``, ``"possible"`` when it is at least ``T_POSSIBLE``, and
    ``"none"`` below that. When *show_skip* is true, the source's
    ``skipped`` attribute is included as well.
    """
    if not source:
        return {"url": None, "confidence": 0.0, "violation": "none"}

    confidence = source.confidence
    if confidence >= T_SUSPECT:
        violation = "suspected"
    elif confidence >= T_POSSIBLE:
        violation = "possible"
    else:
        violation = "none"

    serialized = {
        "url": source.url,
        "confidence": confidence,
        "violation": violation,
    }
    if show_skip:
        serialized["skipped"] = source.skipped
    return serialized
def format_api_error(code, info):
    """Build the standard API error payload.

    *code* is the machine-readable error identifier. *info* is the
    explanation: an exception instance is rendered as
    ``"<ExceptionClass>: <message>"``, a ``unicode`` string is encoded to
    UTF-8 bytes, and any other value is passed through untouched.

    Returns ``{"status": "error", "error": {"code": ..., "info": ...}}``.
    """
    if isinstance(info, BaseException):
        message = "{0}: {1}".format(type(info).__name__, str(info))
    elif isinstance(info, unicode):
        message = info.encode("utf8")
    else:
        message = info
    error = {"code": code, "info": message}
    return {"status": "error", "error": error}
def handle_api_request():
    """Dispatch the current API request to the hook for its action.

    A fresh ``Query`` is created and its ``action`` is lowercased (a missing
    action is treated as the empty string). The matching entry in ``_HOOKS``
    is called with the query; unknown actions fall back to
    ``_hook_default``. The hook's return value is returned unchanged.
    """
    query = Query()
    action = ""
    if query.action:
        action = query.action.lower()
    hook = _HOOKS.get(action, _hook_default)
    return hook(query)
def _hook_default(query):
    """Return the ``unknown_action`` error payload for *query*.

    This is the fallback hook used when the requested action has no entry in
    the dispatch table, which includes requests that give no action at all.
    """
    # handle_api_request() treats a falsy query.action as "" and still ends
    # up here, so the attribute cannot be assumed to be a string; guard it
    # the same way instead of calling .lower() on it unconditionally.
    action = query.action.lower() if query.action else ""
    info = u"Unknown action: '{0}'".format(action)
    return format_api_error("unknown_action", info)
def _hook_check(query):
    """Run a check for *query* and serialize its outcome for the API.

    ``do_check(query)`` is called first; the attributes it leaves on *query*
    are then inspected in order, and the first failing condition is returned
    as an error payload:

    - ``missing_params`` when the query was not submitted;
    - the query's own error identifier (spaces replaced by underscores)
      when ``query.error`` is set;
    - ``bad_site`` when no site was resolved;
    - ``bad_oldid`` or ``bad_title`` when there is no result.

    On success a dict with ``status``, ``meta``, ``page``, ``best`` and
    ``sources`` is returned. ``meta`` gains ``cache_time`` for cached
    results, and ``original_page`` is added when the query was redirected.
    """
    do_check(query)

    if not query.submitted:
        info = ("The query parameters 'project', 'lang', and either 'title' "
                "or 'oldid' are required for checks")
        return format_api_error("missing_params", info)

    if query.error:
        info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
        code = query.error.replace(" ", "_")
        return format_api_error(code, info)

    if not query.site:
        template = (u"The given site (project={0}, lang={1}) either doesn't "
                    u"exist, is closed, or is private")
        info = template.format(query.project, query.lang)
        return format_api_error("bad_site", info)

    if not query.result:
        if query.oldid:
            template = u"The given revision ID doesn't seem to exist: {0}"
            return format_api_error("bad_oldid", template.format(query.oldid))
        template = u"The given page doesn't seem to exist: {0}"
        return format_api_error("bad_title", template.format(query.page.title))

    result = query.result
    meta = {
        "time": result.time,
        "queries": result.queries,
        "cached": result.cached,
        "redirected": bool(query.redirected_from),
    }
    data = {
        "status": "ok",
        "meta": meta,
        "page": _serialize_page(query.page),
        # The best match never reports whether it was skipped.
        "best": _serialize_source(result.best, show_skip=False),
        "sources": [_serialize_source(src) for src in result.sources],
    }
    if result.cached:
        meta["cache_time"] = result.cache_time
    if query.redirected_from:
        data["original_page"] = _serialize_page(query.redirected_from)
    return data
def _hook_sites(query):
    """Return the language and project lists reported by ``get_sites()``.

    *query* is accepted so the hook has the same signature as the other
    hooks, but it is not used.
    """
    langs, projects = get_sites()
    return dict(status="ok", langs=langs, projects=projects)
@@ -17,8 +17,9 @@ __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | |||
T_POSSIBLE = 0.4 | |||
T_SUSPECT = 0.75 | |||
def do_check(): | |||
query = Query() | |||
def do_check(query=None): | |||
if not query: | |||
query = Query() | |||
if query.lang: | |||
query.lang = query.orig_lang = query.lang.lower() | |||
if "::" in query.lang: | |||
@@ -7,7 +7,7 @@ from markupsafe import escape | |||
__all__ = ["highlight_delta"] | |||
def highlight_delta(context, chain, delta=None): | |||
def highlight_delta(context, chain, delta): | |||
degree = chain.degree - 1 | |||
highlights = [False] * degree | |||
block = [chain.START] * degree | |||
@@ -26,7 +26,7 @@ def highlight_delta(context, chain, delta=None): | |||
i = degree | |||
numwords = len(chain.text.split()) | |||
processed = [] | |||
result = [] | |||
paragraphs = chain.text.split("\n") | |||
while paragraphs: | |||
words = [] | |||
@@ -39,10 +39,10 @@ def highlight_delta(context, chain, delta=None): | |||
words.append(_highlight_word(word, before, after, first, last)) | |||
else: | |||
words.append(unicode(escape(word))) | |||
processed.append(u" ".join(words)) | |||
result.append(u" ".join(words)) | |||
i += 1 | |||
return u"<br /><br />".join(processed) | |||
return u"<br /><br />".join(result) | |||
def _get_next(paragraphs): | |||
paragraph = paragraphs.pop(0) | |||
@@ -0,0 +1,3 @@ | |||
/* Render elements marked with the "code" class in a monospace typeface. */
.code { font-family: monospace; }
@@ -0,0 +1,21 @@ | |||
## API page template. The view renders it with help=True for the introductory
## help text, or with help=False and a result for "jsonfm" output.
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>API - Earwig's Copyvio Detector</title>
    <link rel="stylesheet" href="${request.script_root}/static/api.min.css" type="text/css" />
</head>
<body>
## script_root has no trailing slash and is empty when the application is
## mounted at the server root, so a "/" is appended to the home link below;
## an empty href would point back at the current page instead.
% if help:
    <div id="help">
        <p>This is the first version of the <a href="//en.wikipedia.org/wiki/Application_programming_interface">API</a> for <a href="${request.script_root}/">Earwig's Copyvio Detector</a>. It works, but some bugs might still need to be ironed out, so please <a href="https://github.com/earwig/copyvios/issues">report any</a> if you see them.</p>
    </div>
% endif
% if result:
    <div id="result">
        <p>You are using <span class="code">jsonfm</span> output mode, which renders JSON data as a formatted HTML document. This is intended for testing and debugging only.</p>
        <!-- walk tree -->
    </div>
% endif
</body>
</html>
@@ -30,14 +30,14 @@ | |||
<div id="info-box" class="red-box"> | |||
<p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="//${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p> | |||
</div> | |||
% elif query.title and not result: | |||
<div id="info-box" class="red-box"> | |||
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p> | |||
</div> | |||
% elif query.oldid and not result: | |||
<div id="info-box" class="red-box"> | |||
<p>The given revision ID doesn't seem to exist: <a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p> | |||
</div> | |||
% elif query.title and not result: | |||
<div id="info-box" class="red-box"> | |||
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p> | |||
</div> | |||
% endif | |||
%endif | |||
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In search mode, it will check for similar content elsewhere on the web using <a href="//developer.yahoo.com/boss/search/">Yahoo! BOSS</a> and/or external links present in the text of the page, depending on which options are selected. In comparison mode, the tool will skip the searching step and display a report comparing the article to the given webpage, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p> | |||
@@ -1,8 +1,9 @@ | |||
<%! from flask import g %>\ | |||
<%! from flask import g, request %>\ | |||
</div> | |||
<div id="footer"> | |||
<p>Copyright © 2009–2014 <a href="//en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> • \ | |||
<a href="https://github.com/earwig/copyvios">View Source</a> • \ | |||
<a href="${request.script_root}/api.json">API</a> • \ | |||
<a href="https://github.com/earwig/copyvios">Source Code</a> • \ | |||
% if ("CopyviosBackground" in g.cookies and g.cookies["CopyviosBackground"].value in ["potd", "list"]) or "CopyviosBackground" not in g.cookies: | |||
<a href="${g.descurl | h}">Background</a> • \ | |||
% endif | |||