From 6639338fa85d387b343603b9342e0095104ca4d2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 12 Sep 2014 23:25:04 -0500 Subject: [PATCH] First version of new API (for #7) --- app.fcgi | 25 +++++++++- copyvios/api.py | 103 ++++++++++++++++++++++++++++++++++++++++++ copyvios/checker.py | 5 +- copyvios/highlighter.py | 8 ++-- static/api.css | 3 ++ templates/api.mako | 21 +++++++++ templates/index.mako | 8 ++-- templates/support/footer.mako | 5 +- 8 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 copyvios/api.py create mode 100644 static/api.css create mode 100644 templates/api.mako diff --git a/app.fcgi b/app.fcgi index ebef1c7..3327910 100755 --- a/app.fcgi +++ b/app.fcgi @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- from functools import wraps +from json import dumps from logging import DEBUG, INFO, getLogger from logging.handlers import TimedRotatingFileHandler from time import asctime @@ -9,10 +10,11 @@ from traceback import format_exc from earwigbot.bot import Bot from earwigbot.wiki.copyvios import globalize -from flask import Flask, g, request +from flask import Flask, g, make_response, request from flask.ext.mako import MakoTemplates, render_template, TemplateError from flup.server.fcgi import WSGIServer +from copyvios.api import format_api_error, handle_api_request from copyvios.checker import do_check from copyvios.cookies import parse_cookies from copyvios.settings import process_settings @@ -85,5 +87,26 @@ def settings(): "default_lang": default.lang, "default_project": default.project} return render_template("settings.mako", **kwargs) +@app.route("/api.json") +def api(): + if not request.args: + return render_template("api.mako", help=True) + + format = request.args.get("format", "json") + if format in ["json", "jsonfm"]: + try: + result = handle_api_request() + except Exception as exc: + result = format_api_error("unhandled_exception", exc) + else: + errmsg = u"Unknown format: '{0}'".format(format) + result = 
format_api_error("unknown_format", errmsg) + + if format == "jsonfm": + return render_template("api.mako", help=False, result=result) + resp = make_response(dumps(result)) + resp.mimetype = "application/json" + return resp + if __name__ == '__main__': WSGIServer(app).run() diff --git a/copyvios/api.py b/copyvios/api.py new file mode 100644 index 0000000..93f9964 --- /dev/null +++ b/copyvios/api.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +from .checker import do_check, T_POSSIBLE, T_SUSPECT +from .misc import Query +from .sites import get_sites + +__all__ = ["format_api_error", "handle_api_request"] + +_HOOKS = { + "compare": _hook_check, + "search": _hook_check, + "sites": _hook_sites, +} + +_CHECK_ERRORS = { + "no search method": "Either 'use_engine' or 'use_links' must be true", + "no URL": "The parameter 'url' is required for URL comparisons", + "bad URI": "The given URI scheme is unsupported", + "no data": "No text could be found in the given URL (note that only HTML " + "and plain text pages are supported, and content generated by " + "JavaScript or found inside iframes is ignored)", + "timeout": "The given URL timed out before any data could be retrieved", + "search error": "An error occurred while using the search engine; try " + "reloading or setting 'use_engine' to 0", +} + +def _serialize_page(page): + return {"title": page.title, "url": page.url} + +def _serialize_source(source, show_skip=True): + if not source: + return {"url": None, "confidence": 0.0, "violation": "none"} + + conf = source.confidence + data = { + "url": source.url, + "confidence": conf, + "violation": "suspected" if conf >= T_SUSPECT else + "possible" if conf >= T_POSSIBLE else "none" + } + if show_skip: + data["skipped"] = source.skipped + return data + +def format_api_error(code, info): + if isinstance(info, BaseException): + info = type(info).__name__ + ": " + str(info) + elif isinstance(info, unicode): + info = info.encode("utf8") + return {"status": "error", "error": {"code": 
code, "info": info}} + +def handle_api_request(): + query = Query() + action = query.action.lower() if query.action else "" + return _HOOKS.get(action, _hook_default)(query) + +def _hook_default(query): + info = u"Unknown action: '{0}'".format(query.action.lower()) + return format_api_error("unknown_action", info) + +def _hook_check(query): + do_check(query) + if not query.submitted: + info = ("The query parameters 'project', 'lang', and either 'title' " + "or 'oldid' are required for checks") + return format_api_error("missing_params", info) + if query.error: + info = _CHECK_ERRORS.get(query.error, "An unknown error occurred") + return format_api_error(query.error.replace(" ", "_"), info) + elif not query.site: + info = (u"The given site (project={0}, lang={1}) either doesn't exist," + u" is closed, or is private").format(query.project, query.lang) + return format_api_error("bad_site", info) + elif not query.result: + if query.oldid: + info = u"The given revision ID doesn't seem to exist: {0}" + return format_api_error("bad_oldid", info.format(query.oldid)) + else: + info = u"The given page doesn't seem to exist: {0}" + return format_api_error("bad_title", info.format(query.page.title)) + + result = query.result + data = { + "status": "ok", + "meta": { + "time": result.time, + "queries": result.queries, + "cached": result.cached, + "redirected": bool(query.redirected_from) + }, + "page": _serialize_page(query.page), + "best": _serialize_source(result.best, show_skip=False), + "sources": [_serialize_source(source) for source in result.sources] + } + if result.cached: + data["meta"]["cache_time"] = result.cache_time + if query.redirected_from: + data["original_page"] = _serialize_page(query.redirected_from) + return data + +def _hook_sites(query): + langs, projects = get_sites() + return {"status": "ok", "langs": langs, "projects": projects} diff --git a/copyvios/checker.py b/copyvios/checker.py index 084f8dd..e961890 100644 --- a/copyvios/checker.py +++ 
b/copyvios/checker.py @@ -17,8 +17,9 @@ __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] T_POSSIBLE = 0.4 T_SUSPECT = 0.75 -def do_check(): - query = Query() +def do_check(query=None): + if not query: + query = Query() if query.lang: query.lang = query.orig_lang = query.lang.lower() if "::" in query.lang: diff --git a/copyvios/highlighter.py b/copyvios/highlighter.py index ca53357..a6abcd0 100644 --- a/copyvios/highlighter.py +++ b/copyvios/highlighter.py @@ -7,7 +7,7 @@ from markupsafe import escape __all__ = ["highlight_delta"] -def highlight_delta(context, chain, delta=None): +def highlight_delta(context, chain, delta): degree = chain.degree - 1 highlights = [False] * degree block = [chain.START] * degree @@ -26,7 +26,7 @@ def highlight_delta(context, chain, delta=None): i = degree numwords = len(chain.text.split()) - processed = [] + result = [] paragraphs = chain.text.split("\n") while paragraphs: words = [] @@ -39,10 +39,10 @@ def highlight_delta(context, chain, delta=None): words.append(_highlight_word(word, before, after, first, last)) else: words.append(unicode(escape(word))) - processed.append(u" ".join(words)) + result.append(u" ".join(words)) i += 1 - return u"

".join(processed) + return u"

".join(result) def _get_next(paragraphs): paragraph = paragraphs.pop(0) diff --git a/static/api.css b/static/api.css new file mode 100644 index 0000000..2d6858d --- /dev/null +++ b/static/api.css @@ -0,0 +1,3 @@ +.code { + font-family: monospace; +} diff --git a/templates/api.mako b/templates/api.mako new file mode 100644 index 0000000..589c8e6 --- /dev/null +++ b/templates/api.mako @@ -0,0 +1,21 @@ + + + + + API - Earwig's Copyvio Detector + + + + % if help: +
+

This is the first version of the API for Earwig's Copyvio Detector. It works, but some bugs might still need to be ironed out, so please report any that you find.

+
+ % endif + % if result: +
+

You are using jsonfm output mode, which renders JSON data as a formatted HTML document. This is intended for testing and debugging only.

+ +
+ % endif + + diff --git a/templates/index.mako b/templates/index.mako index 7c7f1dd..7536677 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -30,14 +30,14 @@

The given site (project=${query.project | h}, language=${query.lang | h}) doesn't seem to exist. It may also be closed or private. Confirm its URL.

- % elif query.title and not result: -
-

The given page doesn't seem to exist: ${query.page.title | h}.

-
% elif query.oldid and not result:

The given revision ID doesn't seem to exist: ${query.oldid | h}.

+ % elif query.title and not result: +
+

The given page doesn't seem to exist: ${query.page.title | h}.

+
% endif %endif

This tool attempts to detect copyright violations in articles. In search mode, it will check for similar content elsewhere on the web using Yahoo! BOSS and/or external links present in the text of the page, depending on which options are selected. In comparison mode, the tool will skip the searching step and display a report comparing the article to the given webpage, like the Duplication Detector.

diff --git a/templates/support/footer.mako b/templates/support/footer.mako index 58060b1..b648ad7 100644 --- a/templates/support/footer.mako +++ b/templates/support/footer.mako @@ -1,8 +1,9 @@ -<%! from flask import g %>\ +<%! from flask import g, request %>\