Browse Source

First version of new API (for #7)

pull/24/head
Ben Kurtovic 9 years ago
parent
commit
6639338fa8
8 changed files with 165 additions and 13 deletions
  1. +24
    -1
      app.fcgi
  2. +103
    -0
      copyvios/api.py
  3. +3
    -2
      copyvios/checker.py
  4. +4
    -4
      copyvios/highlighter.py
  5. +3
    -0
      static/api.css
  6. +21
    -0
      templates/api.mako
  7. +4
    -4
      templates/index.mako
  8. +3
    -2
      templates/support/footer.mako

+ 24
- 1
app.fcgi View File

@@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-

from functools import wraps
from json import dumps
from logging import DEBUG, INFO, getLogger
from logging.handlers import TimedRotatingFileHandler
from time import asctime
@@ -9,10 +10,11 @@ from traceback import format_exc

from earwigbot.bot import Bot
from earwigbot.wiki.copyvios import globalize
from flask import Flask, g, request
from flask import Flask, g, make_response, request
from flask.ext.mako import MakoTemplates, render_template, TemplateError
from flup.server.fcgi import WSGIServer

from copyvios.api import format_api_error, handle_api_request
from copyvios.checker import do_check
from copyvios.cookies import parse_cookies
from copyvios.settings import process_settings
@@ -85,5 +87,26 @@ def settings():
"default_lang": default.lang, "default_project": default.project}
return render_template("settings.mako", **kwargs)

@app.route("/api.json")
def api():
    """Serve the API endpoint.

    A request without any query parameters gets the API help page. Otherwise
    the request is dispatched to the API handler and the result is returned
    either as an ``application/json`` response (``format=json``, the default)
    or rendered into the ``api.mako`` template (``format=jsonfm``). Any other
    format, or any exception raised by the handler, is reported as an API
    error object rather than propagated.
    """
    params = request.args
    if not params:
        return render_template("api.mako", help=True)

    fmt = params.get("format", "json")
    if fmt not in ("json", "jsonfm"):
        message = u"Unknown format: '{0}'".format(fmt)
        payload = format_api_error("unknown_format", message)
    else:
        # Top-level boundary: turn any failure into a structured API error.
        try:
            payload = handle_api_request()
        except Exception as err:
            payload = format_api_error("unhandled_exception", err)

    if fmt == "jsonfm":
        return render_template("api.mako", help=False, result=payload)

    response = make_response(dumps(payload))
    response.mimetype = "application/json"
    return response

if __name__ == '__main__':
WSGIServer(app).run()

+ 103
- 0
copyvios/api.py View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

from .checker import do_check, T_POSSIBLE, T_SUSPECT
from .misc import Query
from .sites import get_sites

__all__ = ["format_api_error", "handle_api_request"]

# Maps each supported API action name to the hook that services it. The hook
# functions are defined further down in this module, so every entry is a thin
# lambda that looks the hook up only when it is called; naming the functions
# directly here would raise NameError while the module is being imported.
_HOOKS = {
    "compare": lambda query: _hook_check(query),
    "search": lambda query: _hook_check(query),
    "sites": lambda query: _hook_sites(query),
}

# Human-readable explanations for the error identifiers that _hook_check()
# reads from query.error. The identifier is the key; the value becomes the
# "info" text of the API error. Identifiers missing from this table fall back
# to a generic message at the lookup site.
_CHECK_ERRORS = {
    "no search method": "Either 'use_engine' or 'use_links' must be true",
    "no URL": "The parameter 'url' is required for URL comparisons",
    "bad URI": "The given URI scheme is unsupported",
    "no data": "No text could be found in the given URL (note that only HTML "
    "and plain text pages are supported, and content generated by "
    "JavaScript or found inside iframes is ignored)",
    "timeout": "The given URL timed out before any data could be retrieved",
    "search error": "An error occurred while using the search engine; try "
    "reloading or setting 'use_engine' to 0",
}

def _serialize_page(page):
return {"title": page.title, "url": page.url}

def _serialize_source(source, show_skip=True):
if not source:
return {"url": None, "confidence": 0.0, "violation": "none"}

conf = source.confidence
data = {
"url": source.url,
"confidence": conf,
"violation": "suspected" if conf >= T_SUSPECT else
"possible" if conf >= T_POSSIBLE else "none"
}
if show_skip:
data["skipped"] = source.skipped
return data

def format_api_error(code, info):
    """Build the error object returned by the API.

    *code* is the error identifier. *info* may be an exception, which is
    rendered as "<ExceptionClass>: <message>", or a unicode string, which is
    encoded to UTF-8; any other value is passed through unchanged.
    """
    if isinstance(info, BaseException):
        message = "{0}: {1}".format(type(info).__name__, str(info))
    elif isinstance(info, unicode):
        message = info.encode("utf8")
    else:
        message = info

    error = {"code": code, "info": message}
    return {"status": "error", "error": error}

def handle_api_request():
    """Dispatch the current request to the hook for its ``action``.

    The action name is matched case-insensitively against _HOOKS; a missing
    or unrecognized action is handled by _hook_default. Returns whatever the
    selected hook returns.
    """
    query = Query()
    if query.action:
        action = query.action.lower()
    else:
        action = ""

    hook = _HOOKS.get(action, _hook_default)
    return hook(query)

def _hook_default(query):
    """Return an "unknown_action" API error for an unrecognized action.

    This hook is also reached when the request carries no action at all
    (handle_api_request() maps a falsy ``query.action`` to the empty action
    name, which has no registered hook), so the action must not be assumed
    to be a string here.
    """
    # Guard against a missing action; calling .lower() on it directly would
    # raise AttributeError instead of producing a clean API error.
    action = query.action.lower() if query.action else ""
    info = u"Unknown action: '{0}'".format(action)
    return format_api_error("unknown_action", info)

def _hook_check(query):
    """Run a check for *query* and return the API response dict.

    Calls do_check() on the query, then reports the first problem found as an
    API error: the query was not submitted, the check set an error, the site
    could not be resolved, or there is no result for the requested revision
    or page. On success the response contains check metadata, the checked
    page, the best source and all sources, plus the original page when the
    query was redirected.
    """
    do_check(query)

    if not query.submitted:
        info = ("The query parameters 'project', 'lang', and either 'title' "
                "or 'oldid' are required for checks")
        return format_api_error("missing_params", info)

    if query.error:
        info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
        code = query.error.replace(" ", "_")
        return format_api_error(code, info)

    if not query.site:
        template = (
            u"The given site (project={0}, lang={1}) either doesn't exist,"
            u" is closed, or is private")
        info = template.format(query.project, query.lang)
        return format_api_error("bad_site", info)

    if not query.result:
        # Distinguish a bad revision ID from a bad page title.
        if query.oldid:
            template = u"The given revision ID doesn't seem to exist: {0}"
            return format_api_error("bad_oldid", template.format(query.oldid))
        template = u"The given page doesn't seem to exist: {0}"
        return format_api_error(
            "bad_title", template.format(query.page.title))

    check = query.result
    meta = {
        "time": check.time,
        "queries": check.queries,
        "cached": check.cached,
        "redirected": bool(query.redirected_from),
    }
    response = {
        "status": "ok",
        "meta": meta,
        "page": _serialize_page(query.page),
        "best": _serialize_source(check.best, show_skip=False),
        "sources": [_serialize_source(src) for src in check.sources],
    }
    if check.cached:
        meta["cache_time"] = check.cache_time
    if query.redirected_from:
        response["original_page"] = _serialize_page(query.redirected_from)
    return response

def _hook_sites(query):
    """Return the languages and projects reported by get_sites().

    *query* is accepted for signature compatibility with the other hooks and
    is not used.
    """
    langs, projects = get_sites()
    response = {"status": "ok"}
    response["langs"] = langs
    response["projects"] = projects
    return response

+ 3
- 2
copyvios/checker.py View File

@@ -17,8 +17,9 @@ __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
T_POSSIBLE = 0.4
T_SUSPECT = 0.75

def do_check():
query = Query()
def do_check(query=None):
if not query:
query = Query()
if query.lang:
query.lang = query.orig_lang = query.lang.lower()
if "::" in query.lang:


+ 4
- 4
copyvios/highlighter.py View File

@@ -7,7 +7,7 @@ from markupsafe import escape

__all__ = ["highlight_delta"]

def highlight_delta(context, chain, delta=None):
def highlight_delta(context, chain, delta):
degree = chain.degree - 1
highlights = [False] * degree
block = [chain.START] * degree
@@ -26,7 +26,7 @@ def highlight_delta(context, chain, delta=None):

i = degree
numwords = len(chain.text.split())
processed = []
result = []
paragraphs = chain.text.split("\n")
while paragraphs:
words = []
@@ -39,10 +39,10 @@ def highlight_delta(context, chain, delta=None):
words.append(_highlight_word(word, before, after, first, last))
else:
words.append(unicode(escape(word)))
processed.append(u" ".join(words))
result.append(u" ".join(words))
i += 1

return u"<br /><br />".join(processed)
return u"<br /><br />".join(result)

def _get_next(paragraphs):
paragraph = paragraphs.pop(0)


+ 3
- 0
static/api.css View File

@@ -0,0 +1,3 @@
.code {
font-family: monospace;
}

+ 21
- 0
templates/api.mako View File

@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>API - Earwig's Copyvio Detector</title>
<link rel="stylesheet" href="${request.script_root}/static/api.min.css" type="text/css" />
</head>
<body>
% if help:
<div id="help">
<p>This is the first version of the <a href="//en.wikipedia.org/wiki/Application_programming_interface">API</a> for <a href="${request.script_root}">Earwig's Copyvio Detector</a>. It works, but some bugs might still need to be ironed out, so please <a href="https://github.com/earwig/copyvios/issues">report any</a> if you see them.</p>
</div>
% endif
% if result:
<div id="result">
<p>You are using <span class="code">jsonfm</span> output mode, which renders JSON data as a formatted HTML document. This is intended for testing and debugging only.</p>
<!-- walk tree -->
</div>
% endif
</body>
</html>

+ 4
- 4
templates/index.mako View File

@@ -30,14 +30,14 @@
<div id="info-box" class="red-box">
<p>The given site (project=<b><span class="mono">${query.project | h}</span></b>, language=<b><span class="mono">${query.lang | h}</span></b>) doesn't seem to exist. It may also be closed or private. <a href="//${query.lang | h}.${query.project | h}.org/">Confirm its URL.</a></p>
</div>
% elif query.title and not result:
<div id="info-box" class="red-box">
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p>
</div>
% elif query.oldid and not result:
<div id="info-box" class="red-box">
<p>The given revision ID doesn't seem to exist: <a href="//${query.site.domain | h}/w/index.php?oldid=${query.oldid | h}">${query.oldid | h}</a>.</p>
</div>
% elif query.title and not result:
<div id="info-box" class="red-box">
<p>The given page doesn't seem to exist: <a href="${query.page.url}">${query.page.title | h}</a>.</p>
</div>
% endif
%endif
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. In search mode, it will check for similar content elsewhere on the web using <a href="//developer.yahoo.com/boss/search/">Yahoo! BOSS</a> and/or external links present in the text of the page, depending on which options are selected. In comparison mode, the tool will skip the searching step and display a report comparing the article to the given webpage, like the <a href="//tools.wmflabs.org/dupdet/">Duplication Detector</a>.</p>


+ 3
- 2
templates/support/footer.mako View File

@@ -1,8 +1,9 @@
<%! from flask import g %>\
<%! from flask import g, request %>\
</div>
<div id="footer">
<p>Copyright &copy; 2009&ndash;2014 <a href="//en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a> &bull; \
<a href="https://github.com/earwig/copyvios">View Source</a> &bull; \
<a href="${request.script_root}/api.json">API</a> &bull; \
<a href="https://github.com/earwig/copyvios">Source Code</a> &bull; \
% if ("CopyviosBackground" in g.cookies and g.cookies["CopyviosBackground"].value in ["potd", "list"]) or "CopyviosBackground" not in g.cookies:
<a href="${g.descurl | h}">Background</a> &bull; \
% endif


Loading…
Cancel
Save