From 2a81217de83cd0e2e93ac39a6f03cdd3045c1bc0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 9 Jun 2016 17:00:12 -0400 Subject: [PATCH] Add support for detailed text comparison in API (T132949) --- copyvios/api.py | 13 +++++++++++-- templates/api.mako | 19 +++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/copyvios/api.py b/copyvios/api.py index 1d803dd..0be95e1 100644 --- a/copyvios/api.py +++ b/copyvios/api.py @@ -2,6 +2,7 @@ from collections import OrderedDict +from .highlighter import highlight_delta from .checker import do_check, T_POSSIBLE, T_SUSPECT from .misc import Query, cache from .sites import update_sites @@ -40,6 +41,11 @@ def _serialize_source(source, show_skip=True): data["excluded"] = source.excluded return data +def _serialize_detail(result): + article = highlight_delta(result.article_chain, result.best.chains[1]) + source = highlight_delta(result.best.chains[0], result.best.chains[1]) + return OrderedDict((("article", article), ("source", source))) + def format_api_error(code, info): if isinstance(info, BaseException): info = type(info).__name__ + ": " + str(info) @@ -90,12 +96,15 @@ def _hook_check(query): data["original_page"] = _serialize_page(query.redirected_from) data["best"] = _serialize_source(result.best, show_skip=False) data["sources"] = [_serialize_source(source) for source in result.sources] + if query.detail in ("1", "true"): + data["detail"] = _serialize_detail(result) return data def _hook_sites(query): update_sites() - return OrderedDict((("status", "ok"), - ("langs", cache.langs), ("projects", cache.projects))) + return OrderedDict(( + ("status", "ok"), ("langs", cache.langs), ("projects", cache.projects) + )) _HOOKS = { "compare": _hook_check, diff --git a/templates/api.mako b/templates/api.mako index 6f4732e..7111462 100644 --- a/templates/api.mako +++ b/templates/api.mako @@ -112,6 +112,12 @@ Yes The URL of the suspected violation source that will be compared to the page. 
+ + detail + boolean + No (default: false) + Whether to include the detailed HTML text comparison available in the regular interface. If not, only the confidence percentage is available. + @@ -219,7 +225,11 @@ "excluded": boolean — whether the source was skipped for being in the excluded URL list }, ... - ] + ], + only if action=compare and detail=true "detail": { + "article": string — article text, with shared passages marked with HTML, + "source": string — best source text, with shared passages marked with HTML + } }

In the case of action=search, sources will contain one entry for each source checked (or skipped if the check ends early), sorted in order of confidence, with skipped and excluded sources at the bottom.

In the case of action=compare, best will always contain information about the URL that was given, so response["best"]["url"] will never be null. Also, sources will always contain one entry, with the same data as best, since only one source is checked in comparison mode.

@@ -241,11 +251,8 @@ ... ] } -

Caveats

- +

Etiquette

+ The tool uses the same workers to handle all requests, so making concurrent API calls is only going to slow you down. Most operations are not rate-limited, but full searches with use_engine=True are globally limited to a few thousand per day. Be respectful!

Example

https://tools.wmflabs.org/copyvios/api.json?version=1&action=search&project=wikipedia&lang=en&title=User:EarwigBot/Copyvios/Tests/2

{