A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

api.py 4.9 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # -*- coding: utf-8 -*-
  2. from collections import OrderedDict
  3. from .highlighter import highlight_delta
  4. from .checker import do_check, T_POSSIBLE, T_SUSPECT
  5. from .misc import Query, cache
  6. from .sites import update_sites
  7. __all__ = ["format_api_error", "handle_api_request"]
  8. _CHECK_ERRORS = {
  9. "no search method": "Either 'use_engine' or 'use_links' must be true",
  10. "bad oldid": "The revision ID is invalid",
  11. "no URL": "The parameter 'url' is required for URL comparisons",
  12. "bad URI": "The given URI scheme is unsupported",
  13. "no data": "No text could be found in the given URL (note that only HTML "
  14. "and plain text pages are supported, and content generated by "
  15. "JavaScript or found inside iframes is ignored)",
  16. "timeout": "The given URL timed out before any data could be retrieved",
  17. "search error": "An error occurred while using the search engine; try "
  18. "reloading or setting 'use_engine' to 0",
  19. }
  20. def _serialize_page(page):
  21. return OrderedDict((("title", page.title), ("url", page.url)))
  22. def _serialize_source(source, show_skip=True):
  23. if not source:
  24. return OrderedDict((
  25. ("url", None), ("confidence", 0.0), ("violation", "none")))
  26. conf = source.confidence
  27. data = OrderedDict((
  28. ("url", source.url),
  29. ("confidence", conf),
  30. ("violation", "suspected" if conf >= T_SUSPECT else
  31. "possible" if conf >= T_POSSIBLE else "none")
  32. ))
  33. if show_skip:
  34. data["skipped"] = source.skipped
  35. data["excluded"] = source.excluded
  36. return data
  37. def _serialize_detail(result):
  38. source_chain, delta = result.best.chains
  39. article = highlight_delta(None, result.article_chain, delta)
  40. source = highlight_delta(None, source_chain, delta)
  41. return OrderedDict((("article", article), ("source", source)))
  42. def format_api_error(code, info):
  43. if isinstance(info, BaseException):
  44. info = type(info).__name__ + ": " + str(info)
  45. elif isinstance(info, unicode):
  46. info = info.encode("utf8")
  47. error_inner = OrderedDict((("code", code), ("info", info)))
  48. return OrderedDict((("status", "error"), ("error", error_inner)))
  49. def _hook_default(query):
  50. info = u"Unknown action: '{0}'".format(query.action.lower())
  51. return format_api_error("unknown_action", info)
  52. def _hook_check(query):
  53. do_check(query)
  54. if not query.submitted:
  55. info = ("The query parameters 'project', 'lang', and either 'title' "
  56. "or 'oldid' are required for checks")
  57. return format_api_error("missing_params", info)
  58. if query.error:
  59. info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
  60. return format_api_error(query.error.replace(" ", "_"), info)
  61. elif not query.site:
  62. info = (u"The given site (project={0}, lang={1}) either doesn't exist,"
  63. u" is closed, or is private").format(query.project, query.lang)
  64. return format_api_error("bad_site", info)
  65. elif not query.result:
  66. if query.oldid:
  67. info = u"The given revision ID doesn't seem to exist: {0}"
  68. return format_api_error("bad_oldid", info.format(query.oldid))
  69. else:
  70. info = u"The given page doesn't seem to exist: {0}"
  71. return format_api_error("bad_title", info.format(query.page.title))
  72. result = query.result
  73. data = OrderedDict((
  74. ("status", "ok"),
  75. ("meta", OrderedDict((
  76. ("time", result.time),
  77. ("queries", result.queries),
  78. ("cached", result.cached),
  79. ("redirected", bool(query.redirected_from))
  80. ))),
  81. ("page", _serialize_page(query.page))
  82. ))
  83. if result.cached:
  84. data["meta"]["cache_time"] = result.cache_time
  85. if query.redirected_from:
  86. data["original_page"] = _serialize_page(query.redirected_from)
  87. data["best"] = _serialize_source(result.best, show_skip=False)
  88. data["sources"] = [_serialize_source(source) for source in result.sources]
  89. if query.detail in ("1", "true"):
  90. data["detail"] = _serialize_detail(result)
  91. return data
  92. def _hook_sites(query):
  93. update_sites()
  94. return OrderedDict((
  95. ("status", "ok"), ("langs", cache.langs), ("projects", cache.projects)
  96. ))
  97. _HOOKS = {
  98. "compare": _hook_check,
  99. "search": _hook_check,
  100. "sites": _hook_sites,
  101. }
  102. def handle_api_request():
  103. query = Query()
  104. if query.version:
  105. try:
  106. query.version = int(query.version)
  107. except ValueError:
  108. info = "The version string is invalid: {0}".format(query.version)
  109. return format_api_error("invalid_version", info)
  110. else:
  111. query.version = 1
  112. if query.version == 1:
  113. action = query.action.lower() if query.action else ""
  114. return _HOOKS.get(action, _hook_default)(query)
  115. info = "The API version is unsupported: {0}".format(query.version)
  116. return format_api_error("unsupported_version", info)