@@ -3,7 +3,7 @@ | |||
*.egg-info | |||
.DS_Store | |||
__pycache__ | |||
venv | |||
.earwigbot | |||
logs/* | |||
!logs/.gitinclude |
@@ -0,0 +1,11 @@ | |||
repos: | |||
- repo: https://github.com/astral-sh/ruff-pre-commit | |||
rev: v0.6.2 | |||
hooks: | |||
- id: ruff | |||
args: [--fix] | |||
- id: ruff-format | |||
- repo: https://github.com/RobertCraigie/pyright-python | |||
rev: v1.1.377 | |||
hooks: | |||
- id: pyright |
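Assuming [pre-commit](https://pre-commit.com/) is installed locally, the hooks defined above can be activated for the repository and run once against every file with:

    pre-commit install
    pre-commit run --all-files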
@@ -1,4 +1,4 @@ | |||
Copyright (c) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (c) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,50 +1,46 @@ | |||
This is a [copyright violation](https://en.wikipedia.org/wiki/WP:COPYVIO) | |||
detector running on [Wikimedia Cloud Services](https://copyvios.toolforge.org/). | |||
detector web tool for Wikipedia articles running on | |||
[Wikimedia Cloud Services](https://wikitech.wikimedia.org/wiki/Help:Cloud_Services_introduction) | |||
at [copyvios.toolforge.org](https://copyvios.toolforge.org/). | |||
It can search the web for content similar to a given article, and graphically | |||
compare an article to a specific URL. Some technical details are expanded upon | |||
[in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html). | |||
compare an article to specific URLs. Some technical details are expanded upon | |||
[in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html), | |||
though much of it is outdated. | |||
Dependencies | |||
Installation | |||
============ | |||
* [earwigbot](https://github.com/earwig/earwigbot) >= 0.1 | |||
* [flask](https://flask.palletsprojects.com/) >= 0.10.1 | |||
* [flask-mako](https://pythonhosted.org/Flask-Mako/) >= 0.3 | |||
* [mako](https://www.makotemplates.org/) >= 0.7.2 | |||
* [mwparserfromhell](https://github.com/earwig/mwparserfromhell) >= 0.3 | |||
* [oursql](https://pythonhosted.org/oursql/) >= 0.9.3.1 | |||
* [requests](https://requests.readthedocs.io/) >= 2.9.1 | |||
* [SQLAlchemy](https://www.sqlalchemy.org/) >= 0.9.6 | |||
* [apsw](https://github.com/rogerbinns/apsw) >= 3.26.0 | |||
* [uglifyjs](https://github.com/mishoo/UglifyJS) >= 3.12.6 | |||
* [cssnano](https://github.com/cssnano/cssnano) >= 4.1.10 | |||
* [postcss-cli](https://github.com/postcss/postcss-cli) >= 8.3.1 | |||
- If using Toolforge, clone the repository to `~/www/python/src`, or otherwise | |||
symlink it to that directory. | |||
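  For example, on a Toolforge shell this might look like the following sketch (the clone URL is taken from the project metadata; adjust the target path if deploying elsewhere):

      git clone https://github.com/earwig/copyvios.git ~/www/python/src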
Running | |||
======= | |||
- Create a virtual environment and install the dependencies. On Toolforge, | |||
this should be in `~/www/python/venv`, otherwise it can be in a subdirectory | |||
of the git project named `venv`: | |||
python3 -m venv venv | |||
. venv/bin/activate | |||
pip install -e . | |||
- If using Toolforge, you should clone the repository to `~/www/python/src`, or | |||
otherwise symlink it to that directory. A | |||
[virtualenv](https://virtualenv.pypa.io/) should be created at | |||
`~/www/python/venv`. | |||
- If you intend to modify CSS or JS, install the frontend dependencies: | |||
- Install all dependencies listed above. | |||
npm install -g uglify-js cssnano postcss postcss-cli | |||
- Create an SQL database with the `cache` and `cache_data` tables defined by | |||
[earwigbot-plugins](https://github.com/earwig/earwigbot-plugins/blob/develop/tasks/schema/afc_copyvios.sql). | |||
- Create an SQL database with the tables defined by `schema.sql`. | |||
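  A minimal sketch of loading the schema, assuming the SQLite backend with a database file named `copyvios.db` (for MySQL, pipe `schema.sql` into the `mysql` client against your database instead):

      sqlite3 copyvios.db < schema.sql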
- Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`). In | |||
`.earwigbot/config.yml`, fill out the connection info for the database by | |||
- Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`). | |||
In `.earwigbot/config.yml`, fill out the connection info for the database by | |||
adding the following to the `wiki` section: | |||
_copyviosSQL: | |||
copyvios: | |||
engine: mysql | |||
host: <hostname of database server> | |||
db: <name of database> | |||
db: <name of database> | |||
If additional arguments are needed to connect to the database, such as usernames or | |||
passwords, they should be added to the `copyvios` section. | |||
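  For example, a MySQL setup that needs credentials might look like this sketch (the `user` and `passwd` keys are illustrative; use whichever connection arguments your database driver expects):

      copyvios:
          engine: mysql
          host: <hostname of database server>
          db: <name of database>
          user: <username>
          passwd: <password>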
Running | |||
======= | |||
- Run `./build.py` to minify JS and CSS files. | |||
- Run `./build.py` to minify JS and CSS files after making any frontend | |||
changes. | |||
- Start the web server (on Toolforge, `webservice uwsgi-python start`). | |||
- Start your WSGI server pointing to app:app. |
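  For example, a local test run might use the following (assuming [gunicorn](https://gunicorn.org/) is installed in the virtualenv; any WSGI server that can import `app:app` works):

      gunicorn --bind 127.0.0.1:8000 app:app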
@@ -1,10 +1,9 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
import logging | |||
from functools import wraps | |||
from hashlib import md5 | |||
from json import dumps | |||
from logging import DEBUG, INFO, getLogger | |||
from logging.handlers import TimedRotatingFileHandler | |||
from os import path | |||
from time import asctime | |||
@@ -13,7 +12,7 @@ from traceback import format_exc | |||
from earwigbot.bot import Bot | |||
from earwigbot.wiki.copyvios import globalize | |||
from flask import Flask, g, make_response, request | |||
from flask_mako import MakoTemplates, render_template, TemplateError | |||
from flask_mako import MakoTemplates, TemplateError, render_template | |||
from copyvios.api import format_api_error, handle_api_request | |||
from copyvios.checker import do_check | |||
@@ -26,24 +25,27 @@ app = Flask(__name__) | |||
MakoTemplates(app) | |||
hand = TimedRotatingFileHandler("logs/app.log", when="midnight", backupCount=7) | |||
hand.setLevel(DEBUG) | |||
hand.setLevel(logging.DEBUG) | |||
app.logger.addHandler(hand) | |||
app.logger.info(u"Flask server started " + asctime()) | |||
app.logger.info("Flask server started " + asctime()) | |||
app._hash_cache = {} | |||
def catch_errors(func): | |||
@wraps(func) | |||
def inner(*args, **kwargs): | |||
try: | |||
return func(*args, **kwargs) | |||
except TemplateError as exc: | |||
app.logger.error(u"Caught exception:\n{0}".format(exc.text)) | |||
app.logger.error(f"Caught exception:\n{exc.text}") | |||
return render_template("error.mako", traceback=exc.text) | |||
except Exception: | |||
app.logger.exception(u"Caught exception:") | |||
app.logger.exception("Caught exception:") | |||
return render_template("error.mako", traceback=format_exc()) | |||
return inner | |||
@app.before_first_request | |||
def setup_app(): | |||
cache.bot = Bot(".earwigbot", 100) | |||
@@ -54,31 +56,43 @@ def setup_app(): | |||
globalize(num_workers=8) | |||
@app.before_request | |||
def prepare_request(): | |||
g._db = None | |||
g.cookies = parse_cookies( | |||
request.script_root or "/", request.environ.get("HTTP_COOKIE")) | |||
request.script_root or "/", request.environ.get("HTTP_COOKIE") | |||
) | |||
g.new_cookies = [] | |||
@app.after_request | |||
def add_new_cookies(response): | |||
for cookie in g.new_cookies: | |||
response.headers.add("Set-Cookie", cookie) | |||
return response | |||
@app.after_request | |||
def write_access_log(response): | |||
msg = u"%s %s %s %s -> %s" | |||
app.logger.debug(msg, asctime(), request.method, request.path, | |||
request.values.to_dict(), response.status_code) | |||
msg = "%s %s %s %s -> %s" | |||
app.logger.debug( | |||
msg, | |||
asctime(), | |||
request.method, | |||
request.path, | |||
request.values.to_dict(), | |||
response.status_code, | |||
) | |||
return response | |||
@app.teardown_appcontext | |||
def close_databases(error): | |||
if g._db: | |||
g._db.close() | |||
def external_url_handler(error, endpoint, values): | |||
if endpoint == "static" and "file" in values: | |||
fpath = path.join(app.static_folder, values["file"]) | |||
@@ -90,11 +104,13 @@ def external_url_handler(error, endpoint, values): | |||
with open(fpath, "rb") as f: | |||
hashstr = md5(f.read()).hexdigest() | |||
app._hash_cache[fpath] = (mtime, hashstr) | |||
return "/static/{0}?v={1}".format(values["file"], hashstr) | |||
return f"/static/{values['file']}?v={hashstr}" | |||
raise error | |||
app.url_build_error_handlers.append(external_url_handler) | |||
@app.route("/") | |||
@catch_errors | |||
def index(): | |||
@@ -102,8 +118,13 @@ def index(): | |||
update_sites() | |||
query = do_check() | |||
return render_template( | |||
"index.mako", notice=notice, query=query, result=query.result, | |||
turnitin_result=query.turnitin_result) | |||
"index.mako", | |||
notice=notice, | |||
query=query, | |||
result=query.result, | |||
turnitin_result=query.turnitin_result, | |||
) | |||
@app.route("/settings", methods=["GET", "POST"]) | |||
@catch_errors | |||
@@ -111,15 +132,20 @@ def settings(): | |||
status = process_settings() if request.method == "POST" else None | |||
update_sites() | |||
default = cache.bot.wiki.get_site() | |||
kwargs = {"status": status, "default_lang": default.lang, | |||
"default_project": default.project} | |||
kwargs = { | |||
"status": status, | |||
"default_lang": default.lang, | |||
"default_project": default.project, | |||
} | |||
return render_template("settings.mako", **kwargs) | |||
@app.route("/api") | |||
@catch_errors | |||
def api(): | |||
return render_template("api.mako", help=True) | |||
@app.route("/api.json") | |||
@catch_errors | |||
def api_json(): | |||
@@ -134,7 +160,7 @@ def api_json(): | |||
except Exception as exc: | |||
result = format_api_error("unhandled_exception", exc) | |||
else: | |||
errmsg = u"Unknown format: '{0}'".format(format) | |||
errmsg = f"Unknown format: '{format}'" | |||
result = format_api_error("unknown_format", errmsg) | |||
if format == "jsonfm": | |||
@@ -144,5 +170,6 @@ def api_json(): | |||
resp.headers["Access-Control-Allow-Origin"] = "*" | |||
return resp | |||
if __name__ == '__main__': | |||
if __name__ == "__main__": | |||
app.run() |
@@ -1,13 +1,13 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
from __future__ import print_function | |||
import os | |||
import subprocess | |||
def process(*args): | |||
print(*args) | |||
content = subprocess.check_output(args) | |||
subprocess.run(args, check=True) | |||
def main(): | |||
root = os.path.join(os.path.dirname(__file__), "static") | |||
@@ -15,10 +15,25 @@ def main(): | |||
for filename in filenames: | |||
name = os.path.relpath(os.path.join(dirpath, filename)) | |||
if filename.endswith(".js") and ".min." not in filename: | |||
process("uglifyjs", "--compress", "-o", name.replace(".js", ".min.js"), "--", name) | |||
process( | |||
"uglifyjs", | |||
"--compress", | |||
"-o", | |||
name.replace(".js", ".min.js"), | |||
"--", | |||
name, | |||
) | |||
if filename.endswith(".css") and ".min." not in filename: | |||
process("postcss", "-u", "cssnano", "--no-map", name, "-o", | |||
name.replace(".css", ".min.css")) | |||
process( | |||
"postcss", | |||
"-u", | |||
"cssnano", | |||
"--no-map", | |||
name, | |||
"-o", | |||
name.replace(".css", ".min.css"), | |||
) | |||
if __name__ == "__main__": | |||
main() |
@@ -1 +0,0 @@ | |||
# -*- coding: utf-8 -*- |
@@ -0,0 +1,44 @@ | |||
[project] | |||
name = "copyvios" | |||
version = "1.0.dev0" | |||
authors = [ | |||
{name = "Ben Kurtovic", email = "ben@benkurtovic.com"}, | |||
] | |||
description = "A copyright violation detector web tool for Wikipedia articles" | |||
readme = "README.md" | |||
requires-python = ">=3.11" | |||
dependencies = [ | |||
"earwigbot[sql,copyvios] >= 0.4", | |||
"mwparserfromhell >= 0.6", | |||
"flask >= 3.0", | |||
"flask-mako >= 0.4", | |||
"mako >= 1.3.5", | |||
"requests >= 2.32.3", | |||
"SQLAlchemy >= 2.0.32", | |||
"apsw >= 3.46.1", | |||
] | |||
[project.urls] | |||
Homepage = "https://github.com/earwig/copyvios" | |||
Issues = "https://github.com/earwig/copyvios/issues" | |||
[build-system] | |||
requires = ["setuptools>=61.0"] | |||
build-backend = "setuptools.build_meta" | |||
[tool.pyright] | |||
pythonVersion = "3.11" | |||
exclude = [ | |||
# TODO | |||
"src/copyvios/*", | |||
"app.py", | |||
] | |||
venvPath = "." | |||
venv = "venv" | |||
[tool.ruff] | |||
target-version = "py311" | |||
[tool.ruff.lint] | |||
select = ["E4", "E7", "E9", "F", "I", "UP"] | |||
ignore = ["F403"] |
@@ -2,48 +2,59 @@ | |||
import argparse | |||
import re | |||
import sqlite3 | |||
from typing import Any | |||
REGEX = re.compile( | |||
r'^' | |||
r'{address space usage: (?P<used_bytes>-?\d+) bytes/(?P<used_mb>\w+)} ' | |||
r'{rss usage: (?P<rss_bytes>-?\d+) bytes/(?P<rss_mb>\w+)} ' | |||
r'\[pid: (?P<pid>\d+)\|app: -\|req: -/-\] (?P<ip>[0-9.]+) \(-\) ' | |||
r'{(?P<vars>\d+) vars in (?P<var_bytes>\d+) bytes} ' | |||
r'\[(?P<date>[0-9A-Za-z: ]+)\] (?P<method>\w+) (?P<url>.*?) => ' | |||
r'generated (?P<resp_bytes>\d+) bytes in (?P<msecs>\d+) msecs ' | |||
r'\((- http://hasty.ai)?(?P<proto>[A-Z0-9/.]+) (?P<status>\d+)\) ' | |||
r'(?P<headers>\d+) headers in (?P<header_bytes>\d+) bytes ' | |||
r'\((?P<switches>\d+) switches on core (?P<core>\d+)\) ' | |||
r'(?P<agent>.*?)' | |||
r'( (?P<referer>https?://[^ ]*?))?( -)?( http(://|%3A%2F%2F)hasty\.ai)?' | |||
r'$' | |||
r"^" | |||
r"{address space usage: (?P<used_bytes>-?\d+) bytes/(?P<used_mb>\w+)} " | |||
r"{rss usage: (?P<rss_bytes>-?\d+) bytes/(?P<rss_mb>\w+)} " | |||
r"\[pid: (?P<pid>\d+)\|app: -\|req: -/-\] (?P<ip>[0-9.]+) \(-\) " | |||
r"{(?P<vars>\d+) vars in (?P<var_bytes>\d+) bytes} " | |||
r"\[(?P<date>[0-9A-Za-z: ]+)\] (?P<method>\w+) (?P<url>.*?) => " | |||
r"generated (?P<resp_bytes>\d+) bytes in (?P<msecs>\d+) msecs " | |||
r"\((- http://hasty.ai)?(?P<proto>[A-Z0-9/.]+) (?P<status>\d+)\) " | |||
r"(?P<headers>\d+) headers in (?P<header_bytes>\d+) bytes " | |||
r"\((?P<switches>\d+) switches on core (?P<core>\d+)\) " | |||
r"(?P<agent>.*?)" | |||
r"( (?P<referer>https?://[^ ]*?))?( -)?( http(://|%3A%2F%2F)hasty\.ai)?" | |||
r"$" | |||
) | |||
def save_logs(logs): | |||
def save_logs(logs: list[dict[str, Any]]) -> None: | |||
columns = sorted(REGEX.groupindex, key=lambda col: REGEX.groupindex[col]) | |||
conn = sqlite3.Connection('logs.db') | |||
conn = sqlite3.Connection("logs.db") | |||
cur = conn.cursor() | |||
cur.execute('CREATE TABLE IF NOT EXISTS logs(%s)' % ', '.join(columns)) | |||
cur.executemany('INSERT INTO logs VALUES (%s)' % ', '.join(['?'] * len(columns)), | |||
[[log[col] for col in columns] for log in logs]) | |||
cur.execute(f"CREATE TABLE IF NOT EXISTS logs({', '.join(columns)})") | |||
params = ", ".join(["?"] * len(columns)) | |||
cur.executemany( | |||
f"INSERT INTO logs VALUES ({params})", | |||
[[log[col] for col in columns] for log in logs], | |||
) | |||
conn.commit() | |||
conn.close() | |||
def read_logs(path): | |||
with open(path, 'r', errors='replace') as fp: | |||
def read_logs(path: str) -> list[dict[str, Any]]: | |||
with open(path, errors="replace") as fp: | |||
lines = fp.readlines() | |||
parsed = [(line, REGEX.match(line.strip())) for line in lines | |||
if line.startswith('{address space usage')] | |||
parsed = [ | |||
(line, REGEX.match(line.strip())) | |||
for line in lines | |||
if line.startswith("{address space usage") | |||
] | |||
for line, match in parsed: | |||
if not match: | |||
print('failed to parse:', line.strip()) | |||
print("failed to parse:", line.strip()) | |||
return [match.groupdict() for _, match in parsed if match] | |||
def main(): | |||
parser = argparse.ArgumentParser() | |||
parser.add_argument('logfile', default='uwsgi.log') | |||
parser.add_argument("logfile", default="uwsgi.log") | |||
args = parser.parse_args() | |||
save_logs(read_logs(args.logfile)) | |||
if __name__ == '__main__': | |||
if __name__ == "__main__": | |||
main() |
@@ -1,9 +1,7 @@ | |||
# -*- coding: utf-8 -*- | |||
from collections import OrderedDict | |||
from .checker import T_POSSIBLE, T_SUSPECT, do_check | |||
from .highlighter import highlight_delta | |||
from .checker import do_check, T_POSSIBLE, T_SUSPECT | |||
from .misc import Query, cache | |||
from .sites import update_sites | |||
@@ -15,83 +13,107 @@ _CHECK_ERRORS = { | |||
"no URL": "The parameter 'url' is required for URL comparisons", | |||
"bad URI": "The given URI scheme is unsupported", | |||
"no data": "No text could be found in the given URL (note that only HTML " | |||
"and plain text pages are supported, and content generated by " | |||
"JavaScript or found inside iframes is ignored)", | |||
"and plain text pages are supported, and content generated by " | |||
"JavaScript or found inside iframes is ignored)", | |||
"timeout": "The given URL timed out before any data could be retrieved", | |||
"search error": "An error occurred while using the search engine; try " | |||
"reloading or setting 'use_engine' to 0", | |||
"reloading or setting 'use_engine' to 0", | |||
} | |||
def _serialize_page(page): | |||
return OrderedDict((("title", page.title), ("url", page.url))) | |||
def _serialize_source(source, show_skip=True): | |||
if not source: | |||
return OrderedDict(( | |||
("url", None), ("confidence", 0.0), ("violation", "none"))) | |||
return OrderedDict((("url", None), ("confidence", 0.0), ("violation", "none"))) | |||
conf = source.confidence | |||
data = OrderedDict(( | |||
("url", source.url), | |||
("confidence", conf), | |||
("violation", "suspected" if conf >= T_SUSPECT else | |||
"possible" if conf >= T_POSSIBLE else "none") | |||
)) | |||
data = OrderedDict( | |||
( | |||
("url", source.url), | |||
("confidence", conf), | |||
( | |||
"violation", | |||
( | |||
"suspected" | |||
if conf >= T_SUSPECT | |||
else "possible" | |||
if conf >= T_POSSIBLE | |||
else "none" | |||
), | |||
), | |||
) | |||
) | |||
if show_skip: | |||
data["skipped"] = source.skipped | |||
data["excluded"] = source.excluded | |||
return data | |||
def _serialize_detail(result): | |||
source_chain, delta = result.best.chains | |||
article = highlight_delta(None, result.article_chain, delta) | |||
source = highlight_delta(None, source_chain, delta) | |||
return OrderedDict((("article", article), ("source", source))) | |||
def format_api_error(code, info): | |||
if isinstance(info, BaseException): | |||
info = type(info).__name__ + ": " + str(info) | |||
elif isinstance(info, unicode): | |||
info = info.encode("utf8") | |||
error_inner = OrderedDict((("code", code), ("info", info))) | |||
return OrderedDict((("status", "error"), ("error", error_inner))) | |||
def _hook_default(query): | |||
info = u"Unknown action: '{0}'".format(query.action.lower()) | |||
info = f"Unknown action: '{query.action.lower()}'" | |||
return format_api_error("unknown_action", info) | |||
def _hook_check(query): | |||
do_check(query) | |||
if not query.submitted: | |||
info = ("The query parameters 'project', 'lang', and either 'title' " | |||
"or 'oldid' are required for checks") | |||
info = ( | |||
"The query parameters 'project', 'lang', and either 'title' " | |||
"or 'oldid' are required for checks" | |||
) | |||
return format_api_error("missing_params", info) | |||
if query.error: | |||
info = _CHECK_ERRORS.get(query.error, "An unknown error occurred") | |||
return format_api_error(query.error.replace(" ", "_"), info) | |||
elif not query.site: | |||
info = (u"The given site (project={0}, lang={1}) either doesn't exist," | |||
u" is closed, or is private").format(query.project, query.lang) | |||
info = ( | |||
f"The given site (project={query.project}, lang={query.lang}) either doesn't exist," | |||
" is closed, or is private" | |||
) | |||
return format_api_error("bad_site", info) | |||
elif not query.result: | |||
if query.oldid: | |||
info = u"The revision ID couldn't be found: {0}" | |||
info = "The revision ID couldn't be found: {0}" | |||
return format_api_error("bad_oldid", info.format(query.oldid)) | |||
else: | |||
info = u"The page couldn't be found: {0}" | |||
info = "The page couldn't be found: {0}" | |||
return format_api_error("bad_title", info.format(query.page.title)) | |||
result = query.result | |||
data = OrderedDict(( | |||
("status", "ok"), | |||
("meta", OrderedDict(( | |||
("time", result.time), | |||
("queries", result.queries), | |||
("cached", result.cached), | |||
("redirected", bool(query.redirected_from)) | |||
))), | |||
("page", _serialize_page(query.page)) | |||
)) | |||
data = OrderedDict( | |||
( | |||
("status", "ok"), | |||
( | |||
"meta", | |||
OrderedDict( | |||
( | |||
("time", result.time), | |||
("queries", result.queries), | |||
("cached", result.cached), | |||
("redirected", bool(query.redirected_from)), | |||
) | |||
), | |||
), | |||
("page", _serialize_page(query.page)), | |||
) | |||
) | |||
if result.cached: | |||
data["meta"]["cache_time"] = result.cache_time | |||
if query.redirected_from: | |||
@@ -102,11 +124,13 @@ def _hook_check(query): | |||
data["detail"] = _serialize_detail(result) | |||
return data | |||
def _hook_sites(query): | |||
update_sites() | |||
return OrderedDict(( | |||
("status", "ok"), ("langs", cache.langs), ("projects", cache.projects) | |||
)) | |||
return OrderedDict( | |||
(("status", "ok"), ("langs", cache.langs), ("projects", cache.projects)) | |||
) | |||
_HOOKS = { | |||
"compare": _hook_check, | |||
@@ -114,13 +138,14 @@ _HOOKS = { | |||
"sites": _hook_sites, | |||
} | |||
def handle_api_request(): | |||
query = Query() | |||
if query.version: | |||
try: | |||
query.version = int(query.version) | |||
except ValueError: | |||
info = "The version string is invalid: {0}".format(query.version) | |||
info = f"The version string is invalid: {query.version}" | |||
return format_api_error("invalid_version", info) | |||
else: | |||
query.version = 1 | |||
@@ -129,5 +154,5 @@ def handle_api_request(): | |||
action = query.action.lower() if query.action else "" | |||
return _HOOKS.get(action, _hook_default)(query) | |||
info = "The API version is unsupported: {0}".format(query.version) | |||
info = f"The API version is unsupported: {query.version}" | |||
return format_api_error("unsupported_version", info) |
@@ -1,20 +1,19 @@ | |||
# -*- coding: utf-8 -*- | |||
from __future__ import unicode_literals | |||
from earwigbot.wiki import NS_TEMPLATE | |||
__all__ = ["get_attribution_info"] | |||
ATTRIB_TEMPLATES = { | |||
"enwiki": { | |||
"CC-notice", "Cc-notice", | |||
"CC-notice", | |||
"Cc-notice", | |||
"Citation-attribution", | |||
"Free-content attribution", "Open-source attribution", | |||
"Free-content attribution", | |||
"Open-source attribution", | |||
"Source-attribution", | |||
} | |||
} | |||
def get_attribution_info(site, page): | |||
"""Check to see if the given page has some kind of attribution info. | |||
@@ -30,7 +29,7 @@ def get_attribution_info(site, page): | |||
for template in page.parse().ifilter_templates(): | |||
if template.name.matches(templates): | |||
name = unicode(template.name).strip() | |||
name = str(template.name).strip() | |||
title = name if ":" in name else prefix + ":" + name | |||
return name, site.get_page(title).url | |||
return None |
@@ -1,10 +1,10 @@ | |||
# -*- coding: utf-8 -*- | |||
from datetime import datetime, timedelta | |||
from json import loads | |||
import random | |||
import re | |||
import urllib | |||
import urllib.error | |||
import urllib.parse | |||
import urllib.request | |||
from datetime import datetime, timedelta | |||
from json import loads | |||
from earwigbot import exceptions | |||
from flask import g | |||
@@ -13,32 +13,39 @@ from .misc import cache | |||
__all__ = ["set_background"] | |||
def _get_commons_site(): | |||
try: | |||
return cache.bot.wiki.get_site("commonswiki") | |||
except exceptions.SiteNotFoundError: | |||
return cache.bot.wiki.add_site(project="wikimedia", lang="commons") | |||
def _load_file(site, filename): | |||
data = site.api_query( | |||
action="query", prop="imageinfo", iiprop="url|size|canonicaltitle", | |||
titles="File:" + filename) | |||
res = data["query"]["pages"].values()[0]["imageinfo"][0] | |||
name = res["canonicaltitle"][len("File:"):].replace(" ", "_") | |||
action="query", | |||
prop="imageinfo", | |||
iiprop="url|size|canonicaltitle", | |||
titles="File:" + filename, | |||
) | |||
res = list(data["query"]["pages"].values())[0]["imageinfo"][0] | |||
name = res["canonicaltitle"][len("File:") :].replace(" ", "_") | |||
return name, res["url"], res["descriptionurl"], res["width"], res["height"] | |||
def _get_fresh_potd(): | |||
site = _get_commons_site() | |||
date = datetime.utcnow().strftime("%Y-%m-%d") | |||
page = site.get_page("Template:Potd/" + date) | |||
regex = ur"\{\{Potd filename\|(?:1=)?(.*?)\|.*?\}\}" | |||
regex = r"\{\{Potd filename\|(?:1=)?(.*?)\|.*?\}\}" | |||
filename = re.search(regex, page.get()).group(1) | |||
return _load_file(site, filename) | |||
def _get_fresh_list(): | |||
site = _get_commons_site() | |||
page = site.get_page("User:The Earwig/POTD") | |||
regex = ur"\*\*?\s*\[\[:File:(.*?)\]\]" | |||
regex = r"\*\*?\s*\[\[:File:(.*?)\]\]" | |||
filenames = re.findall(regex, page.get()) | |||
# Ensure all workers share the same background each day: | |||
@@ -46,6 +53,7 @@ def _get_fresh_list(): | |||
filename = random.choice(filenames) | |||
return _load_file(site, filename) | |||
def _build_url(screen, filename, url, imgwidth, imgheight): | |||
width = screen["width"] | |||
if float(imgwidth) / imgheight > float(screen["width"]) / screen["height"]: | |||
@@ -53,12 +61,11 @@ def _build_url(screen, filename, url, imgwidth, imgheight): | |||
if width >= imgwidth: | |||
return url | |||
url = url.replace("/commons/", "/commons/thumb/") | |||
return "%s/%dpx-%s" % (url, width, urllib.quote(filename.encode("utf8"))) | |||
return "%s/%dpx-%s" % (url, width, urllib.parse.quote(filename.encode("utf8"))) | |||
_BACKGROUNDS = {"potd": _get_fresh_potd, "list": _get_fresh_list} | |||
_BACKGROUNDS = { | |||
"potd": _get_fresh_potd, | |||
"list": _get_fresh_list | |||
} | |||
def _get_background(selected): | |||
if not cache.last_background_updates: | |||
@@ -73,6 +80,7 @@ def _get_background(selected): | |||
cache.last_background_updates[selected] = datetime.utcnow().date() | |||
return cache.background_data[selected] | |||
def set_background(selected): | |||
if "CopyviosScreenCache" in g.cookies: | |||
screen_cache = g.cookies["CopyviosScreenCache"].value |
@@ -1,17 +1,15 @@ | |||
# -*- coding: utf-8 -*- | |||
import re | |||
from datetime import datetime, timedelta | |||
from hashlib import sha256 | |||
from logging import getLogger | |||
import re | |||
from urlparse import urlparse | |||
from urllib.parse import urlparse | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser | |||
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect | |||
from .misc import Query, get_cursor, get_db, get_sql_error, sql_dialect | |||
from .sites import get_site | |||
from .turnitin import search_turnitin | |||
@@ -22,9 +20,11 @@ T_SUSPECT = 0.75 | |||
_LOGGER = getLogger("copyvios.checker") | |||
def _coerce_bool(val): | |||
return val and val not in ("0", "false") | |||
def do_check(query=None): | |||
if not query: | |||
query = Query() | |||
@@ -44,6 +44,7 @@ def do_check(query=None): | |||
_get_results(query, follow=not _coerce_bool(query.noredirect)) | |||
return query | |||
def _get_results(query, follow=True): | |||
if query.oldid: | |||
if not re.match(r"^\d+$", query.oldid): | |||
@@ -100,8 +101,9 @@ def _get_results(query, follow=True): | |||
degree = int(query.degree) | |||
except ValueError: | |||
pass | |||
result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT, | |||
max_time=10, degree=degree) | |||
result = page.copyvio_compare( | |||
query.url, min_confidence=T_SUSPECT, max_time=10, degree=degree | |||
) | |||
if result.best.chains[0] is EMPTY: | |||
query.error = "timeout" if result.time > 10 else "no data" | |||
return | |||
@@ -110,12 +112,18 @@ def _get_results(query, follow=True): | |||
else: | |||
query.error = "bad action" | |||
def _get_page_by_revid(site, revid): | |||
try: | |||
res = site.api_query(action="query", prop="info|revisions", revids=revid, | |||
rvprop="content|timestamp", inprop="protection|url", | |||
rvslots="main") | |||
page_data = res["query"]["pages"].values()[0] | |||
res = site.api_query( | |||
action="query", | |||
prop="info|revisions", | |||
revids=revid, | |||
rvprop="content|timestamp", | |||
inprop="protection|url", | |||
rvslots="main", | |||
) | |||
page_data = list(res["query"]["pages"].values())[0] | |||
title = page_data["title"] | |||
# Only need to check that these exist: | |||
revision = page_data["revisions"][0] | |||
@@ -131,24 +139,30 @@ def _get_page_by_revid(site, revid): | |||
page._load_content(res) | |||
return page | |||
def _perform_check(query, page, use_engine, use_links): | |||
conn = get_db() | |||
sql_error = get_sql_error() | |||
mode = "{0}:{1}:".format(use_engine, use_links) | |||
mode = f"{use_engine}:{use_links}:" | |||
if not _coerce_bool(query.nocache): | |||
try: | |||
query.result = _get_cached_results( | |||
page, conn, mode, _coerce_bool(query.noskip)) | |||
page, conn, mode, _coerce_bool(query.noskip) | |||
) | |||
except sql_error: | |||
_LOGGER.exception("Failed to retrieve cached results") | |||
if not query.result: | |||
try: | |||
query.result = page.copyvio_check( | |||
min_confidence=T_SUSPECT, max_queries=8, max_time=30, | |||
no_searches=not use_engine, no_links=not use_links, | |||
short_circuit=not query.noskip) | |||
min_confidence=T_SUSPECT, | |||
max_queries=8, | |||
max_time=30, | |||
no_searches=not use_engine, | |||
no_links=not use_links, | |||
short_circuit=not query.noskip, | |||
) | |||
except exceptions.SearchQueryError as exc: | |||
query.error = "search error" | |||
query.exception = exc | |||
@@ -159,6 +173,7 @@ def _perform_check(query, page, use_engine, use_links): | |||
except sql_error: | |||
_LOGGER.exception("Failed to cache results") | |||
def _get_cached_results(page, conn, mode, noskip): | |||
query1 = """SELECT cache_time, cache_queries, cache_process_time, | |||
cache_possible_miss | |||
@@ -167,7 +182,7 @@ def _get_cached_results(page, conn, mode, noskip): | |||
query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded | |||
FROM cache_data | |||
WHERE cdata_cache_id = ?""" | |||
cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest()) | |||
cache_id = sha256((mode + page.get()).encode("utf8")).digest() | |||
cursor = conn.cursor() | |||
cursor.execute(query1, (cache_id,)) | |||
@@ -186,8 +201,9 @@ def _get_cached_results(page, conn, mode, noskip): | |||
if not data: # TODO: do something less hacky for this edge case | |||
article_chain = MarkovChain(ArticleTextParser(page.get()).strip()) | |||
result = CopyvioCheckResult(False, [], queries, check_time, | |||
article_chain, possible_miss) | |||
result = CopyvioCheckResult( | |||
False, [], queries, check_time, article_chain, possible_miss | |||
) | |||
result.cached = True | |||
result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC") | |||
result.cache_age = _format_date(cache_time) | |||
@@ -216,8 +232,11 @@ def _get_cached_results(page, conn, mode, noskip): | |||
result.cache_age = _format_date(cache_time) | |||
return result | |||
def _format_date(cache_time): | |||
formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s") | |||
def formatter(n, w): | |||
return "{} {}{}".format(n, w, "" if n == 1 else "s") | |||
diff = datetime.utcnow() - cache_time | |||
total_seconds = diff.days * 86400 + diff.seconds | |||
if total_seconds > 3600: | |||
@@ -226,23 +245,34 @@ def _format_date(cache_time): | |||
return formatter(total_seconds / 60, "minute") | |||
return formatter(total_seconds, "second") | |||
def _cache_result(page, result, conn, mode): | |||
expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)", | |||
sqlite="STRFTIME('%s', 'now', '-3 days')") | |||
expiry = sql_dialect( | |||
mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)", | |||
sqlite="STRFTIME('%s', 'now', '-3 days')", | |||
) | |||
query1 = "DELETE FROM cache WHERE cache_id = ?" | |||
query2 = "DELETE FROM cache WHERE cache_time < %s" % expiry | |||
query2 = f"DELETE FROM cache WHERE cache_time < {expiry}" | |||
query3 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time, | |||
cache_possible_miss) VALUES (?, ?, ?, ?)""" | |||
query4 = """INSERT INTO cache_data (cdata_cache_id, cdata_url, | |||
cdata_confidence, cdata_skipped, | |||
cdata_excluded) VALUES (?, ?, ?, ?, ?)""" | |||
cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest()) | |||
data = [(cache_id, source.url[:1024], source.confidence, source.skipped, | |||
source.excluded) | |||
for source in result.sources] | |||
cache_id = sha256((mode + page.get()).encode("utf8")).digest() | |||
data = [ | |||
( | |||
cache_id, | |||
source.url[:1024], | |||
source.confidence, | |||
source.skipped, | |||
source.excluded, | |||
) | |||
for source in result.sources | |||
] | |||
with get_cursor(conn) as cursor: | |||
cursor.execute(query1, (cache_id,)) | |||
cursor.execute(query2) | |||
cursor.execute(query3, (cache_id, result.queries, result.time, | |||
result.possible_miss)) | |||
cursor.execute( | |||
query3, (cache_id, result.queries, result.time, result.possible_miss) | |||
) | |||
cursor.executemany(query4, data) |
@@ -1,39 +1,38 @@ | |||
# -*- coding: utf-8 -*- | |||
import base64 | |||
from Cookie import CookieError, SimpleCookie | |||
from datetime import datetime, timedelta | |||
from http.cookies import CookieError, SimpleCookie | |||
from flask import g | |||
__all__ = ["parse_cookies", "set_cookie", "delete_cookie"] | |||
class _CookieManager(SimpleCookie): | |||
MAGIC = "--cpv2" | |||
def __init__(self, path, cookies): | |||
self._path = path | |||
try: | |||
super(_CookieManager, self).__init__(cookies) | |||
super().__init__(cookies) | |||
except CookieError: | |||
super(_CookieManager, self).__init__() | |||
for cookie in self.keys(): | |||
super().__init__() | |||
for cookie in list(self.keys()): | |||
if self[cookie].value is False: | |||
del self[cookie] | |||
def value_decode(self, value): | |||
unquoted = super(_CookieManager, self).value_decode(value)[0] | |||
unquoted = super().value_decode(value)[0] | |||
try: | |||
decoded = base64.b64decode(unquoted).decode("utf8") | |||
except (TypeError, UnicodeDecodeError): | |||
return False, "False" | |||
if decoded.startswith(self.MAGIC): | |||
return decoded[len(self.MAGIC):], value | |||
return decoded[len(self.MAGIC) :], value | |||
return False, "False" | |||
def value_encode(self, value): | |||
encoded = base64.b64encode((self.MAGIC + value).encode("utf8")).decode("utf8") | |||
quoted = super(_CookieManager, self).value_encode(encoded)[1] | |||
quoted = super().value_encode(encoded)[1] | |||
return value, quoted | |||
@property | |||
@@ -44,6 +43,7 @@ class _CookieManager(SimpleCookie): | |||
def parse_cookies(path, cookies): | |||
return _CookieManager(path, cookies) | |||
def set_cookie(key, value, days=0): | |||
g.cookies[key] = value | |||
if days: | |||
@@ -53,6 +53,7 @@ def set_cookie(key, value, days=0): | |||
g.cookies[key]["path"] = g.cookies.path | |||
g.new_cookies.append(g.cookies[key].OutputString()) | |||
def delete_cookie(key): | |||
set_cookie(key, u"", days=-1) | |||
set_cookie(key, "", days=-1) | |||
del g.cookies[key] |
@@ -1,13 +1,12 @@ | |||
# -*- coding: utf-8 -*- | |||
from collections import deque | |||
from re import sub, UNICODE | |||
from re import UNICODE, sub | |||
from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION | |||
from markupsafe import escape | |||
__all__ = ["highlight_delta"] | |||
def highlight_delta(context, chain, delta): | |||
degree = chain.degree - 1 | |||
highlights = [False] * degree | |||
@@ -18,7 +17,7 @@ def highlight_delta(context, chain, delta): | |||
word = _strip_word(chain, word) | |||
block.append(word) | |||
if tuple(block) in delta.chain: | |||
highlights[-1 * degree:] = [True] * degree | |||
highlights[-1 * degree :] = [True] * degree | |||
highlights.append(True) | |||
else: | |||
highlights.append(False) | |||
@@ -38,11 +37,12 @@ def highlight_delta(context, chain, delta): | |||
last = i - degree + 1 == numwords | |||
words.append(_highlight_word(word, before, after, first, last)) | |||
else: | |||
words.append(unicode(escape(word))) | |||
result.append(u" ".join(words)) | |||
words.append(str(escape(word))) | |||
result.append(" ".join(words)) | |||
i += 1 | |||
return u"<br /><br />".join(result) | |||
return "<br /><br />".join(result) | |||
def _get_next(paragraphs): | |||
body = [] | |||
@@ -58,41 +58,44 @@ def _get_next(paragraphs): | |||
break | |||
return body | |||
def _highlight_word(word, before, after, first, last): | |||
if before and after: | |||
# Word is in the middle of a highlighted block: | |||
res = unicode(escape(word)) | |||
res = str(escape(word)) | |||
if first: | |||
res = u'<span class="cv-hl">' + res | |||
res = '<span class="cv-hl">' + res | |||
if last: | |||
res += u'</span>' | |||
res += "</span>" | |||
elif after: | |||
# Word is the first in a highlighted block: | |||
res = u'<span class="cv-hl">' + _fade_word(word, u"in") | |||
res = '<span class="cv-hl">' + _fade_word(word, "in") | |||
if last: | |||
res += u"</span>" | |||
res += "</span>" | |||
elif before: | |||
# Word is the last in a highlighted block: | |||
res = _fade_word(word, u"out") + u"</span>" | |||
res = _fade_word(word, "out") + "</span>" | |||
if first: | |||
res = u'<span class="cv-hl">' + res | |||
res = '<span class="cv-hl">' + res | |||
else: | |||
res = unicode(escape(word)) | |||
res = str(escape(word)) | |||
return res | |||
def _fade_word(word, dir): | |||
if len(word) <= 4: | |||
word = unicode(escape(word)) | |||
return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word) | |||
if dir == u"out": | |||
before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:])) | |||
base = u'{0}<span class="cv-hl-out">{1}</span>' | |||
word = str(escape(word)) | |||
return f'<span class="cv-hl-{dir}">{word}</span>' | |||
if dir == "out": | |||
before, after = str(escape(word[:-4])), str(escape(word[-4:])) | |||
base = '{0}<span class="cv-hl-out">{1}</span>' | |||
return base.format(before, after) | |||
else: | |||
before, after = unicode(escape(word[:4])), unicode(escape(word[4:])) | |||
base = u'<span class="cv-hl-in">{0}</span>{1}' | |||
before, after = str(escape(word[:4])), str(escape(word[4:])) | |||
base = '<span class="cv-hl-in">{0}</span>{1}' | |||
return base.format(before, after) | |||
def _strip_word(chain, word): | |||
if word == chain.START or word == chain.END: | |||
return word |
@@ -1,19 +1,18 @@ | |||
# -*- coding: utf-8 -*- | |||
from contextlib import contextmanager | |||
import datetime | |||
from contextlib import contextmanager | |||
from os.path import expanduser, join | |||
import apsw | |||
from flask import g, request | |||
import oursql | |||
from flask import g, request | |||
from sqlalchemy.pool import manage | |||
oursql = manage(oursql) | |||
__all__ = ["Query", "cache", "get_db", "get_notice", "httpsfix", "urlstrip"] | |||
class Query(object): | |||
class Query: | |||
def __init__(self, method="GET"): | |||
self.query = {} | |||
data = request.form if method == "POST" else request.args | |||
@@ -25,14 +24,14 @@ class Query(object): | |||
def __setattr__(self, key, value): | |||
if key == "query": | |||
super(Query, self).__setattr__(key, value) | |||
super().__setattr__(key, value) | |||
else: | |||
self.query[key] = value | |||
class _AppCache(object): | |||
class _AppCache: | |||
def __init__(self): | |||
super(_AppCache, self).__setattr__("_data", {}) | |||
super().__setattr__("_data", {}) | |||
def __getattr__(self, key): | |||
return self._data[key] | |||
@@ -43,6 +42,7 @@ class _AppCache(object): | |||
cache = _AppCache() | |||
def _connect_to_db(engine, args): | |||
if engine == "mysql": | |||
args["read_default_file"] = expanduser("~/.my.cnf") | |||
@@ -54,15 +54,17 @@ def _connect_to_db(engine, args): | |||
conn = apsw.Connection(dbpath) | |||
conn.cursor().execute("PRAGMA foreign_keys = ON") | |||
return conn | |||
raise ValueError("Unknown engine: %s" % engine) | |||
raise ValueError(f"Unknown engine: {engine}") | |||
def get_db(): | |||
if not g._db: | |||
args = cache.bot.config.wiki["_copyviosSQL"].copy() | |||
args = cache.bot.config.wiki["copyvios"].copy() | |||
g._engine = engine = args.pop("engine", "mysql").lower() | |||
g._db = _connect_to_db(engine, args) | |||
return g._db | |||
@contextmanager | |||
def get_cursor(conn): | |||
if g._engine == "mysql": | |||
@@ -72,21 +74,24 @@ def get_cursor(conn): | |||
with conn: | |||
yield conn.cursor() | |||
else: | |||
raise ValueError("Unknown engine: %s" % g._engine) | |||
raise ValueError(f"Unknown engine: {g._engine}") | |||
def get_sql_error(): | |||
if g._engine == "mysql": | |||
return oursql.Error | |||
if g._engine == "sqlite": | |||
return apsw.Error | |||
raise ValueError("Unknown engine: %s" % g._engine) | |||
raise ValueError(f"Unknown engine: {g._engine}") | |||
def sql_dialect(mysql, sqlite): | |||
if g._engine == "mysql": | |||
return mysql | |||
if g._engine == "sqlite": | |||
return sqlite | |||
raise ValueError("Unknown engine: %s" % g._engine) | |||
raise ValueError(f"Unknown engine: {g._engine}") | |||
def get_notice(): | |||
try: | |||
@@ -95,16 +100,19 @@ def get_notice(): | |||
if lines[0] == "<!-- active -->": | |||
return "\n".join(lines[1:]) | |||
return None | |||
except IOError: | |||
except OSError: | |||
return None | |||
def httpsfix(context, url): | |||
if url.startswith("http://"): | |||
url = url[len("http:"):] | |||
url = url[len("http:") :] | |||
return url | |||
def parse_wiki_timestamp(timestamp): | |||
return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S') | |||
return datetime.datetime.strptime(timestamp, "%Y%m%d%H%M%S") | |||
def urlstrip(context, url): | |||
if url.startswith("http://"): |
@@ -1,13 +1,12 @@ | |||
# -*- coding: utf-8 -*- | |||
from flask import g | |||
from markupsafe import escape | |||
from .cookies import set_cookie, delete_cookie | |||
from .cookies import delete_cookie, set_cookie | |||
from .misc import Query | |||
__all__ = ["process_settings"] | |||
def process_settings(): | |||
query = Query(method="POST") | |||
if query.action == "set": | |||
@@ -18,6 +17,7 @@ def process_settings(): | |||
status = None | |||
return status | |||
def _do_set(query): | |||
cookies = g.cookies | |||
changes = set() | |||
@@ -39,18 +39,19 @@ def _do_set(query): | |||
changes.add("background") | |||
if changes: | |||
changes = ", ".join(sorted(list(changes))) | |||
return "Updated {0}.".format(changes) | |||
return f"Updated {changes}." | |||
return None | |||
def _do_delete(query): | |||
cookies = g.cookies | |||
if query.cookie in cookies: | |||
delete_cookie(query.cookie) | |||
template = u'Deleted cookie <b><span class="mono">{0}</span></b>.' | |||
template = 'Deleted cookie <b><span class="mono">{0}</span></b>.' | |||
return template.format(escape(query.cookie)) | |||
elif query.all: | |||
number = len(cookies) | |||
for cookie in cookies.values(): | |||
for cookie in list(cookies.values()): | |||
delete_cookie(cookie.key) | |||
return "Deleted <b>{0}</b> cookies.".format(number) | |||
return f"Deleted <b>{number}</b> cookies." | |||
return None |
@@ -1,7 +1,5 @@ | |||
# -*- coding: utf-8 -*- | |||
from time import time | |||
from urlparse import urlparse | |||
from urllib.parse import urlparse | |||
from earwigbot import exceptions | |||
@@ -9,6 +7,7 @@ from .misc import cache | |||
__all__ = ["get_site", "update_sites"] | |||
def get_site(query): | |||
lang, project, name = query.lang, query.project, query.name | |||
wiki = cache.bot.wiki | |||
@@ -24,11 +23,13 @@ def get_site(query): | |||
except exceptions.SiteNotFoundError: | |||
return _add_site(lang, project) | |||
def update_sites(): | |||
if time() - cache.last_sites_update > 60 * 60 * 24 * 7: | |||
cache.langs, cache.projects = _load_sites() | |||
cache.last_sites_update = time() | |||
def _add_site(lang, project): | |||
update_sites() | |||
if not any(project == item[0] for item in cache.projects): | |||
@@ -40,12 +41,13 @@ def _add_site(lang, project): | |||
except (exceptions.APIError, exceptions.LoginError): | |||
return None | |||
def _load_sites(): | |||
site = cache.bot.wiki.get_site() | |||
matrix = site.api_query(action="sitematrix")["sitematrix"] | |||
del matrix["count"] | |||
langs, projects = set(), set() | |||
for site in matrix.itervalues(): | |||
for site in matrix.values(): | |||
if isinstance(site, list): # Special sites | |||
bad_sites = ["closed", "private", "fishbowl"] | |||
for special in site: | |||
@@ -55,19 +57,19 @@ def _load_sites(): | |||
lang, project = "www", full.split(".")[0] | |||
else: | |||
lang, project = full.rsplit(".", 2)[:2] | |||
code = u"{0}::{1}".format(lang, special["dbname"]) | |||
code = "{}::{}".format(lang, special["dbname"]) | |||
name = special["code"].capitalize() | |||
langs.add((code, u"{0} ({1})".format(lang, name))) | |||
langs.add((code, f"{lang} ({name})")) | |||
projects.add((project, project.capitalize())) | |||
else: | |||
this = set() | |||
for web in site["site"]: | |||
if "closed" in web: | |||
continue | |||
proj = "wikipedia" if web["code"] == u"wiki" else web["code"] | |||
proj = "wikipedia" if web["code"] == "wiki" else web["code"] | |||
this.add((proj, proj.capitalize())) | |||
if this: | |||
code = site["code"] | |||
langs.add((code, u"{0} ({1})".format(code, site["name"]))) | |||
langs.add((code, "{} ({})".format(code, site["name"]))) | |||
projects |= this | |||
return list(sorted(langs)), list(sorted(projects)) |
@@ -1,17 +1,17 @@ | |||
# -*- coding: utf-8 -*- | |||
from ast import literal_eval | |||
import re | |||
from ast import literal_eval | |||
import requests | |||
from .misc import parse_wiki_timestamp | |||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||
__all__ = ["search_turnitin", "TURNITIN_API_ENDPOINT"] | |||
TURNITIN_API_ENDPOINT = "https://eranbot.toolforge.org/plagiabot/api.py" | |||
TURNITIN_API_ENDPOINT = 'https://eranbot.toolforge.org/plagiabot/api.py' | |||
def search_turnitin(page_title, lang): | |||
""" Search the Plagiabot database for Turnitin reports for a page. | |||
"""Search the Plagiabot database for Turnitin reports for a page. | |||
Keyword arguments: | |||
page_title -- string containing the page title | |||
@@ -21,14 +21,16 @@ def search_turnitin(page_title, lang): | |||
""" | |||
return TurnitinResult(_make_api_request(page_title, lang)) | |||
def _make_api_request(page_title, lang): | |||
""" Query the plagiabot API for Turnitin reports for a given page. | |||
""" | |||
stripped_page_title = page_title.replace(' ', '_') | |||
api_parameters = {'action': 'suspected_diffs', | |||
'page_title': stripped_page_title, | |||
'lang': lang, | |||
'report': 1} | |||
"""Query the plagiabot API for Turnitin reports for a given page.""" | |||
stripped_page_title = page_title.replace(" ", "_") | |||
api_parameters = { | |||
"action": "suspected_diffs", | |||
"page_title": stripped_page_title, | |||
"lang": lang, | |||
"report": 1, | |||
} | |||
result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters, verify=False) | |||
# use literal_eval to *safely* parse the resulting dict-containing string | |||
@@ -38,14 +40,16 @@ def _make_api_request(page_title, lang): | |||
parsed_api_result = [] | |||
return parsed_api_result | |||
class TurnitinResult(object): | |||
""" Container class for TurnitinReports. Each page may have zero or | |||
class TurnitinResult: | |||
"""Container class for TurnitinReports. Each page may have zero or | |||
more reports of plagiarism. The list will have multiple | |||
TurnitinReports if plagiarism has been detected for more than one | |||
revision. | |||
TurnitinResult.reports -- list containing >= 0 TurnitinReport items | |||
""" | |||
def __init__(self, turnitin_data): | |||
""" | |||
Keyword argument: | |||
@@ -54,14 +58,16 @@ class TurnitinResult(object): | |||
self.reports = [] | |||
for item in turnitin_data: | |||
report = TurnitinReport( | |||
item['diff_timestamp'], item['diff'], item['report']) | |||
item["diff_timestamp"], item["diff"], item["report"] | |||
) | |||
self.reports.append(report) | |||
def __repr__(self): | |||
return str(self.__dict__) | |||
class TurnitinReport(object): | |||
""" Contains data for each Turnitin report (one on each potentially | |||
class TurnitinReport: | |||
"""Contains data for each Turnitin report (one on each potentially | |||
plagiarized revision). | |||
TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot | |||
@@ -72,6 +78,7 @@ class TurnitinReport(object): | |||
words -- number of words found in both source and revision | |||
url -- url for the possibly-plagiarized source | |||
""" | |||
def __init__(self, timestamp, diffid, report): | |||
""" | |||
Keyword argument: | |||
@@ -86,9 +93,7 @@ class TurnitinReport(object): | |||
self.sources = [] | |||
for item in self.report_data[1]: | |||
source = {'percent': item[0], | |||
'words': item[1], | |||
'url': item[2]} | |||
source = {"percent": item[0], "words": item[1], "url": item[2]} | |||
self.sources.append(source) | |||
def __repr__(self): | |||
@@ -96,12 +101,11 @@ class TurnitinReport(object): | |||
def _parse_report(self, report_text): | |||
# extract report ID | |||
report_id_pattern = re.compile(r'\?rid=(\d*)') | |||
report_id_pattern = re.compile(r"\?rid=(\d*)") | |||
report_id = report_id_pattern.search(report_text).groups()[0] | |||
# extract percent match, words, and URL for each source in the report | |||
extract_info_pattern = re.compile( | |||
r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') | |||
extract_info_pattern = re.compile(r"\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ") | |||
results = extract_info_pattern.findall(report_text) | |||
return (report_id, results) |
@@ -1 +1 @@ | |||
h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{padding:0 .25em;background-color:#eee}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{font-family:sans-serif;font-size:1.17em;color:#fff}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline} | |||
h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{background-color:#eee;padding:0 .25em}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{color:#fff;font-family:sans-serif;font-size:1.17em}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline} |
@@ -1 +1 @@ | |||
function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{data=JSON.parse(cookie);var width=data.width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,expires){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];expires=expires?"; expires="+expires.toUTCString():"",document.cookie=name+"="+value+expires+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()}); | |||
function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{var width=(data=JSON.parse(cookie)).width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,date){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];date=date?"; expires="+date.toUTCString():"",document.cookie=name+"="+value+date+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()}); |