@@ -3,7 +3,7 @@
*.egg-info
.DS_Store
__pycache__
+venv
.earwigbot
logs/*
!logs/.gitinclude
@@ -0,0 +1,11 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.2
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
+  - repo: https://github.com/RobertCraigie/pyright-python
+    rev: v1.1.377
+    hooks:
+      - id: pyright
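With the hooks above configured, the pre-commit tool (assumed to be installed separately, e.g. via pip; it is not part of this change) wires ruff and pyright into git. A minimal sketch of how it would be used:

        pip install pre-commit
        pre-commit install            # run the hooks automatically on each commit
        pre-commit run --all-files    # one-off run across the whole repository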
@@ -1,4 +1,4 @@ | |||||
Copyright (c) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Copyright (c) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy | Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
of this software and associated documentation files (the "Software"), to deal | of this software and associated documentation files (the "Software"), to deal | ||||
@@ -1,50 +1,46 @@
This is a [copyright violation](https://en.wikipedia.org/wiki/WP:COPYVIO)
-detector running on [Wikimedia Cloud Services](https://copyvios.toolforge.org/).
+detector web tool for Wikipedia articles running on
+[Wikimedia Cloud Services](https://wikitech.wikimedia.org/wiki/Help:Cloud_Services_introduction)
+at [copyvios.toolforge.org](https://copyvios.toolforge.org/).
It can search the web for content similar to a given article, and graphically
-compare an article to a specific URL. Some technical details are expanded upon
-[in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html).
+compare an article to specific URLs. Some technical details are expanded upon
+[in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html),
+though much of it is outdated.

-Dependencies
+Installation
============

-* [earwigbot](https://github.com/earwig/earwigbot) >= 0.1
-* [flask](https://flask.palletsprojects.com/) >= 0.10.1
-* [flask-mako](https://pythonhosted.org/Flask-Mako/) >= 0.3
-* [mako](https://www.makotemplates.org/) >= 0.7.2
-* [mwparserfromhell](https://github.com/earwig/mwparserfromhell) >= 0.3
-* [oursql](https://pythonhosted.org/oursql/) >= 0.9.3.1
-* [requests](https://requests.readthedocs.io/) >= 2.9.1
-* [SQLAlchemy](https://www.sqlalchemy.org/) >= 0.9.6
-* [apsw](https://github.com/rogerbinns/apsw) >= 3.26.0
-* [uglifyjs](https://github.com/mishoo/UglifyJS) >= 3.12.6
-* [cssnano](https://github.com/cssnano/cssnano) >= 4.1.10
-* [postcss-cli](https://github.com/postcss/postcss-cli) >= 8.3.1
+- If using Toolforge, clone the repository to `~/www/python/src`, or otherwise
+  symlink it to that directory.

-Running
-=======
+- Create a virtual environment and install the dependencies. On Toolforge,
+  this should be in `~/www/python/venv`, otherwise it can be in a subdirectory
+  of the git project named `venv`:

+        python3 -m venv venv
+        . venv/bin/activate
+        pip install -e .

-- If using Toolforge, you should clone the repository to `~/www/python/src`, or
-  otherwise symlink it to that directory. A
-  [virtualenv](https://virtualenv.pypa.io/) should be created at
-  `~/www/python/venv`.
+- If you intend to modify CSS or JS, install the frontend dependencies:
-- Install all dependencies listed above.

+        npm install -g uglify-js cssnano postcss postcss-cli

-- Create an SQL database with the `cache` and `cache_data` tables defined by
-  [earwigbot-plugins](https://github.com/earwig/earwigbot-plugins/blob/develop/tasks/schema/afc_copyvios.sql).
+- Create an SQL database with the tables defined by `schema.sql`.

-- Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`). In
-  `.earwigbot/config.yml`, fill out the connection info for the database by
+- Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`).
+  In `.earwigbot/config.yml`, fill out the connection info for the database by
   adding the following to the `wiki` section:

-        _copyviosSQL:
+        copyvios:
+            engine: mysql
             host: <hostname of database server>
-            db: <name of database>
+            db: <name of database>

-  If additional arguments are needed by `oursql.connect()`, like usernames or
-  passwords, they should be added to the `_copyviosSQL` section.

+Running
+=======

-- Run `./build.py` to minify JS and CSS files.
+- Run `./build.py` to minify JS and CSS files after making any frontend
+  changes.

-- Start the web server (on Toolforge, `webservice uwsgi-python start`).
+- Start your WSGI server pointing to app:app.
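For the last step outside Toolforge, the choice of WSGI server is left open; one possible setup, assuming uWSGI is installed into the virtual environment created above (uWSGI itself is an assumption, not a listed dependency), is:

        # development only: uses Flask's built-in server via app.run()
        python app.py

        # example production invocation pointing at app:app
        uwsgi --http :8000 --virtualenv venv --module app:app --master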
@@ -1,10 +1,9 @@
#! /usr/bin/env python
-# -*- coding: utf-8 -*-
+import logging
from functools import wraps
from hashlib import md5
from json import dumps
-from logging import DEBUG, INFO, getLogger
from logging.handlers import TimedRotatingFileHandler
from os import path
from time import asctime
@@ -13,7 +12,7 @@ from traceback import format_exc
from earwigbot.bot import Bot
from earwigbot.wiki.copyvios import globalize
from flask import Flask, g, make_response, request
-from flask_mako import MakoTemplates, render_template, TemplateError
+from flask_mako import MakoTemplates, TemplateError, render_template
from copyvios.api import format_api_error, handle_api_request
from copyvios.checker import do_check
@@ -26,24 +25,27 @@ app = Flask(__name__)
MakoTemplates(app)
hand = TimedRotatingFileHandler("logs/app.log", when="midnight", backupCount=7)
-hand.setLevel(DEBUG)
+hand.setLevel(logging.DEBUG)
app.logger.addHandler(hand)
-app.logger.info(u"Flask server started " + asctime())
+app.logger.info("Flask server started " + asctime())
app._hash_cache = {}
def catch_errors(func):
    @wraps(func)
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except TemplateError as exc:
-            app.logger.error(u"Caught exception:\n{0}".format(exc.text))
+            app.logger.error(f"Caught exception:\n{exc.text}")
            return render_template("error.mako", traceback=exc.text)
        except Exception:
-            app.logger.exception(u"Caught exception:")
+            app.logger.exception("Caught exception:")
            return render_template("error.mako", traceback=format_exc())
    return inner
@app.before_first_request
def setup_app():
    cache.bot = Bot(".earwigbot", 100)
@@ -54,31 +56,43 @@ def setup_app():
    globalize(num_workers=8)
@app.before_request
def prepare_request():
    g._db = None
    g.cookies = parse_cookies(
-        request.script_root or "/", request.environ.get("HTTP_COOKIE"))
+        request.script_root or "/", request.environ.get("HTTP_COOKIE")
+    )
    g.new_cookies = []
@app.after_request
def add_new_cookies(response):
    for cookie in g.new_cookies:
        response.headers.add("Set-Cookie", cookie)
    return response
@app.after_request
def write_access_log(response):
-    msg = u"%s %s %s %s -> %s"
-    app.logger.debug(msg, asctime(), request.method, request.path,
-                     request.values.to_dict(), response.status_code)
+    msg = "%s %s %s %s -> %s"
+    app.logger.debug(
+        msg,
+        asctime(),
+        request.method,
+        request.path,
+        request.values.to_dict(),
+        response.status_code,
+    )
    return response
@app.teardown_appcontext
def close_databases(error):
    if g._db:
        g._db.close()
def external_url_handler(error, endpoint, values):
    if endpoint == "static" and "file" in values:
        fpath = path.join(app.static_folder, values["file"])
@@ -90,11 +104,13 @@ def external_url_handler(error, endpoint, values):
        with open(fpath, "rb") as f:
            hashstr = md5(f.read()).hexdigest()
        app._hash_cache[fpath] = (mtime, hashstr)
-        return "/static/{0}?v={1}".format(values["file"], hashstr)
+        return f"/static/{values['file']}?v={hashstr}"
    raise error
app.url_build_error_handlers.append(external_url_handler)
@app.route("/")
@catch_errors
def index():
@@ -102,8 +118,13 @@ def index():
    update_sites()
    query = do_check()
    return render_template(
-        "index.mako", notice=notice, query=query, result=query.result,
-        turnitin_result=query.turnitin_result)
+        "index.mako",
+        notice=notice,
+        query=query,
+        result=query.result,
+        turnitin_result=query.turnitin_result,
+    )
@app.route("/settings", methods=["GET", "POST"])
@catch_errors
@@ -111,15 +132,20 @@ def settings():
    status = process_settings() if request.method == "POST" else None
    update_sites()
    default = cache.bot.wiki.get_site()
-    kwargs = {"status": status, "default_lang": default.lang,
-              "default_project": default.project}
+    kwargs = {
+        "status": status,
+        "default_lang": default.lang,
+        "default_project": default.project,
+    }
    return render_template("settings.mako", **kwargs)
@app.route("/api")
@catch_errors
def api():
    return render_template("api.mako", help=True)
@app.route("/api.json")
@catch_errors
def api_json():
@@ -134,7 +160,7 @@ def api_json():
        except Exception as exc:
            result = format_api_error("unhandled_exception", exc)
    else:
-        errmsg = u"Unknown format: '{0}'".format(format)
+        errmsg = f"Unknown format: '{format}'"
        result = format_api_error("unknown_format", errmsg)
    if format == "jsonfm":
@@ -144,5 +170,6 @@ def api_json():
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp
-if __name__ == '__main__':
+if __name__ == "__main__":
    app.run()
@@ -1,13 +1,13 @@
#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-from __future__ import print_function
import os
import subprocess
def process(*args):
    print(*args)
-    content = subprocess.check_output(args)
+    subprocess.run(args, check=True)
def main():
    root = os.path.join(os.path.dirname(__file__), "static")
@@ -15,10 +15,25 @@ def main():
        for filename in filenames:
            name = os.path.relpath(os.path.join(dirpath, filename))
            if filename.endswith(".js") and ".min." not in filename:
-                process("uglifyjs", "--compress", "-o", name.replace(".js", ".min.js"), "--", name)
+                process(
+                    "uglifyjs",
+                    "--compress",
+                    "-o",
+                    name.replace(".js", ".min.js"),
+                    "--",
+                    name,
+                )
            if filename.endswith(".css") and ".min." not in filename:
-                process("postcss", "-u", "cssnano", "--no-map", name, "-o",
-                        name.replace(".css", ".min.css"))
+                process(
+                    "postcss",
+                    "-u",
+                    "cssnano",
+                    "--no-map",
+                    name,
+                    "-o",
+                    name.replace(".css", ".min.css"),
+                )
if __name__ == "__main__":
    main()
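As the README notes, `./build.py` shells out to the uglifyjs and postcss/cssnano tools installed in the frontend step; running it from the repository root regenerates the `.min.js` and `.min.css` files next to their sources under `static/`:

        ./build.py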
@@ -1 +0,0 @@
-# -*- coding: utf-8 -*-
@@ -0,0 +1,44 @@
+[project]
+name = "copyvios"
+version = "1.0.dev0"
+authors = [
+    {name = "Ben Kurtovic", email = "ben@benkurtovic.com"},
+]
+description = "A copyright violation detector web tool for Wikipedia articles"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "earwigbot[sql,copyvios] >= 0.4",
+    "mwparserfromhell >= 0.6",
+    "flask >= 3.0",
+    "flask-mako >= 0.4",
+    "mako >= 1.3.5",
+    "requests >= 2.32.3",
+    "SQLAlchemy >= 2.0.32",
+    "apsw >= 3.46.1",
+]
+
+[project.urls]
+Homepage = "https://github.com/earwig/copyvios"
+Issues = "https://github.com/earwig/copyvios/issues"
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.pyright]
+pythonVersion = "3.11"
+exclude = [
+    # TODO
+    "src/copyvios/*",
+    "app.py",
+]
+venvPath = "."
+venv = "venv"
+
+[tool.ruff]
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9", "F", "I", "UP"]
+ignore = ["F403"]
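Since the lint and type-check configuration now lives in `pyproject.toml`, the same checks run by the pre-commit hooks can also be invoked directly; a sketch, assuming `ruff` and `pyright` are installed in the active virtualenv:

        ruff check --fix .
        ruff format .
        pyright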
@@ -2,48 +2,59 @@
import argparse
import re
import sqlite3
+from typing import Any
REGEX = re.compile(
-    r'^'
-    r'{address space usage: (?P<used_bytes>-?\d+) bytes/(?P<used_mb>\w+)} '
-    r'{rss usage: (?P<rss_bytes>-?\d+) bytes/(?P<rss_mb>\w+)} '
-    r'\[pid: (?P<pid>\d+)\|app: -\|req: -/-\] (?P<ip>[0-9.]+) \(-\) '
-    r'{(?P<vars>\d+) vars in (?P<var_bytes>\d+) bytes} '
-    r'\[(?P<date>[0-9A-Za-z: ]+)\] (?P<method>\w+) (?P<url>.*?) => '
-    r'generated (?P<resp_bytes>\d+) bytes in (?P<msecs>\d+) msecs '
-    r'\((- http://hasty.ai)?(?P<proto>[A-Z0-9/.]+) (?P<status>\d+)\) '
-    r'(?P<headers>\d+) headers in (?P<header_bytes>\d+) bytes '
-    r'\((?P<switches>\d+) switches on core (?P<core>\d+)\) '
-    r'(?P<agent>.*?)'
-    r'( (?P<referer>https?://[^ ]*?))?( -)?( http(://|%3A%2F%2F)hasty\.ai)?'
-    r'$'
+    r"^"
+    r"{address space usage: (?P<used_bytes>-?\d+) bytes/(?P<used_mb>\w+)} "
+    r"{rss usage: (?P<rss_bytes>-?\d+) bytes/(?P<rss_mb>\w+)} "
+    r"\[pid: (?P<pid>\d+)\|app: -\|req: -/-\] (?P<ip>[0-9.]+) \(-\) "
+    r"{(?P<vars>\d+) vars in (?P<var_bytes>\d+) bytes} "
+    r"\[(?P<date>[0-9A-Za-z: ]+)\] (?P<method>\w+) (?P<url>.*?) => "
+    r"generated (?P<resp_bytes>\d+) bytes in (?P<msecs>\d+) msecs "
+    r"\((- http://hasty.ai)?(?P<proto>[A-Z0-9/.]+) (?P<status>\d+)\) "
+    r"(?P<headers>\d+) headers in (?P<header_bytes>\d+) bytes "
+    r"\((?P<switches>\d+) switches on core (?P<core>\d+)\) "
+    r"(?P<agent>.*?)"
+    r"( (?P<referer>https?://[^ ]*?))?( -)?( http(://|%3A%2F%2F)hasty\.ai)?"
+    r"$"
)
-def save_logs(logs):
+def save_logs(logs: list[dict[str, Any]]) -> None:
    columns = sorted(REGEX.groupindex, key=lambda col: REGEX.groupindex[col])
-    conn = sqlite3.Connection('logs.db')
+    conn = sqlite3.Connection("logs.db")
    cur = conn.cursor()
-    cur.execute('CREATE TABLE IF NOT EXISTS logs(%s)' % ', '.join(columns))
-    cur.executemany('INSERT INTO logs VALUES (%s)' % ', '.join(['?'] * len(columns)),
-                    [[log[col] for col in columns] for log in logs])
+    cur.execute(f"CREATE TABLE IF NOT EXISTS logs({', '.join(columns)})")
+    params = ", ".join(["?"] * len(columns))
+    cur.executemany(
+        f"INSERT INTO logs VALUES ({params})",
+        [[log[col] for col in columns] for log in logs],
+    )
    conn.commit()
    conn.close()
-def read_logs(path):
-    with open(path, 'r', errors='replace') as fp:
+def read_logs(path: str) -> list[dict[str, Any]]:
+    with open(path, errors="replace") as fp:
        lines = fp.readlines()
-    parsed = [(line, REGEX.match(line.strip())) for line in lines
-              if line.startswith('{address space usage')]
+    parsed = [
+        (line, REGEX.match(line.strip()))
+        for line in lines
+        if line.startswith("{address space usage")
+    ]
    for line, match in parsed:
        if not match:
-            print('failed to parse:', line.strip())
+            print("failed to parse:", line.strip())
    return [match.groupdict() for _, match in parsed if match]
def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument('logfile', default='uwsgi.log')
+    parser.add_argument("logfile", default="uwsgi.log")
    args = parser.parse_args()
    save_logs(read_logs(args.logfile))
-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
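For reference, a hypothetical end-to-end use of this log parser (the script's actual filename is not shown in this diff; `parse_logs.py` is assumed): parse a uWSGI log into `logs.db`, then query it with the sqlite3 shell, using column names taken from the regex's named groups:

        python parse_logs.py uwsgi.log
        sqlite3 logs.db "SELECT status, COUNT(*) FROM logs GROUP BY status ORDER BY 2 DESC"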
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
from collections import OrderedDict
+from .checker import T_POSSIBLE, T_SUSPECT, do_check
from .highlighter import highlight_delta
-from .checker import do_check, T_POSSIBLE, T_SUSPECT
from .misc import Query, cache
from .sites import update_sites
@@ -15,83 +13,107 @@ _CHECK_ERRORS = {
    "no URL": "The parameter 'url' is required for URL comparisons",
    "bad URI": "The given URI scheme is unsupported",
    "no data": "No text could be found in the given URL (note that only HTML "
-               "and plain text pages are supported, and content generated by "
-               "JavaScript or found inside iframes is ignored)",
+    "and plain text pages are supported, and content generated by "
+    "JavaScript or found inside iframes is ignored)",
    "timeout": "The given URL timed out before any data could be retrieved",
    "search error": "An error occurred while using the search engine; try "
-                    "reloading or setting 'use_engine' to 0",
+    "reloading or setting 'use_engine' to 0",
}
def _serialize_page(page):
    return OrderedDict((("title", page.title), ("url", page.url)))
def _serialize_source(source, show_skip=True):
    if not source:
-        return OrderedDict((
-            ("url", None), ("confidence", 0.0), ("violation", "none")))
+        return OrderedDict((("url", None), ("confidence", 0.0), ("violation", "none")))
    conf = source.confidence
-    data = OrderedDict((
-        ("url", source.url),
-        ("confidence", conf),
-        ("violation", "suspected" if conf >= T_SUSPECT else
-                      "possible" if conf >= T_POSSIBLE else "none")
-    ))
+    data = OrderedDict(
+        (
+            ("url", source.url),
+            ("confidence", conf),
+            (
+                "violation",
+                (
+                    "suspected"
+                    if conf >= T_SUSPECT
+                    else "possible"
+                    if conf >= T_POSSIBLE
+                    else "none"
+                ),
+            ),
+        )
+    )
    if show_skip:
        data["skipped"] = source.skipped
        data["excluded"] = source.excluded
    return data
def _serialize_detail(result):
    source_chain, delta = result.best.chains
    article = highlight_delta(None, result.article_chain, delta)
    source = highlight_delta(None, source_chain, delta)
    return OrderedDict((("article", article), ("source", source)))
def format_api_error(code, info):
    if isinstance(info, BaseException):
        info = type(info).__name__ + ": " + str(info)
-    elif isinstance(info, unicode):
-        info = info.encode("utf8")
    error_inner = OrderedDict((("code", code), ("info", info)))
    return OrderedDict((("status", "error"), ("error", error_inner)))
def _hook_default(query):
-    info = u"Unknown action: '{0}'".format(query.action.lower())
+    info = f"Unknown action: '{query.action.lower()}'"
    return format_api_error("unknown_action", info)
def _hook_check(query):
    do_check(query)
    if not query.submitted:
-        info = ("The query parameters 'project', 'lang', and either 'title' "
-                "or 'oldid' are required for checks")
+        info = (
+            "The query parameters 'project', 'lang', and either 'title' "
+            "or 'oldid' are required for checks"
+        )
        return format_api_error("missing_params", info)
    if query.error:
        info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
        return format_api_error(query.error.replace(" ", "_"), info)
    elif not query.site:
-        info = (u"The given site (project={0}, lang={1}) either doesn't exist,"
-                u" is closed, or is private").format(query.project, query.lang)
+        info = (
+            f"The given site (project={query.project}, lang={query.lang}) either doesn't exist,"
+            " is closed, or is private"
+        )
        return format_api_error("bad_site", info)
    elif not query.result:
        if query.oldid:
-            info = u"The revision ID couldn't be found: {0}"
+            info = "The revision ID couldn't be found: {0}"
            return format_api_error("bad_oldid", info.format(query.oldid))
        else:
-            info = u"The page couldn't be found: {0}"
+            info = "The page couldn't be found: {0}"
            return format_api_error("bad_title", info.format(query.page.title))
    result = query.result
-    data = OrderedDict((
-        ("status", "ok"),
-        ("meta", OrderedDict((
-            ("time", result.time),
-            ("queries", result.queries),
-            ("cached", result.cached),
-            ("redirected", bool(query.redirected_from))
-        ))),
-        ("page", _serialize_page(query.page))
-    ))
+    data = OrderedDict(
+        (
+            ("status", "ok"),
+            (
+                "meta",
+                OrderedDict(
+                    (
+                        ("time", result.time),
+                        ("queries", result.queries),
+                        ("cached", result.cached),
+                        ("redirected", bool(query.redirected_from)),
+                    )
+                ),
+            ),
+            ("page", _serialize_page(query.page)),
+        )
+    )
    if result.cached:
        data["meta"]["cache_time"] = result.cache_time
    if query.redirected_from:
@@ -102,11 +124,13 @@ def _hook_check(query):
    data["detail"] = _serialize_detail(result)
    return data
def _hook_sites(query):
    update_sites()
-    return OrderedDict((
-        ("status", "ok"), ("langs", cache.langs), ("projects", cache.projects)
-    ))
+    return OrderedDict(
+        (("status", "ok"), ("langs", cache.langs), ("projects", cache.projects))
+    )
_HOOKS = {
    "compare": _hook_check,
@@ -114,13 +138,14 @@ _HOOKS = {
    "sites": _hook_sites,
}
def handle_api_request():
    query = Query()
    if query.version:
        try:
            query.version = int(query.version)
        except ValueError:
-            info = "The version string is invalid: {0}".format(query.version)
+            info = f"The version string is invalid: {query.version}"
            return format_api_error("invalid_version", info)
    else:
        query.version = 1
@@ -129,5 +154,5 @@ def handle_api_request():
        action = query.action.lower() if query.action else ""
        return _HOOKS.get(action, _hook_default)(query)
-    info = "The API version is unsupported: {0}".format(query.version)
+    info = f"The API version is unsupported: {query.version}"
    return format_api_error("unsupported_version", info)
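These hooks back the public `api.json` endpoint; as an illustration using only parameters visible in this module (`version` and `action`), a `sites` query could look like:

        curl 'https://copyvios.toolforge.org/api.json?version=1&action=sites'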
@@ -1,20 +1,19 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
from earwigbot.wiki import NS_TEMPLATE
__all__ = ["get_attribution_info"]
ATTRIB_TEMPLATES = {
    "enwiki": {
-        "CC-notice", "Cc-notice",
+        "CC-notice",
+        "Cc-notice",
        "Citation-attribution",
-        "Free-content attribution", "Open-source attribution",
+        "Free-content attribution",
+        "Open-source attribution",
        "Source-attribution",
    }
}
def get_attribution_info(site, page):
    """Check to see if the given page has some kind of attribution info.
@@ -30,7 +29,7 @@ def get_attribution_info(site, page):
    for template in page.parse().ifilter_templates():
        if template.name.matches(templates):
-            name = unicode(template.name).strip()
+            name = str(template.name).strip()
            title = name if ":" in name else prefix + ":" + name
            return name, site.get_page(title).url
    return None
@@ -1,10 +1,10 @@
-# -*- coding: utf-8 -*-
-from datetime import datetime, timedelta
-from json import loads
import random
import re
-import urllib
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime, timedelta
+from json import loads
from earwigbot import exceptions
from flask import g
@@ -13,32 +13,39 @@ from .misc import cache
__all__ = ["set_background"]
def _get_commons_site():
    try:
        return cache.bot.wiki.get_site("commonswiki")
    except exceptions.SiteNotFoundError:
        return cache.bot.wiki.add_site(project="wikimedia", lang="commons")
def _load_file(site, filename):
    data = site.api_query(
-        action="query", prop="imageinfo", iiprop="url|size|canonicaltitle",
-        titles="File:" + filename)
-    res = data["query"]["pages"].values()[0]["imageinfo"][0]
-    name = res["canonicaltitle"][len("File:"):].replace(" ", "_")
+        action="query",
+        prop="imageinfo",
+        iiprop="url|size|canonicaltitle",
+        titles="File:" + filename,
+    )
+    res = list(data["query"]["pages"].values())[0]["imageinfo"][0]
+    name = res["canonicaltitle"][len("File:") :].replace(" ", "_")
    return name, res["url"], res["descriptionurl"], res["width"], res["height"]
def _get_fresh_potd():
    site = _get_commons_site()
    date = datetime.utcnow().strftime("%Y-%m-%d")
    page = site.get_page("Template:Potd/" + date)
-    regex = ur"\{\{Potd filename\|(?:1=)?(.*?)\|.*?\}\}"
+    regex = r"\{\{Potd filename\|(?:1=)?(.*?)\|.*?\}\}"
    filename = re.search(regex, page.get()).group(1)
    return _load_file(site, filename)
def _get_fresh_list():
    site = _get_commons_site()
    page = site.get_page("User:The Earwig/POTD")
-    regex = ur"\*\*?\s*\[\[:File:(.*?)\]\]"
+    regex = r"\*\*?\s*\[\[:File:(.*?)\]\]"
    filenames = re.findall(regex, page.get())
    # Ensure all workers share the same background each day:
@@ -46,6 +53,7 @@ def _get_fresh_list():
    filename = random.choice(filenames)
    return _load_file(site, filename)
def _build_url(screen, filename, url, imgwidth, imgheight):
    width = screen["width"]
    if float(imgwidth) / imgheight > float(screen["width"]) / screen["height"]:
@@ -53,12 +61,11 @@ def _build_url(screen, filename, url, imgwidth, imgheight):
    if width >= imgwidth:
        return url
    url = url.replace("/commons/", "/commons/thumb/")
-    return "%s/%dpx-%s" % (url, width, urllib.quote(filename.encode("utf8")))
+    return "%s/%dpx-%s" % (url, width, urllib.parse.quote(filename.encode("utf8")))
+_BACKGROUNDS = {"potd": _get_fresh_potd, "list": _get_fresh_list}
-_BACKGROUNDS = {
-    "potd": _get_fresh_potd,
-    "list": _get_fresh_list
-}
def _get_background(selected):
    if not cache.last_background_updates:
@@ -73,6 +80,7 @@ def _get_background(selected):
        cache.last_background_updates[selected] = datetime.utcnow().date()
    return cache.background_data[selected]
def set_background(selected):
    if "CopyviosScreenCache" in g.cookies:
        screen_cache = g.cookies["CopyviosScreenCache"].value
@@ -1,17 +1,15 @@
-# -*- coding: utf-8 -*-
+import re
from datetime import datetime, timedelta
from hashlib import sha256
from logging import getLogger
-import re
-from urlparse import urlparse
+from urllib.parse import urlparse
from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
-from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
-from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
+from .misc import Query, get_cursor, get_db, get_sql_error, sql_dialect
from .sites import get_site
from .turnitin import search_turnitin
@@ -22,9 +20,11 @@ T_SUSPECT = 0.75
_LOGGER = getLogger("copyvios.checker")
def _coerce_bool(val):
    return val and val not in ("0", "false")
def do_check(query=None):
    if not query:
        query = Query()
@@ -44,6 +44,7 @@ def do_check(query=None):
        _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query
def _get_results(query, follow=True):
    if query.oldid:
        if not re.match(r"^\d+$", query.oldid):
@@ -100,8 +101,9 @@ def _get_results(query, follow=True):
            degree = int(query.degree)
        except ValueError:
            pass
-        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
-                                      max_time=10, degree=degree)
+        result = page.copyvio_compare(
+            query.url, min_confidence=T_SUSPECT, max_time=10, degree=degree
+        )
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 10 else "no data"
            return
@@ -110,12 +112,18 @@ def _get_results(query, follow=True):
    else:
        query.error = "bad action"
def _get_page_by_revid(site, revid):
    try:
-        res = site.api_query(action="query", prop="info|revisions", revids=revid,
-                             rvprop="content|timestamp", inprop="protection|url",
-                             rvslots="main")
-        page_data = res["query"]["pages"].values()[0]
+        res = site.api_query(
+            action="query",
+            prop="info|revisions",
+            revids=revid,
+            rvprop="content|timestamp",
+            inprop="protection|url",
+            rvslots="main",
+        )
+        page_data = list(res["query"]["pages"].values())[0]
        title = page_data["title"]
        # Only need to check that these exist:
        revision = page_data["revisions"][0]
@@ -131,24 +139,30 @@ def _get_page_by_revid(site, revid):
    page._load_content(res)
    return page
def _perform_check(query, page, use_engine, use_links):
    conn = get_db()
    sql_error = get_sql_error()
-    mode = "{0}:{1}:".format(use_engine, use_links)
+    mode = f"{use_engine}:{use_links}:"
    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(
-                page, conn, mode, _coerce_bool(query.noskip))
+                page, conn, mode, _coerce_bool(query.noskip)
+            )
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")
    if not query.result:
        try:
            query.result = page.copyvio_check(
-                min_confidence=T_SUSPECT, max_queries=8, max_time=30,
-                no_searches=not use_engine, no_links=not use_links,
-                short_circuit=not query.noskip)
+                min_confidence=T_SUSPECT,
+                max_queries=8,
+                max_time=30,
+                no_searches=not use_engine,
+                no_links=not use_links,
+                short_circuit=not query.noskip,
+            )
        except exceptions.SearchQueryError as exc:
            query.error = "search error"
            query.exception = exc
@@ -159,6 +173,7 @@ def _perform_check(query, page, use_engine, use_links):
        except sql_error:
            _LOGGER.exception("Failed to cache results")
def _get_cached_results(page, conn, mode, noskip):
    query1 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
@@ -167,7 +182,7 @@ def _get_cached_results(page, conn, mode, noskip):
    query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
-    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
    cursor = conn.cursor()
    cursor.execute(query1, (cache_id,))
@@ -186,8 +201,9 @@ def _get_cached_results(page, conn, mode, noskip):
    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
-        result = CopyvioCheckResult(False, [], queries, check_time,
-                                    article_chain, possible_miss)
+        result = CopyvioCheckResult(
+            False, [], queries, check_time, article_chain, possible_miss
+        )
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
@@ -216,8 +232,11 @@ def _get_cached_results(page, conn, mode, noskip):
    result.cache_age = _format_date(cache_time)
    return result
def _format_date(cache_time):
-    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
+    def formatter(n, w):
+        return "{} {}{}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
@@ -226,23 +245,34 @@ def _format_date(cache_time):
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")
def _cache_result(page, result, conn, mode):
-    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
-                         sqlite="STRFTIME('%s', 'now', '-3 days')")
+    expiry = sql_dialect(
+        mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
+        sqlite="STRFTIME('%s', 'now', '-3 days')",
+    )
    query1 = "DELETE FROM cache WHERE cache_id = ?"
-    query2 = "DELETE FROM cache WHERE cache_time < %s" % expiry
+    query2 = f"DELETE FROM cache WHERE cache_time < {expiry}"
    query3 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                cache_possible_miss) VALUES (?, ?, ?, ?)"""
    query4 = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                cdata_confidence, cdata_skipped,
                cdata_excluded) VALUES (?, ?, ?, ?, ?)"""
-    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
-    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
-             source.excluded)
-            for source in result.sources]
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
+    data = [
+        (
+            cache_id,
+            source.url[:1024],
+            source.confidence,
+            source.skipped,
+            source.excluded,
+        )
+        for source in result.sources
+    ]
    with get_cursor(conn) as cursor:
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2)
-        cursor.execute(query3, (cache_id, result.queries, result.time,
-                                result.possible_miss))
+        cursor.execute(
+            query3, (cache_id, result.queries, result.time, result.possible_miss)
+        )
        cursor.executemany(query4, data)
@@ -1,39 +1,38 @@
-# -*- coding: utf-8 -*-
import base64
-from Cookie import CookieError, SimpleCookie
from datetime import datetime, timedelta
+from http.cookies import CookieError, SimpleCookie
from flask import g
__all__ = ["parse_cookies", "set_cookie", "delete_cookie"]
class _CookieManager(SimpleCookie):
    MAGIC = "--cpv2"
    def __init__(self, path, cookies):
        self._path = path
        try:
-            super(_CookieManager, self).__init__(cookies)
+            super().__init__(cookies)
        except CookieError:
-            super(_CookieManager, self).__init__()
-        for cookie in self.keys():
+            super().__init__()
+        for cookie in list(self.keys()):
            if self[cookie].value is False:
                del self[cookie]
    def value_decode(self, value):
-        unquoted = super(_CookieManager, self).value_decode(value)[0]
+        unquoted = super().value_decode(value)[0]
        try:
            decoded = base64.b64decode(unquoted).decode("utf8")
        except (TypeError, UnicodeDecodeError):
            return False, "False"
        if decoded.startswith(self.MAGIC):
-            return decoded[len(self.MAGIC):], value
+            return decoded[len(self.MAGIC) :], value
        return False, "False"
    def value_encode(self, value):
        encoded = base64.b64encode(self.MAGIC + value.encode("utf8"))
-        quoted = super(_CookieManager, self).value_encode(encoded)[1]
+        quoted = super().value_encode(encoded)[1]
        return value, quoted
    @property
@@ -44,6 +43,7 @@ class _CookieManager(SimpleCookie):
def parse_cookies(path, cookies):
    return _CookieManager(path, cookies)
def set_cookie(key, value, days=0):
    g.cookies[key] = value
    if days:
@@ -53,6 +53,7 @@ def set_cookie(key, value, days=0):
    g.cookies[key]["path"] = g.cookies.path
    g.new_cookies.append(g.cookies[key].OutputString())
def delete_cookie(key):
-    set_cookie(key, u"", days=-1)
+    set_cookie(key, "", days=-1)
    del g.cookies[key]
@@ -1,13 +1,12 @@
-# -*- coding: utf-8 -*-
from collections import deque
-from re import sub, UNICODE
+from re import UNICODE, sub
from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
from markupsafe import escape
__all__ = ["highlight_delta"]
def highlight_delta(context, chain, delta):
    degree = chain.degree - 1
    highlights = [False] * degree
@@ -18,7 +17,7 @@ def highlight_delta(context, chain, delta):
            word = _strip_word(chain, word)
            block.append(word)
            if tuple(block) in delta.chain:
-                highlights[-1 * degree:] = [True] * degree
+                highlights[-1 * degree :] = [True] * degree
                highlights.append(True)
            else:
                highlights.append(False)
@@ -38,11 +37,12 @@ def highlight_delta(context, chain, delta):
                last = i - degree + 1 == numwords
                words.append(_highlight_word(word, before, after, first, last))
            else:
-                words.append(unicode(escape(word)))
-        result.append(u" ".join(words))
+                words.append(str(escape(word)))
+        result.append(" ".join(words))
        i += 1
-    return u"<br /><br />".join(result)
+    return "<br /><br />".join(result)
def _get_next(paragraphs):
    body = []
@@ -58,41 +58,44 @@ def _get_next(paragraphs):
            break
    return body
def _highlight_word(word, before, after, first, last):
    if before and after:
        # Word is in the middle of a highlighted block:
-        res = unicode(escape(word))
+        res = str(escape(word))
        if first:
-            res = u'<span class="cv-hl">' + res
+            res = '<span class="cv-hl">' + res
        if last:
-            res += u'</span>'
+            res += "</span>"
    elif after:
        # Word is the first in a highlighted block:
-        res = u'<span class="cv-hl">' + _fade_word(word, u"in")
+        res = '<span class="cv-hl">' + _fade_word(word, "in")
        if last:
-            res += u"</span>"
+            res += "</span>"
    elif before:
        # Word is the last in a highlighted block:
-        res = _fade_word(word, u"out") + u"</span>"
+        res = _fade_word(word, "out") + "</span>"
        if first:
-            res = u'<span class="cv-hl">' + res
+            res = '<span class="cv-hl">' + res
    else:
-        res = unicode(escape(word))
+        res = str(escape(word))
    return res
def _fade_word(word, dir):
    if len(word) <= 4:
-        word = unicode(escape(word))
-        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
-    if dir == u"out":
-        before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
-        base = u'{0}<span class="cv-hl-out">{1}</span>'
+        word = str(escape(word))
+        return f'<span class="cv-hl-{dir}">{word}</span>'
+    if dir == "out":
+        before, after = str(escape(word[:-4])), str(escape(word[-4:]))
+        base = '{0}<span class="cv-hl-out">{1}</span>'
        return base.format(before, after)
    else:
-        before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
-        base = u'<span class="cv-hl-in">{0}</span>{1}'
+        before, after = str(escape(word[:4])), str(escape(word[4:]))
+        base = '<span class="cv-hl-in">{0}</span>{1}'
        return base.format(before, after)
def _strip_word(chain, word):
    if word == chain.START or word == chain.END:
        return word
@@ -1,19 +1,18 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from contextlib import contextmanager | |||||
import datetime | import datetime | ||||
from contextlib import contextmanager | |||||
from os.path import expanduser, join | from os.path import expanduser, join | ||||
import apsw | import apsw | ||||
from flask import g, request | |||||
import oursql | import oursql | ||||
from flask import g, request | |||||
from sqlalchemy.pool import manage | from sqlalchemy.pool import manage | ||||
oursql = manage(oursql) | oursql = manage(oursql) | ||||
__all__ = ["Query", "cache", "get_db", "get_notice", "httpsfix", "urlstrip"] | __all__ = ["Query", "cache", "get_db", "get_notice", "httpsfix", "urlstrip"] | ||||
class Query(object): | |||||
class Query: | |||||
def __init__(self, method="GET"): | def __init__(self, method="GET"): | ||||
self.query = {} | self.query = {} | ||||
data = request.form if method == "POST" else request.args | data = request.form if method == "POST" else request.args | ||||
@@ -25,14 +24,14 @@ class Query(object): | |||||
def __setattr__(self, key, value): | def __setattr__(self, key, value): | ||||
if key == "query": | if key == "query": | ||||
super(Query, self).__setattr__(key, value) | |||||
super().__setattr__(key, value) | |||||
else: | else: | ||||
self.query[key] = value | self.query[key] = value | ||||
class _AppCache(object): | |||||
class _AppCache: | |||||
def __init__(self): | def __init__(self): | ||||
super(_AppCache, self).__setattr__("_data", {}) | |||||
super().__setattr__("_data", {}) | |||||
def __getattr__(self, key): | def __getattr__(self, key): | ||||
return self._data[key] | return self._data[key] | ||||
@@ -43,6 +42,7 @@ class _AppCache(object): | |||||
cache = _AppCache() | cache = _AppCache() | ||||
def _connect_to_db(engine, args): | def _connect_to_db(engine, args): | ||||
if engine == "mysql": | if engine == "mysql": | ||||
args["read_default_file"] = expanduser("~/.my.cnf") | args["read_default_file"] = expanduser("~/.my.cnf") | ||||
@@ -54,15 +54,17 @@ def _connect_to_db(engine, args): | |||||
conn = apsw.Connection(dbpath) | conn = apsw.Connection(dbpath) | ||||
conn.cursor().execute("PRAGMA foreign_keys = ON") | conn.cursor().execute("PRAGMA foreign_keys = ON") | ||||
return conn | return conn | ||||
raise ValueError("Unknown engine: %s" % engine) | |||||
raise ValueError(f"Unknown engine: {engine}") | |||||
def get_db(): | def get_db(): | ||||
if not g._db: | if not g._db: | ||||
args = cache.bot.config.wiki["_copyviosSQL"].copy() | |||||
args = cache.bot.config.wiki["copyvios"].copy() | |||||
g._engine = engine = args.pop("engine", "mysql").lower() | g._engine = engine = args.pop("engine", "mysql").lower() | ||||
g._db = _connect_to_db(engine, args) | g._db = _connect_to_db(engine, args) | ||||
return g._db | return g._db | ||||
@contextmanager | @contextmanager | ||||
def get_cursor(conn): | def get_cursor(conn): | ||||
if g._engine == "mysql": | if g._engine == "mysql": | ||||
@@ -72,21 +74,24 @@ def get_cursor(conn): | |||||
with conn: | with conn: | ||||
yield conn.cursor() | yield conn.cursor() | ||||
else: | else: | ||||
raise ValueError("Unknown engine: %s" % g._engine) | |||||
raise ValueError(f"Unknown engine: {g._engine}") | |||||
def get_sql_error(): | def get_sql_error(): | ||||
if g._engine == "mysql": | if g._engine == "mysql": | ||||
return oursql.Error | return oursql.Error | ||||
if g._engine == "sqlite": | if g._engine == "sqlite": | ||||
return apsw.Error | return apsw.Error | ||||
raise ValueError("Unknown engine: %s" % g._engine) | |||||
raise ValueError(f"Unknown engine: {g._engine}") | |||||
def sql_dialect(mysql, sqlite): | def sql_dialect(mysql, sqlite): | ||||
if g._engine == "mysql": | if g._engine == "mysql": | ||||
return mysql | return mysql | ||||
if g._engine == "sqlite": | if g._engine == "sqlite": | ||||
return sqlite | return sqlite | ||||
raise ValueError("Unknown engine: %s" % g._engine) | |||||
raise ValueError(f"Unknown engine: {g._engine}") | |||||
def get_notice(): | def get_notice(): | ||||
try: | try: | ||||
@@ -95,16 +100,19 @@ def get_notice(): | |||||
if lines[0] == "<!-- active -->": | if lines[0] == "<!-- active -->": | ||||
return "\n".join(lines[1:]) | return "\n".join(lines[1:]) | ||||
return None | return None | ||||
except IOError: | |||||
except OSError: | |||||
return None | return None | ||||
def httpsfix(context, url): | def httpsfix(context, url): | ||||
if url.startswith("http://"): | if url.startswith("http://"): | ||||
url = url[len("http:"):] | |||||
url = url[len("http:") :] | |||||
return url | return url | ||||
def parse_wiki_timestamp(timestamp): | def parse_wiki_timestamp(timestamp): | ||||
return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S') | |||||
return datetime.datetime.strptime(timestamp, "%Y%m%d%H%M%S") | |||||
def urlstrip(context, url): | def urlstrip(context, url): | ||||
if url.startswith("http://"): | if url.startswith("http://"): |
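The helpers in the hunks above (`get_cursor`, `get_sql_error`, `sql_dialect`) all dispatch on `g._engine`, which `get_db` now reads from the `copyvios` wiki config instead of `_copyviosSQL`. A minimal caller sketch, assuming in-package imports; the cache table and column names are placeholders, not taken from this change:

    from .misc import get_cursor, get_db, sql_dialect

    def save_cache_entry(cache_id, cache_data):
        # Pick engine-appropriate SQL; both oursql (MySQL) and apsw (SQLite)
        # accept "?" placeholders, so only the UPSERT syntax differs here.
        query = sql_dialect(
            mysql="REPLACE INTO cache (cache_id, cache_data) VALUES (?, ?)",
            sqlite="INSERT OR REPLACE INTO cache (cache_id, cache_data) VALUES (?, ?)",
        )
        with get_cursor(get_db()) as cursor:
            cursor.execute(query, (cache_id, cache_data))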
@@ -1,13 +1,12 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from flask import g | from flask import g | ||||
from markupsafe import escape | from markupsafe import escape | ||||
from .cookies import set_cookie, delete_cookie | |||||
from .cookies import delete_cookie, set_cookie | |||||
from .misc import Query | from .misc import Query | ||||
__all__ = ["process_settings"] | __all__ = ["process_settings"] | ||||
def process_settings(): | def process_settings(): | ||||
query = Query(method="POST") | query = Query(method="POST") | ||||
if query.action == "set": | if query.action == "set": | ||||
@@ -18,6 +17,7 @@ def process_settings(): | |||||
status = None | status = None | ||||
return status | return status | ||||
def _do_set(query): | def _do_set(query): | ||||
cookies = g.cookies | cookies = g.cookies | ||||
changes = set() | changes = set() | ||||
@@ -39,18 +39,19 @@ def _do_set(query): | |||||
changes.add("background") | changes.add("background") | ||||
if changes: | if changes: | ||||
changes = ", ".join(sorted(list(changes))) | changes = ", ".join(sorted(list(changes))) | ||||
return "Updated {0}.".format(changes) | |||||
return f"Updated {changes}." | |||||
return None | return None | ||||
def _do_delete(query): | def _do_delete(query): | ||||
cookies = g.cookies | cookies = g.cookies | ||||
if query.cookie in cookies: | if query.cookie in cookies: | ||||
delete_cookie(query.cookie.encode("utf8")) | delete_cookie(query.cookie.encode("utf8")) | ||||
template = u'Deleted cookie <b><span class="mono">{0}</span></b>.' | |||||
template = 'Deleted cookie <b><span class="mono">{0}</span></b>.' | |||||
return template.format(escape(query.cookie)) | return template.format(escape(query.cookie)) | ||||
elif query.all: | elif query.all: | ||||
number = len(cookies) | number = len(cookies) | ||||
for cookie in cookies.values(): | |||||
for cookie in list(cookies.values()): | |||||
delete_cookie(cookie.key) | delete_cookie(cookie.key) | ||||
return "Deleted <b>{0}</b> cookies.".format(number) | |||||
return f"Deleted <b>{number}</b> cookies." | |||||
return None | return None |
@@ -1,7 +1,5 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from time import time | from time import time | ||||
from urlparse import urlparse | |||||
from urllib.parse import urlparse | |||||
from earwigbot import exceptions | from earwigbot import exceptions | ||||
@@ -9,6 +7,7 @@ from .misc import cache | |||||
__all__ = ["get_site", "update_sites"] | __all__ = ["get_site", "update_sites"] | ||||
def get_site(query): | def get_site(query): | ||||
lang, project, name = query.lang, query.project, query.name | lang, project, name = query.lang, query.project, query.name | ||||
wiki = cache.bot.wiki | wiki = cache.bot.wiki | ||||
@@ -24,11 +23,13 @@ def get_site(query): | |||||
except exceptions.SiteNotFoundError: | except exceptions.SiteNotFoundError: | ||||
return _add_site(lang, project) | return _add_site(lang, project) | ||||
def update_sites(): | def update_sites(): | ||||
if time() - cache.last_sites_update > 60 * 60 * 24 * 7: | if time() - cache.last_sites_update > 60 * 60 * 24 * 7: | ||||
cache.langs, cache.projects = _load_sites() | cache.langs, cache.projects = _load_sites() | ||||
cache.last_sites_update = time() | cache.last_sites_update = time() | ||||
def _add_site(lang, project): | def _add_site(lang, project): | ||||
update_sites() | update_sites() | ||||
if not any(project == item[0] for item in cache.projects): | if not any(project == item[0] for item in cache.projects): | ||||
@@ -40,12 +41,13 @@ def _add_site(lang, project): | |||||
except (exceptions.APIError, exceptions.LoginError): | except (exceptions.APIError, exceptions.LoginError): | ||||
return None | return None | ||||
def _load_sites(): | def _load_sites(): | ||||
site = cache.bot.wiki.get_site() | site = cache.bot.wiki.get_site() | ||||
matrix = site.api_query(action="sitematrix")["sitematrix"] | matrix = site.api_query(action="sitematrix")["sitematrix"] | ||||
del matrix["count"] | del matrix["count"] | ||||
langs, projects = set(), set() | langs, projects = set(), set() | ||||
for site in matrix.itervalues(): | |||||
for site in matrix.values(): | |||||
if isinstance(site, list): # Special sites | if isinstance(site, list): # Special sites | ||||
bad_sites = ["closed", "private", "fishbowl"] | bad_sites = ["closed", "private", "fishbowl"] | ||||
for special in site: | for special in site: | ||||
@@ -55,19 +57,19 @@ def _load_sites(): | |||||
lang, project = "www", full.split(".")[0] | lang, project = "www", full.split(".")[0] | ||||
else: | else: | ||||
lang, project = full.rsplit(".", 2)[:2] | lang, project = full.rsplit(".", 2)[:2] | ||||
code = u"{0}::{1}".format(lang, special["dbname"]) | |||||
code = "{}::{}".format(lang, special["dbname"]) | |||||
name = special["code"].capitalize() | name = special["code"].capitalize() | ||||
langs.add((code, u"{0} ({1})".format(lang, name))) | |||||
langs.add((code, f"{lang} ({name})")) | |||||
projects.add((project, project.capitalize())) | projects.add((project, project.capitalize())) | ||||
else: | else: | ||||
this = set() | this = set() | ||||
for web in site["site"]: | for web in site["site"]: | ||||
if "closed" in web: | if "closed" in web: | ||||
continue | continue | ||||
proj = "wikipedia" if web["code"] == u"wiki" else web["code"] | |||||
proj = "wikipedia" if web["code"] == "wiki" else web["code"] | |||||
this.add((proj, proj.capitalize())) | this.add((proj, proj.capitalize())) | ||||
if this: | if this: | ||||
code = site["code"] | code = site["code"] | ||||
langs.add((code, u"{0} ({1})".format(code, site["name"]))) | |||||
langs.add((code, "{} ({})".format(code, site["name"]))) | |||||
projects |= this | projects |= this | ||||
return list(sorted(langs)), list(sorted(projects)) | return list(sorted(langs)), list(sorted(projects)) |
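As the final return above shows, `_load_sites` (now iterating `matrix.values()` instead of `itervalues()`) produces two sorted lists of two-item tuples: `(code, label)` pairs for languages and `(project, capitalized name)` pairs for projects, which `update_sites` caches on `cache.langs` and `cache.projects`. A shape sketch with made-up entries:

    # Hypothetical output shape of _load_sites(); the concrete values are illustrative.
    langs = [
        ("de", "de (Deutsch)"),
        ("en", "en (English)"),
        ("www::wikidatawiki", "www (Wikidata)"),
    ]
    projects = [
        ("wikipedia", "Wikipedia"),
        ("wiktionary", "Wiktionary"),
    ]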
@@ -1,17 +1,17 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from ast import literal_eval | |||||
import re | import re | ||||
from ast import literal_eval | |||||
import requests | import requests | ||||
from .misc import parse_wiki_timestamp | from .misc import parse_wiki_timestamp | ||||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||||
__all__ = ["search_turnitin", "TURNITIN_API_ENDPOINT"] | |||||
TURNITIN_API_ENDPOINT = "https://eranbot.toolforge.org/plagiabot/api.py" | |||||
TURNITIN_API_ENDPOINT = 'https://eranbot.toolforge.org/plagiabot/api.py' | |||||
def search_turnitin(page_title, lang): | def search_turnitin(page_title, lang): | ||||
""" Search the Plagiabot database for Turnitin reports for a page. | |||||
"""Search the Plagiabot database for Turnitin reports for a page. | |||||
Keyword arguments: | Keyword arguments: | ||||
page_title -- string containing the page title | page_title -- string containing the page title | ||||
@@ -21,14 +21,16 @@ def search_turnitin(page_title, lang): | |||||
""" | """ | ||||
return TurnitinResult(_make_api_request(page_title, lang)) | return TurnitinResult(_make_api_request(page_title, lang)) | ||||
def _make_api_request(page_title, lang): | def _make_api_request(page_title, lang): | ||||
""" Query the plagiabot API for Turnitin reports for a given page. | |||||
""" | |||||
stripped_page_title = page_title.replace(' ', '_') | |||||
api_parameters = {'action': 'suspected_diffs', | |||||
'page_title': stripped_page_title, | |||||
'lang': lang, | |||||
'report': 1} | |||||
"""Query the plagiabot API for Turnitin reports for a given page.""" | |||||
stripped_page_title = page_title.replace(" ", "_") | |||||
api_parameters = { | |||||
"action": "suspected_diffs", | |||||
"page_title": stripped_page_title, | |||||
"lang": lang, | |||||
"report": 1, | |||||
} | |||||
result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters, verify=False) | result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters, verify=False) | ||||
# use literal_eval to *safely* parse the resulting dict-containing string | # use literal_eval to *safely* parse the resulting dict-containing string | ||||
@@ -38,14 +40,16 @@ def _make_api_request(page_title, lang): | |||||
parsed_api_result = [] | parsed_api_result = [] | ||||
return parsed_api_result | return parsed_api_result | ||||
class TurnitinResult(object): | |||||
""" Container class for TurnitinReports. Each page may have zero or | |||||
class TurnitinResult: | |||||
"""Container class for TurnitinReports. Each page may have zero or | |||||
more reports of plagiarism. The list will have multiple | more reports of plagiarism. The list will have multiple | ||||
TurnitinReports if plagiarism has been detected for more than one | TurnitinReports if plagiarism has been detected for more than one | ||||
revision. | revision. | ||||
TurnitinResult.reports -- list containing >= 0 TurnitinReport items | TurnitinResult.reports -- list containing >= 0 TurnitinReport items | ||||
""" | """ | ||||
def __init__(self, turnitin_data): | def __init__(self, turnitin_data): | ||||
""" | """ | ||||
Keyword argument: | Keyword argument: | ||||
@@ -54,14 +58,16 @@ class TurnitinResult(object): | |||||
self.reports = [] | self.reports = [] | ||||
for item in turnitin_data: | for item in turnitin_data: | ||||
report = TurnitinReport( | report = TurnitinReport( | ||||
item['diff_timestamp'], item['diff'], item['report']) | |||||
item["diff_timestamp"], item["diff"], item["report"] | |||||
) | |||||
self.reports.append(report) | self.reports.append(report) | ||||
def __repr__(self): | def __repr__(self): | ||||
return str(self.__dict__) | return str(self.__dict__) | ||||
class TurnitinReport(object): | |||||
""" Contains data for each Turnitin report (one on each potentially | |||||
class TurnitinReport: | |||||
"""Contains data for each Turnitin report (one on each potentially | |||||
plagiarized revision). | plagiarized revision). | ||||
TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot | TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot | ||||
@@ -72,6 +78,7 @@ class TurnitinReport(object): | |||||
words -- number of words found in both source and revision | words -- number of words found in both source and revision | ||||
url -- url for the possibly-plagiarized source | url -- url for the possibly-plagiarized source | ||||
""" | """ | ||||
def __init__(self, timestamp, diffid, report): | def __init__(self, timestamp, diffid, report): | ||||
""" | """ | ||||
Keyword argument: | Keyword argument: | ||||
@@ -86,9 +93,7 @@ class TurnitinReport(object): | |||||
self.sources = [] | self.sources = [] | ||||
for item in self.report_data[1]: | for item in self.report_data[1]: | ||||
source = {'percent': item[0], | |||||
'words': item[1], | |||||
'url': item[2]} | |||||
source = {"percent": item[0], "words": item[1], "url": item[2]} | |||||
self.sources.append(source) | self.sources.append(source) | ||||
def __repr__(self): | def __repr__(self): | ||||
@@ -96,12 +101,11 @@ class TurnitinReport(object): | |||||
def _parse_report(self, report_text): | def _parse_report(self, report_text): | ||||
# extract report ID | # extract report ID | ||||
report_id_pattern = re.compile(r'\?rid=(\d*)') | |||||
report_id_pattern = re.compile(r"\?rid=(\d*)") | |||||
report_id = report_id_pattern.search(report_text).groups()[0] | report_id = report_id_pattern.search(report_text).groups()[0] | ||||
# extract percent match, words, and URL for each source in the report | # extract percent match, words, and URL for each source in the report | ||||
extract_info_pattern = re.compile( | |||||
r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') | |||||
extract_info_pattern = re.compile(r"\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ") | |||||
results = extract_info_pattern.findall(report_text) | results = extract_info_pattern.findall(report_text) | ||||
return (report_id, results) | return (report_id, results) |
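The hunks above reformat what appears to be the Turnitin integration module without touching its public surface (`search_turnitin`, `TURNITIN_API_ENDPOINT`). A usage sketch, assuming the package imports as `copyvios.turnitin` and that the plagiabot endpoint returns rows in the shape `_make_api_request` expects; the page title and language code are placeholders:

    from copyvios.turnitin import search_turnitin

    result = search_turnitin("Example article", "en")  # placeholder inputs
    for report in result.reports:
        # reportid is documented in the TurnitinReport docstring above.
        print(report.reportid)
        for source in report.sources:
            # Keys match the source dicts built in TurnitinReport.__init__ above.
            print(source["percent"], source["words"], source["url"])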
@@ -1 +1 @@ | |||||
h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{padding:0 .25em;background-color:#eee}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{font-family:sans-serif;font-size:1.17em;color:#fff}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline} | |||||
h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{background-color:#eee;padding:0 .25em}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{color:#fff;font-family:sans-serif;font-size:1.17em}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline} |
@@ -1 +1 @@ | |||||
function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{data=JSON.parse(cookie);var width=data.width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,expires){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];expires=expires?"; expires="+expires.toUTCString():"",document.cookie=name+"="+value+expires+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()}); | |||||
function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{var width=(data=JSON.parse(cookie)).width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,date){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];date=date?"; expires="+date.toUTCString():"",document.cookie=name+"="+value+date+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()}); |