Initial Python 3 conversion

há 2 meses · 1f893911eb
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,7 @@
 *.egg-info
 .DS_Store
 __pycache__

 venv
 .earwigbot
 logs/*
 !logs/.gitinclude
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
 repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.2
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
  - repo: https://github.com/RobertCraigie/pyright-python
    rev: v1.1.377
    hooks:
      - id: pyright
--- a/+ 1
+++ b/+ 1
@@ -1,4 +1,4 @@
 Copyright (c) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 Copyright (c) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@@ -1,50 +1,46 @@
 This is a [copyright violation](https://en.wikipedia.org/wiki/WP:COPYVIO)
 detector web tool for Wikipedia articles running on
 [Wikimedia Cloud Services](https://wikitech.wikimedia.org/wiki/Help:Cloud_Services_introduction)
 at [copyvios.toolforge.org](https://copyvios.toolforge.org/).

 It can search the web for content similar to a given article, and graphically
 compare an article to specific URLs. Some technical details are expanded upon
 [in a blog post](https://benkurtovic.com/2014/08/20/copyvio-detector.html),
 though much of it is outdated.

 Dependencies
 Installation
 ============

 * [earwigbot](https://github.com/earwig/earwigbot) >= 0.1
 * [flask](https://flask.palletsprojects.com/) >= 0.10.1
 * [flask-mako](https://pythonhosted.org/Flask-Mako/) >= 0.3
 * [mako](https://www.makotemplates.org/) >= 0.7.2
 * [mwparserfromhell](https://github.com/earwig/mwparserfromhell) >= 0.3
 * [oursql](https://pythonhosted.org/oursql/) >= 0.9.3.1
 * [requests](https://requests.readthedocs.io/) >= 2.9.1
 * [SQLAlchemy](https://www.sqlalchemy.org/) >= 0.9.6
 * [apsw](https://github.com/rogerbinns/apsw) >= 3.26.0
 * [uglifyjs](https://github.com/mishoo/UglifyJS) >= 3.12.6
 * [cssnano](https://github.com/cssnano/cssnano) >= 4.1.10
 * [postcss-cli](https://github.com/postcss/postcss-cli) >= 8.3.1
 - If using Toolforge, clone the repository to `~/www/python/src`, or otherwise
  symlink it to that directory.

 Running
 =======
 - Create a virtual environment and install the dependencies. On Toolforge,
  this should be in `~/www/python/venv`, otherwise it can be in a subdirectory
  of the git project named `venv`:

    python3 -m venv venv
    . venv/bin/activate
    pip install -e .

 - If using Toolforge, you should clone the repository to `~/www/python/src`, or
  otherwise symlink it to that directory. A
  [virtualenv](https://virtualenv.pypa.io/) should be created at
  `~/www/python/venv`.
 - If you intend to modify CSS or JS, install the frontend dependencies:

 - Install all dependencies listed above.
    npm install -g uglify-js cssnano postcss postcss-cli

 - Create an SQL database with the `cache` and `cache_data` tables defined by
  [earwigbot-plugins](https://github.com/earwig/earwigbot-plugins/blob/develop/tasks/schema/afc_copyvios.sql).
 - Create an SQL database with the tables defined by `schema.sql`.

 - Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`). In
  `.earwigbot/config.yml`, fill out the connection info for the database by
 - Create an earwigbot instance in `.earwigbot` (run `earwigbot .earwigbot`).
  In `.earwigbot/config.yml`, fill out the connection info for the database by
  adding the following to the `wiki` section:

        _copyviosSQL:
        copyvios:
            engine: mysql
            host: <hostname of database server>
            db:   <name of database>
            db: <name of database>

  If additional arguments are needed by `oursql.connect()`, like usernames or
  passwords, they should be added to the `_copyviosSQL` section.
 Running
 =======

 - Run `./build.py` to minify JS and CSS files after making any frontend
  changes.

 - Start the web server (on Toolforge, `webservice uwsgi-python start`).
 - Start your WSGI server pointing to `app:app`.
--- a/app.py
+++ b/app.py
@@ -1,10 +1,9 @@
 #! /usr/bin/env python
 # -*- coding: utf-8  -*-

 import logging
 from functools import wraps
 from hashlib import md5
 from json import dumps
 from logging import DEBUG, INFO, getLogger
 from logging.handlers import TimedRotatingFileHandler
 from os import path
 from time import asctime
@@ -13,7 +12,7 @@ from traceback import format_exc
 from earwigbot.bot import Bot
 from earwigbot.wiki.copyvios import globalize
 from flask import Flask, g, make_response, request
 from flask_mako import MakoTemplates, render_template, TemplateError
 from flask_mako import MakoTemplates, TemplateError, render_template

 from copyvios.api import format_api_error, handle_api_request
 from copyvios.checker import do_check
@@ -26,24 +25,27 @@ app = Flask(__name__)
 MakoTemplates(app)

 # Rotate the application log at midnight, keeping seven rotated files.
 hand = TimedRotatingFileHandler("logs/app.log", when="midnight", backupCount=7)
 hand.setLevel(logging.DEBUG)
 app.logger.addHandler(hand)
 app.logger.info("Flask server started " + asctime())
 # Maps a static file path to its (mtime, md5 hex digest) pair.
 app._hash_cache = {}


 def catch_errors(func):
    """Wrap a view so that exceptions render the error page instead of propagating.

    A Mako ``TemplateError`` is logged and rendered using its ``text``
    attribute; any other exception is logged with its traceback and rendered
    using ``format_exc()``. Each failure is logged exactly once.
    """

    @wraps(func)
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except TemplateError as exc:
            app.logger.error(f"Caught exception:\n{exc.text}")
            return render_template("error.mako", traceback=exc.text)
        except Exception:
            # Last-resort boundary for a request: log, then show the error page.
            app.logger.exception("Caught exception:")
            return render_template("error.mako", traceback=format_exc())

    return inner


@app.before_first_request
 def setup_app():
    cache.bot = Bot(".earwigbot", 100)
@@ -54,31 +56,43 @@ def setup_app():

    globalize(num_workers=8)


@app.before_request
def prepare_request():
    """Reset per-request state on ``g`` before each request is handled.

    Clears the database handle, parses the incoming ``HTTP_COOKIE`` header
    relative to the script root (``"/"`` when empty), and starts an empty list
    of cookies to be set on the response.
    """
    g._db = None
    g.cookies = parse_cookies(
        request.script_root or "/", request.environ.get("HTTP_COOKIE")
    )
    g.new_cookies = []


@app.after_request
def add_new_cookies(response):
    """Attach every cookie queued in ``g.new_cookies`` to the outgoing response."""
    headers = response.headers
    for new_cookie in g.new_cookies:
        headers.add("Set-Cookie", new_cookie)
    return response


@app.after_request
def write_access_log(response):
    """Write one DEBUG access-log line per request and pass the response through.

    The line records the current time, HTTP method, path, request parameters
    and the response status code.
    """
    app.logger.debug(
        "%s %s %s %s -> %s",
        asctime(),
        request.method,
        request.path,
        request.values.to_dict(),
        response.status_code,
    )
    return response


@app.teardown_appcontext
def close_databases(error):
    """Close the database connection stored on ``g``, if there is one.

    ``error`` is supplied by Flask and is not used here.
    """
    # Teardown may run for an app context in which prepare_request() never
    # executed, so do not assume the attribute has been set.
    db = getattr(g, "_db", None)
    if db:
        db.close()


 def external_url_handler(error, endpoint, values):
    if endpoint == "static" and "file" in values:
        fpath = path.join(app.static_folder, values["file"])
@@ -90,11 +104,13 @@ def external_url_handler(error, endpoint, values):
            with open(fpath, "rb") as f:
                hashstr = md5(f.read()).hexdigest()
            app._hash_cache[fpath] = (mtime, hashstr)
        return "/static/{0}?v={1}".format(values["file"], hashstr)
        return f"/static/{values['file']}?v={hashstr}"
    raise error


 app.url_build_error_handlers.append(external_url_handler)


@app.route("/")
@catch_errors
 def index():
@@ -102,8 +118,13 @@ def index():
    update_sites()
    query = do_check()
    return render_template(
        "index.mako", notice=notice, query=query, result=query.result,
        turnitin_result=query.turnitin_result)
        "index.mako",
        notice=notice,
        query=query,
        result=query.result,
        turnitin_result=query.turnitin_result,
    )


@app.route("/settings", methods=["GET", "POST"])
@catch_errors
@@ -111,15 +132,20 @@ def settings():
    status = process_settings() if request.method == "POST" else None
    update_sites()
    default = cache.bot.wiki.get_site()
    kwargs = {"status": status, "default_lang": default.lang,
              "default_project": default.project}
    kwargs = {
        "status": status,
        "default_lang": default.lang,
        "default_project": default.project,
    }
    return render_template("settings.mako", **kwargs)


@app.route("/api")
@catch_errors
def api():
    """Render the API page in help mode."""
    template_args = {"help": True}
    return render_template("api.mako", **template_args)


@app.route("/api.json")
@catch_errors
 def api_json():
@@ -134,7 +160,7 @@ def api_json():
        except Exception as exc:
            result = format_api_error("unhandled_exception", exc)
    else:
        errmsg = u"Unknown format: '{0}'".format(format)
        errmsg = f"Unknown format: '{format}'"
        result = format_api_error("unknown_format", errmsg)

    if format == "jsonfm":
@@ -144,5 +170,6 @@ def api_json():
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp

 if __name__ == "__main__":
    # Start Flask's built-in server when this module is executed directly.
    app.run()
--- a/build.py
+++ b/build.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python
 # -*- coding: utf-8  -*-

 from __future__ import print_function
 import os
 import subprocess


 def process(*args):
    """Echo a command line, then run it exactly once.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    print(*args)
    subprocess.run(args, check=True)


 def main():
    root = os.path.join(os.path.dirname(__file__), "static")
@@ -15,10 +15,25 @@ def main():
        for filename in filenames:
            name = os.path.relpath(os.path.join(dirpath, filename))
            if filename.endswith(".js") and ".min." not in filename:
                process("uglifyjs", "--compress", "-o", name.replace(".js", ".min.js"), "--", name)
                process(
                    "uglifyjs",
                    "--compress",
                    "-o",
                    name.replace(".js", ".min.js"),
                    "--",
                    name,
                )
            if filename.endswith(".css") and ".min." not in filename:
                process("postcss", "-u", "cssnano", "--no-map", name, "-o",
                        name.replace(".css", ".min.css"))
                process(
                    "postcss",
                    "-u",
                    "cssnano",
                    "--no-map",
                    name,
                    "-o",
                    name.replace(".css", ".min.css"),
                )


 # Only run the build when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/copyvios/init.py
+++ b/copyvios/init.py
@@ -1 +0,0 @@
 # -*- coding: utf-8  -*-
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,44 @@
 [project]
 name = "copyvios"
 version = "1.0.dev0"
 authors = [
    {name = "Ben Kurtovic", email = "ben@benkurtovic.com"},
 ]
 description = "A copyright violation detector web tool for Wikipedia articles"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
    "earwigbot[sql,copyvios] >= 0.4",
    "mwparserfromhell >= 0.6",
    "flask >= 3.0",
    "flask-mako >= 0.4",
    "mako >= 1.3.5",
    "requests >= 2.32.3",
    "SQLAlchemy >= 2.0.32",
    "apsw >= 3.46.1",
 ]

 [project.urls]
 Homepage = "https://github.com/earwig/copyvios"
 Issues = "https://github.com/earwig/copyvios/issues"

 [build-system]
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"

 [tool.pyright]
 pythonVersion = "3.11"
 exclude = [
    # TODO
    "src/copyvios/*",
    "app.py",
 ]
 venvPath = "."
 venv = "venv"

 [tool.ruff]
 target-version = "py311"

 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F", "I", "UP"]
 ignore = ["F403"]
--- a/scripts/log_analyzer.py
+++ b/scripts/log_analyzer.py
@@ -2,48 +2,59 @@
 import argparse
 import re
 import sqlite3
 from typing import Any

 # Matches one request line of the log; every named group becomes a column in
 # the ``logs`` table, in the order the groups appear here.
 REGEX = re.compile(
    r"^"
    r"{address space usage: (?P<used_bytes>-?\d+) bytes/(?P<used_mb>\w+)} "
    r"{rss usage: (?P<rss_bytes>-?\d+) bytes/(?P<rss_mb>\w+)} "
    r"\[pid: (?P<pid>\d+)\|app: -\|req: -/-\] (?P<ip>[0-9.]+) \(-\) "
    r"{(?P<vars>\d+) vars in (?P<var_bytes>\d+) bytes} "
    r"\[(?P<date>[0-9A-Za-z: ]+)\] (?P<method>\w+) (?P<url>.*?) => "
    r"generated (?P<resp_bytes>\d+) bytes in (?P<msecs>\d+) msecs "
    r"\((- http://hasty.ai)?(?P<proto>[A-Z0-9/.]+) (?P<status>\d+)\) "
    r"(?P<headers>\d+) headers in (?P<header_bytes>\d+) bytes "
    r"\((?P<switches>\d+) switches on core (?P<core>\d+)\) "
    r"(?P<agent>.*?)"
    r"( (?P<referer>https?://[^ ]*?))?( -)?( http(://|%3A%2F%2F)hasty\.ai)?"
    r"$"
 )

 def save_logs(logs: list[dict[str, Any]], db_path: str = "logs.db") -> None:
    """Insert parsed log records into the ``logs`` table of a SQLite database.

    Args:
        logs: Records keyed by the named groups of ``REGEX``.
        db_path: SQLite database file to write to; the table is created if it
            does not exist yet.
    """
    # Column order follows the position of each named group in the pattern.
    columns = sorted(REGEX.groupindex, key=lambda col: REGEX.groupindex[col])
    placeholders = ", ".join(["?"] * len(columns))
    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back on
        # error; it does not close the connection, hence the ``finally``.
        with conn:
            conn.execute(f"CREATE TABLE IF NOT EXISTS logs({', '.join(columns)})")
            conn.executemany(
                f"INSERT INTO logs VALUES ({placeholders})",
                [[log[col] for col in columns] for log in logs],
            )
    finally:
        conn.close()

 def read_logs(path: str) -> list[dict[str, Any]]:
    """Parse the request lines of a log file into dictionaries.

    Only lines starting with ``{address space usage`` are considered. Lines
    that start that way but do not match ``REGEX`` are reported on stdout and
    skipped.

    Args:
        path: Log file to read; undecodable bytes are replaced.

    Returns:
        One ``groupdict()`` per successfully parsed line, in file order.
    """
    records = []
    with open(path, errors="replace") as fp:
        # Stream the file instead of loading every line into memory first.
        for line in fp:
            if not line.startswith("{address space usage"):
                continue
            match = REGEX.match(line.strip())
            if match:
                records.append(match.groupdict())
            else:
                print("failed to parse:", line.strip())
    return records


 def main():
    """Parse the command line, read the given log file and store its records."""
    parser = argparse.ArgumentParser()
    # nargs="?" makes the positional optional; without it the default is
    # never used because argparse requires the argument.
    parser.add_argument("logfile", nargs="?", default="uwsgi.log")
    args = parser.parse_args()
    save_logs(read_logs(args.logfile))

 if __name__ == "__main__":
    main()
--- a/+ 0
+++ b/+ 0
--- a/src/copyvios/api.py
+++ b/src/copyvios/api.py
@@ -1,9 +1,7 @@
 # -*- coding: utf-8  -*-

 from collections import OrderedDict

 from .checker import T_POSSIBLE, T_SUSPECT, do_check
 from .highlighter import highlight_delta
 from .checker import do_check, T_POSSIBLE, T_SUSPECT
 from .misc import Query, cache
 from .sites import update_sites

@@ -15,83 +13,107 @@ _CHECK_ERRORS = {
    "no URL": "The parameter 'url' is required for URL comparisons",
    "bad URI": "The given URI scheme is unsupported",
    "no data": "No text could be found in the given URL (note that only HTML "
               "and plain text pages are supported, and content generated by "
               "JavaScript or found inside iframes is ignored)",
    "and plain text pages are supported, and content generated by "
    "JavaScript or found inside iframes is ignored)",
    "timeout": "The given URL timed out before any data could be retrieved",
    "search error": "An error occurred while using the search engine; try "
                    "reloading or setting 'use_engine' to 0",
    "reloading or setting 'use_engine' to 0",
 }


 def _serialize_page(page):
    return OrderedDict((("title", page.title), ("url", page.url)))


 def _serialize_source(source, show_skip=True):
    """Serialize a source into an ordered mapping.

    A falsy ``source`` yields a placeholder with no URL, zero confidence and
    violation ``"none"``. Otherwise the violation level is ``"suspected"`` at
    or above ``T_SUSPECT``, ``"possible"`` at or above ``T_POSSIBLE``, and
    ``"none"`` below that. When ``show_skip`` is true the ``skipped`` and
    ``excluded`` flags are included as well.
    """
    if not source:
        return OrderedDict((("url", None), ("confidence", 0.0), ("violation", "none")))

    conf = source.confidence
    if conf >= T_SUSPECT:
        violation = "suspected"
    elif conf >= T_POSSIBLE:
        violation = "possible"
    else:
        violation = "none"

    data = OrderedDict(
        (("url", source.url), ("confidence", conf), ("violation", violation))
    )
    if show_skip:
        data["skipped"] = source.skipped
        data["excluded"] = source.excluded
    return data


 def _serialize_detail(result):
    """Return the highlighted article and source text for the best match.

    Both texts are produced by ``highlight_delta`` from the article chain and
    the best source's chain, using the same delta.
    """
    source_chain, delta = result.best.chains
    highlighted = OrderedDict()
    highlighted["article"] = highlight_delta(None, result.article_chain, delta)
    highlighted["source"] = highlight_delta(None, source_chain, delta)
    return highlighted


 def format_api_error(code, info):
    """Build the API error payload.

    Args:
        code: Short machine-readable error code.
        info: Human-readable message, or an exception, which is rendered as
            ``"<ExceptionType>: <message>"``.

    Returns:
        An ordered mapping with ``status`` set to ``"error"`` and an ``error``
        mapping holding ``code`` and ``info``.
    """
    if isinstance(info, BaseException):
        info = f"{type(info).__name__}: {info}"
    error_inner = OrderedDict((("code", code), ("info", info)))
    return OrderedDict((("status", "error"), ("error", error_inner)))


 def _hook_default(query):
    """Fallback hook: report that the requested action is not recognized."""
    # The action may be empty or missing when this fallback is reached, so
    # normalize it before lowercasing instead of calling .lower() directly.
    action = (query.action or "").lower()
    info = f"Unknown action: '{action}'"
    return format_api_error("unknown_action", info)


 def _hook_check(query):
    do_check(query)
    if not query.submitted:
        info = ("The query parameters 'project', 'lang', and either 'title' "
                "or 'oldid' are required for checks")
        info = (
            "The query parameters 'project', 'lang', and either 'title' "
            "or 'oldid' are required for checks"
        )
        return format_api_error("missing_params", info)
    if query.error:
        info = _CHECK_ERRORS.get(query.error, "An unknown error occurred")
        return format_api_error(query.error.replace(" ", "_"), info)
    elif not query.site:
        info = (u"The given site (project={0}, lang={1}) either doesn't exist,"
                u" is closed, or is private").format(query.project, query.lang)
        info = (
            f"The given site (project={query.project}, lang={query.lang}) either doesn't exist,"
            " is closed, or is private"
        )
        return format_api_error("bad_site", info)
    elif not query.result:
        if query.oldid:
            info = u"The revision ID couldn't be found: {0}"
            info = "The revision ID couldn't be found: {0}"
            return format_api_error("bad_oldid", info.format(query.oldid))
        else:
            info = u"The page couldn't be found: {0}"
            info = "The page couldn't be found: {0}"
            return format_api_error("bad_title", info.format(query.page.title))

    result = query.result
    data = OrderedDict((
        ("status", "ok"),
        ("meta", OrderedDict((
            ("time", result.time),
            ("queries", result.queries),
            ("cached", result.cached),
            ("redirected", bool(query.redirected_from))
        ))),
        ("page", _serialize_page(query.page))
    ))
    data = OrderedDict(
        (
            ("status", "ok"),
            (
                "meta",
                OrderedDict(
                    (
                        ("time", result.time),
                        ("queries", result.queries),
                        ("cached", result.cached),
                        ("redirected", bool(query.redirected_from)),
                    )
                ),
            ),
            ("page", _serialize_page(query.page)),
        )
    )
    if result.cached:
        data["meta"]["cache_time"] = result.cache_time
    if query.redirected_from:
@@ -102,11 +124,13 @@ def _hook_check(query):
        data["detail"] = _serialize_detail(result)
    return data


 def _hook_sites(query):
    """Handle the ``sites`` action: refresh the site lists and return them.

    ``query`` is accepted for a uniform hook signature and is not used.
    """
    update_sites()
    return OrderedDict(
        (("status", "ok"), ("langs", cache.langs), ("projects", cache.projects))
    )


 _HOOKS = {
    "compare": _hook_check,
@@ -114,13 +138,14 @@ _HOOKS = {
    "sites": _hook_sites,
 }


 def handle_api_request():
    query = Query()
    if query.version:
        try:
            query.version = int(query.version)
        except ValueError:
            info = "The version string is invalid: {0}".format(query.version)
            info = f"The version string is invalid: {query.version}"
            return format_api_error("invalid_version", info)
    else:
        query.version = 1
@@ -129,5 +154,5 @@ def handle_api_request():
        action = query.action.lower() if query.action else ""
        return _HOOKS.get(action, _hook_default)(query)

    info = "The API version is unsupported: {0}".format(query.version)
    info = f"The API version is unsupported: {query.version}"
    return format_api_error("unsupported_version", info)
--- a/src/copyvios/attribution.py
+++ b/src/copyvios/attribution.py
@@ -1,20 +1,19 @@
 # -*- coding: utf-8  -*-

 from __future__ import unicode_literals

 from earwigbot.wiki import NS_TEMPLATE

 __all__ = ["get_attribution_info"]

 # Attribution template names, keyed by wiki name.
 ATTRIB_TEMPLATES = {
    "enwiki": {
        "CC-notice",
        "Cc-notice",
        "Citation-attribution",
        "Free-content attribution",
        "Open-source attribution",
        "Source-attribution",
    }
 }


 def get_attribution_info(site, page):
    """Check to see if the given page has some kind of attribution info.

@@ -30,7 +29,7 @@ def get_attribution_info(site, page):

    for template in page.parse().ifilter_templates():
        if template.name.matches(templates):
            name = unicode(template.name).strip()
            name = str(template.name).strip()
            title = name if ":" in name else prefix + ":" + name
            return name, site.get_page(title).url
    return None
--- a/src/copyvios/background.py
+++ b/src/copyvios/background.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8  -*-

 from datetime import datetime, timedelta
 from json import loads
 import random
 import re
 import urllib
 import urllib.error
 import urllib.parse
 import urllib.request
 from datetime import datetime, timedelta
 from json import loads

 from earwigbot import exceptions
 from flask import g
@@ -13,32 +13,39 @@ from .misc import cache

 __all__ = ["set_background"]


 def _get_commons_site():
    """Return the ``commonswiki`` site, adding it to the bot when it is unknown."""
    try:
        commons = cache.bot.wiki.get_site("commonswiki")
    except exceptions.SiteNotFoundError:
        commons = cache.bot.wiki.add_site(project="wikimedia", lang="commons")
    return commons


 def _load_file(site, filename):
    data = site.api_query(
        action="query", prop="imageinfo", iiprop="url|size|canonicaltitle",
        titles="File:" + filename)
    res = data["query"]["pages"].values()[0]["imageinfo"][0]
    name = res["canonicaltitle"][len("File:"):].replace(" ", "_")
        action="query",
        prop="imageinfo",
        iiprop="url|size|canonicaltitle",
        titles="File:" + filename,
    )
    res = list(data["query"]["pages"].values())[0]["imageinfo"][0]
    name = res["canonicaltitle"][len("File:") :].replace(" ", "_")
    return name, res["url"], res["descriptionurl"], res["width"], res["height"]


 def _get_fresh_potd():
    """Load file information for the file named on today's ``Template:Potd`` page.

    The page is ``Template:Potd/<YYYY-MM-DD>`` for the current UTC date, and
    the file name is taken from its ``{{Potd filename|...}}`` template.
    """
    site = _get_commons_site()
    date = datetime.utcnow().strftime("%Y-%m-%d")
    page = site.get_page("Template:Potd/" + date)
    regex = r"\{\{Potd filename\|(?:1=)?(.*?)\|.*?\}\}"
    # NOTE(review): raises AttributeError when the page has no matching
    # template; confirm whether callers expect that.
    filename = re.search(regex, page.get()).group(1)
    return _load_file(site, filename)


 def _get_fresh_list():
    site = _get_commons_site()
    page = site.get_page("User:The Earwig/POTD")
    regex = ur"\*\*?\s*\[\[:File:(.*?)\]\]"
    regex = r"\*\*?\s*\[\[:File:(.*?)\]\]"
    filenames = re.findall(regex, page.get())

    # Ensure all workers share the same background each day:
@@ -46,6 +53,7 @@ def _get_fresh_list():
    filename = random.choice(filenames)
    return _load_file(site, filename)


 def _build_url(screen, filename, url, imgwidth, imgheight):
    width = screen["width"]
    if float(imgwidth) / imgheight > float(screen["width"]) / screen["height"]:
@@ -53,12 +61,11 @@ def _build_url(screen, filename, url, imgwidth, imgheight):
    if width >= imgwidth:
        return url
    url = url.replace("/commons/", "/commons/thumb/")
    return "%s/%dpx-%s" % (url, width, urllib.quote(filename.encode("utf8")))
    return "%s/%dpx-%s" % (url, width, urllib.parse.quote(filename.encode("utf8")))


 # Maps a background name to the function that fetches fresh data for it.
 _BACKGROUNDS = {"potd": _get_fresh_potd, "list": _get_fresh_list}

 def _get_background(selected):
    if not cache.last_background_updates:
@@ -73,6 +80,7 @@ def _get_background(selected):
        cache.last_background_updates[selected] = datetime.utcnow().date()
    return cache.background_data[selected]


 def set_background(selected):
    if "CopyviosScreenCache" in g.cookies:
        screen_cache = g.cookies["CopyviosScreenCache"].value
--- a/src/copyvios/checker.py
+++ b/src/copyvios/checker.py
@@ -1,17 +1,15 @@
 # -*- coding: utf-8  -*-

 import re
 from datetime import datetime, timedelta
 from hashlib import sha256
 from logging import getLogger
 import re
 from urlparse import urlparse
 from urllib.parse import urlparse

 from earwigbot import exceptions
 from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser
 from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

 from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
 from .misc import Query, get_cursor, get_db, get_sql_error, sql_dialect
 from .sites import get_site
 from .turnitin import search_turnitin

@@ -22,9 +20,11 @@ T_SUSPECT = 0.75

 _LOGGER = getLogger("copyvios.checker")


 def _coerce_bool(val):
    return val and val not in ("0", "false")


 def do_check(query=None):
    if not query:
        query = Query()
@@ -44,6 +44,7 @@ def do_check(query=None):
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query


 def _get_results(query, follow=True):
    if query.oldid:
        if not re.match(r"^\d+$", query.oldid):
@@ -100,8 +101,9 @@ def _get_results(query, follow=True):
                degree = int(query.degree)
            except ValueError:
                pass
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=10, degree=degree)
        result = page.copyvio_compare(
            query.url, min_confidence=T_SUSPECT, max_time=10, degree=degree
        )
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 10 else "no data"
            return
@@ -110,12 +112,18 @@ def _get_results(query, follow=True):
    else:
        query.error = "bad action"


 def _get_page_by_revid(site, revid):
    try:
        res = site.api_query(action="query", prop="info|revisions", revids=revid,
                             rvprop="content|timestamp", inprop="protection|url",
                             rvslots="main")
        page_data = res["query"]["pages"].values()[0]
        res = site.api_query(
            action="query",
            prop="info|revisions",
            revids=revid,
            rvprop="content|timestamp",
            inprop="protection|url",
            rvslots="main",
        )
        page_data = list(res["query"]["pages"].values())[0]
        title = page_data["title"]
        # Only need to check that these exist:
        revision = page_data["revisions"][0]
@@ -131,24 +139,30 @@ def _get_page_by_revid(site, revid):
    page._load_content(res)
    return page


 def _perform_check(query, page, use_engine, use_links):
    conn = get_db()
    sql_error = get_sql_error()
    mode = "{0}:{1}:".format(use_engine, use_links)
    mode = f"{use_engine}:{use_links}:"

    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
                page, conn, mode, _coerce_bool(query.noskip)
            )
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")

    if not query.result:
        try:
            query.result = page.copyvio_check(
                min_confidence=T_SUSPECT, max_queries=8, max_time=30,
                no_searches=not use_engine, no_links=not use_links,
                short_circuit=not query.noskip)
                min_confidence=T_SUSPECT,
                max_queries=8,
                max_time=30,
                no_searches=not use_engine,
                no_links=not use_links,
                short_circuit=not query.noskip,
            )
        except exceptions.SearchQueryError as exc:
            query.error = "search error"
            query.exception = exc
@@ -159,6 +173,7 @@ def _perform_check(query, page, use_engine, use_links):
        except sql_error:
            _LOGGER.exception("Failed to cache results")


 def _get_cached_results(page, conn, mode, noskip):
    query1 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
@@ -167,7 +182,7 @@ def _get_cached_results(page, conn, mode, noskip):
    query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    cache_id = sha256(mode + page.get().encode("utf8")).digest()

    cursor = conn.cursor()
    cursor.execute(query1, (cache_id,))
@@ -186,8 +201,9 @@ def _get_cached_results(page, conn, mode, noskip):

    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result = CopyvioCheckResult(
            False, [], queries, check_time, article_chain, possible_miss
        )
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
@@ -216,8 +232,11 @@ def _get_cached_results(page, conn, mode, noskip):
    result.cache_age = _format_date(cache_time)
    return result


 def _format_date(cache_time):
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    def formatter(n, w):
        return "{} {}{}".format(n, w, "" if n == 1 else "s")

    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
@@ -226,23 +245,34 @@ def _format_date(cache_time):
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")


 def _cache_result(page, result, conn, mode):
    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
                         sqlite="STRFTIME('%s', 'now', '-3 days')")
    expiry = sql_dialect(
        mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
        sqlite="STRFTIME('%s', 'now', '-3 days')",
    )
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_time < %s" % expiry
    query2 = f"DELETE FROM cache WHERE cache_time < {expiry}"
    query3 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                   cache_possible_miss) VALUES (?, ?, ?, ?)"""
    query4 = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                        cdata_confidence, cdata_skipped,
                                        cdata_excluded) VALUES (?, ?, ?, ?, ?)"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
             source.excluded)
            for source in result.sources]
    cache_id = sha256(mode + page.get().encode("utf8")).digest()
    data = [
        (
            cache_id,
            source.url[:1024],
            source.confidence,
            source.skipped,
            source.excluded,
        )
        for source in result.sources
    ]
    with get_cursor(conn) as cursor:
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2)
        cursor.execute(query3, (cache_id, result.queries, result.time,
                                result.possible_miss))
        cursor.execute(
            query3, (cache_id, result.queries, result.time, result.possible_miss)
        )
        cursor.executemany(query4, data)
--- a/src/copyvios/cookies.py
+++ b/src/copyvios/cookies.py
@@ -1,39 +1,38 @@
 # -*- coding: utf-8  -*-

 import base64
 from Cookie import CookieError, SimpleCookie
 from datetime import datetime, timedelta
 from http.cookies import CookieError, SimpleCookie

 from flask import g

 __all__ = ["parse_cookies", "set_cookie", "delete_cookie"]


 class _CookieManager(SimpleCookie):
    MAGIC = "--cpv2"

    def __init__(self, path, cookies):
        self._path = path
        try:
            super(_CookieManager, self).__init__(cookies)
            super().__init__(cookies)
        except CookieError:
            super(_CookieManager, self).__init__()
        for cookie in self.keys():
            super().__init__()
        for cookie in list(self.keys()):
            if self[cookie].value is False:
                del self[cookie]

    def value_decode(self, value):
        unquoted = super(_CookieManager, self).value_decode(value)[0]
        unquoted = super().value_decode(value)[0]
        try:
            decoded = base64.b64decode(unquoted).decode("utf8")
        except (TypeError, UnicodeDecodeError):
            return False, "False"
        if decoded.startswith(self.MAGIC):
            return decoded[len(self.MAGIC):], value
            return decoded[len(self.MAGIC) :], value
        return False, "False"

    def value_encode(self, value):
        encoded = base64.b64encode(self.MAGIC + value.encode("utf8"))
        quoted = super(_CookieManager, self).value_encode(encoded)[1]
        quoted = super().value_encode(encoded)[1]
        return value, quoted

    @property
@@ -44,6 +43,7 @@ class _CookieManager(SimpleCookie):
 def parse_cookies(path, cookies):
    return _CookieManager(path, cookies)


 def set_cookie(key, value, days=0):
    g.cookies[key] = value
    if days:
@@ -53,6 +53,7 @@ def set_cookie(key, value, days=0):
    g.cookies[key]["path"] = g.cookies.path
    g.new_cookies.append(g.cookies[key].OutputString())


 def delete_cookie(key):
    set_cookie(key, u"", days=-1)
    set_cookie(key, "", days=-1)
    del g.cookies[key]
--- a/src/copyvios/highlighter.py
+++ b/src/copyvios/highlighter.py
@@ -1,13 +1,12 @@
 # -*- coding: utf-8  -*-

 from collections import deque
 from re import sub, UNICODE
 from re import UNICODE, sub

 from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
 from markupsafe import escape

 __all__ = ["highlight_delta"]


 def highlight_delta(context, chain, delta):
    degree = chain.degree - 1
    highlights = [False] * degree
@@ -18,7 +17,7 @@ def highlight_delta(context, chain, delta):
        word = _strip_word(chain, word)
        block.append(word)
        if tuple(block) in delta.chain:
            highlights[-1 * degree:] = [True] * degree
            highlights[-1 * degree :] = [True] * degree
            highlights.append(True)
        else:
            highlights.append(False)
@@ -38,11 +37,12 @@ def highlight_delta(context, chain, delta):
                last = i - degree + 1 == numwords
                words.append(_highlight_word(word, before, after, first, last))
            else:
                words.append(unicode(escape(word)))
        result.append(u" ".join(words))
                words.append(str(escape(word)))
        result.append(" ".join(words))
        i += 1

    return u"<br /><br />".join(result)
    return "<br /><br />".join(result)


 def _get_next(paragraphs):
    body = []
@@ -58,41 +58,44 @@ def _get_next(paragraphs):
                break
    return body


 def _highlight_word(word, before, after, first, last):
    if before and after:
        # Word is in the middle of a highlighted block:
        res = unicode(escape(word))
        res = str(escape(word))
        if first:
            res = u'<span class="cv-hl">' + res
            res = '<span class="cv-hl">' + res
        if last:
            res += u'</span>'
            res += "</span>"
    elif after:
        # Word is the first in a highlighted block:
        res = u'<span class="cv-hl">' + _fade_word(word, u"in")
        res = '<span class="cv-hl">' + _fade_word(word, "in")
        if last:
            res += u"</span>"
            res += "</span>"
    elif before:
        # Word is the last in a highlighted block:
        res = _fade_word(word, u"out") + u"</span>"
        res = _fade_word(word, "out") + "</span>"
        if first:
            res = u'<span class="cv-hl">' + res
            res = '<span class="cv-hl">' + res
    else:
        res = unicode(escape(word))
        res = str(escape(word))
    return res


 def _fade_word(word, dir):
    if len(word) <= 4:
        word = unicode(escape(word))
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
        base = u'{0}<span class="cv-hl-out">{1}</span>'
        word = str(escape(word))
        return f'<span class="cv-hl-{dir}">{word}</span>'
    if dir == "out":
        before, after = str(escape(word[:-4])), str(escape(word[-4:]))
        base = '{0}<span class="cv-hl-out">{1}</span>'
        return base.format(before, after)
    else:
        before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
        base = u'<span class="cv-hl-in">{0}</span>{1}'
        before, after = str(escape(word[:4])), str(escape(word[4:]))
        base = '<span class="cv-hl-in">{0}</span>{1}'
        return base.format(before, after)


 def _strip_word(chain, word):
    if word == chain.START or word == chain.END:
        return word
--- a/src/copyvios/misc.py
+++ b/src/copyvios/misc.py
@@ -1,19 +1,18 @@
 # -*- coding: utf-8  -*-

 from contextlib import contextmanager
 import datetime
 from contextlib import contextmanager
 from os.path import expanduser, join

 import apsw
 from flask import g, request
 import oursql
 from flask import g, request
 from sqlalchemy.pool import manage

 oursql = manage(oursql)

 __all__ = ["Query", "cache", "get_db", "get_notice", "httpsfix", "urlstrip"]

 class Query(object):

 class Query:
    def __init__(self, method="GET"):
        self.query = {}
        data = request.form if method == "POST" else request.args
@@ -25,14 +24,14 @@ class Query(object):

    def __setattr__(self, key, value):
        if key == "query":
            super(Query, self).__setattr__(key, value)
            super().__setattr__(key, value)
        else:
            self.query[key] = value


 class _AppCache(object):
 class _AppCache:
    def __init__(self):
        super(_AppCache, self).__setattr__("_data", {})
        super().__setattr__("_data", {})

    def __getattr__(self, key):
        return self._data[key]
@@ -43,6 +42,7 @@ class _AppCache(object):

 cache = _AppCache()


 def _connect_to_db(engine, args):
    if engine == "mysql":
        args["read_default_file"] = expanduser("~/.my.cnf")
@@ -54,15 +54,17 @@ def _connect_to_db(engine, args):
        conn = apsw.Connection(dbpath)
        conn.cursor().execute("PRAGMA foreign_keys = ON")
        return conn
    raise ValueError("Unknown engine: %s" % engine)
    raise ValueError(f"Unknown engine: {engine}")


 def get_db():
    if not g._db:
        args = cache.bot.config.wiki["_copyviosSQL"].copy()
        args = cache.bot.config.wiki["copyvios"].copy()
        g._engine = engine = args.pop("engine", "mysql").lower()
        g._db = _connect_to_db(engine, args)
    return g._db


@contextmanager
 def get_cursor(conn):
    if g._engine == "mysql":
@@ -72,21 +74,24 @@ def get_cursor(conn):
        with conn:
            yield conn.cursor()
    else:
        raise ValueError("Unknown engine: %s" % g._engine)
        raise ValueError(f"Unknown engine: {g._engine}")


 def get_sql_error():
    if g._engine == "mysql":
        return oursql.Error
    if g._engine == "sqlite":
        return apsw.Error
    raise ValueError("Unknown engine: %s" % g._engine)
    raise ValueError(f"Unknown engine: {g._engine}")


 def sql_dialect(mysql, sqlite):
    if g._engine == "mysql":
        return mysql
    if g._engine == "sqlite":
        return sqlite
    raise ValueError("Unknown engine: %s" % g._engine)
    raise ValueError(f"Unknown engine: {g._engine}")


 def get_notice():
    try:
@@ -95,16 +100,19 @@ def get_notice():
            if lines[0] == "<!-- active -->":
                return "\n".join(lines[1:])
            return None
    except IOError:
    except OSError:
        return None


 def httpsfix(context, url):
    if url.startswith("http://"):
        url = url[len("http:"):]
        url = url[len("http:") :]
    return url


 def parse_wiki_timestamp(timestamp):
    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
    return datetime.datetime.strptime(timestamp, "%Y%m%d%H%M%S")


 def urlstrip(context, url):
    if url.startswith("http://"):
--- a/src/copyvios/settings.py
+++ b/src/copyvios/settings.py
@@ -1,13 +1,12 @@
 # -*- coding: utf-8  -*-

 from flask import g
 from markupsafe import escape

 from .cookies import set_cookie, delete_cookie
 from .cookies import delete_cookie, set_cookie
 from .misc import Query

 __all__ = ["process_settings"]


 def process_settings():
    query = Query(method="POST")
    if query.action == "set":
@@ -18,6 +17,7 @@ def process_settings():
        status = None
    return status


 def _do_set(query):
    cookies = g.cookies
    changes = set()
@@ -39,18 +39,19 @@ def _do_set(query):
            changes.add("background")
    if changes:
        changes = ", ".join(sorted(list(changes)))
        return "Updated {0}.".format(changes)
        return f"Updated {changes}."
    return None


 def _do_delete(query):
    cookies = g.cookies
    if query.cookie in cookies:
        delete_cookie(query.cookie.encode("utf8"))
        template = u'Deleted cookie <b><span class="mono">{0}</span></b>.'
        template = 'Deleted cookie <b><span class="mono">{0}</span></b>.'
        return template.format(escape(query.cookie))
    elif query.all:
        number = len(cookies)
        for cookie in cookies.values():
        for cookie in list(cookies.values()):
            delete_cookie(cookie.key)
        return "Deleted <b>{0}</b> cookies.".format(number)
        return f"Deleted <b>{number}</b> cookies."
    return None
--- a/src/copyvios/sites.py
+++ b/src/copyvios/sites.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8  -*-

 from time import time
 from urlparse import urlparse
 from urllib.parse import urlparse

 from earwigbot import exceptions

@@ -9,6 +7,7 @@ from .misc import cache

 __all__ = ["get_site", "update_sites"]


 def get_site(query):
    lang, project, name = query.lang, query.project, query.name
    wiki = cache.bot.wiki
@@ -24,11 +23,13 @@ def get_site(query):
    except exceptions.SiteNotFoundError:
        return _add_site(lang, project)


 def update_sites():
    if time() - cache.last_sites_update > 60 * 60 * 24 * 7:
        cache.langs, cache.projects = _load_sites()
        cache.last_sites_update = time()


 def _add_site(lang, project):
    update_sites()
    if not any(project == item[0] for item in cache.projects):
@@ -40,12 +41,13 @@ def _add_site(lang, project):
    except (exceptions.APIError, exceptions.LoginError):
        return None


 def _load_sites():
    site = cache.bot.wiki.get_site()
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    langs, projects = set(), set()
    for site in matrix.itervalues():
    for site in matrix.values():
        if isinstance(site, list):  # Special sites
            bad_sites = ["closed", "private", "fishbowl"]
            for special in site:
@@ -55,19 +57,19 @@ def _load_sites():
                        lang, project = "www", full.split(".")[0]
                    else:
                        lang, project = full.rsplit(".", 2)[:2]
                    code = u"{0}::{1}".format(lang, special["dbname"])
                    code = "{}::{}".format(lang, special["dbname"])
                    name = special["code"].capitalize()
                    langs.add((code, u"{0} ({1})".format(lang, name)))
                    langs.add((code, f"{lang} ({name})"))
                    projects.add((project, project.capitalize()))
        else:
            this = set()
            for web in site["site"]:
                if "closed" in web:
                    continue
                proj = "wikipedia" if web["code"] == u"wiki" else web["code"]
                proj = "wikipedia" if web["code"] == "wiki" else web["code"]
                this.add((proj, proj.capitalize()))
            if this:
                code = site["code"]
                langs.add((code, u"{0} ({1})".format(code, site["name"])))
                langs.add((code, "{} ({})".format(code, site["name"])))
                projects |= this
    return list(sorted(langs)), list(sorted(projects))
--- a/src/copyvios/turnitin.py
+++ b/src/copyvios/turnitin.py
@@ -1,17 +1,17 @@
 # -*- coding: utf-8 -*-
 from ast import literal_eval
 import re
 from ast import literal_eval

 import requests

 from .misc import parse_wiki_timestamp

 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 __all__ = ["search_turnitin", "TURNITIN_API_ENDPOINT"]

 TURNITIN_API_ENDPOINT = "https://eranbot.toolforge.org/plagiabot/api.py"

 TURNITIN_API_ENDPOINT = 'https://eranbot.toolforge.org/plagiabot/api.py'

 def search_turnitin(page_title, lang):
    """ Search the Plagiabot database for Turnitin reports for a page.
    """Search the Plagiabot database for Turnitin reports for a page.

    Keyword arguments:
    page_title -- string containing the page title
@@ -21,14 +21,16 @@ def search_turnitin(page_title, lang):
    """
    return TurnitinResult(_make_api_request(page_title, lang))


 def _make_api_request(page_title, lang):
    """ Query the plagiabot API for Turnitin reports for a given page.
    """
    stripped_page_title = page_title.replace(' ', '_')
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': stripped_page_title,
                      'lang': lang,
                      'report': 1}
    """Query the plagiabot API for Turnitin reports for a given page."""
    stripped_page_title = page_title.replace(" ", "_")
    api_parameters = {
        "action": "suspected_diffs",
        "page_title": stripped_page_title,
        "lang": lang,
        "report": 1,
    }

    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters, verify=False)
    # use literal_eval to *safely* parse the resulting dict-containing string
@@ -38,14 +40,16 @@ def _make_api_request(page_title, lang):
        parsed_api_result = []
    return parsed_api_result

 class TurnitinResult(object):
    """ Container class for TurnitinReports. Each page may have zero or

 class TurnitinResult:
    """Container class for TurnitinReports. Each page may have zero or
    more reports of plagiarism. The list will have multiple
    TurnitinReports if plagiarism has been detected for more than one
    revision.

    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
    """

    def __init__(self, turnitin_data):
        """
        Keyword argument:
@@ -54,14 +58,16 @@ class TurnitinResult(object):
        self.reports = []
        for item in turnitin_data:
            report = TurnitinReport(
                item['diff_timestamp'], item['diff'], item['report'])
                item["diff_timestamp"], item["diff"], item["report"]
            )
            self.reports.append(report)

    def __repr__(self):
        return str(self.__dict__)

 class TurnitinReport(object):
    """ Contains data for each Turnitin report (one on each potentially

 class TurnitinReport:
    """Contains data for each Turnitin report (one on each potentially
    plagiarized revision).

    TurnitinReport.reportid  -- Turnitin report ID, taken from plagiabot
@@ -72,6 +78,7 @@ class TurnitinReport(object):
        words   -- number of words found in both source and revision
        url     -- url for the possibly-plagiarized source
    """

    def __init__(self, timestamp, diffid, report):
        """
        Keyword argument:
@@ -86,9 +93,7 @@ class TurnitinReport(object):

        self.sources = []
        for item in self.report_data[1]:
            source = {'percent': item[0],
                      'words': item[1],
                      'url': item[2]}
            source = {"percent": item[0], "words": item[1], "url": item[2]}
            self.sources.append(source)

    def __repr__(self):
@@ -96,12 +101,11 @@ class TurnitinReport(object):

    def _parse_report(self, report_text):
        # extract report ID
        report_id_pattern = re.compile(r'\?rid=(\d*)')
        report_id_pattern = re.compile(r"\?rid=(\d*)")
        report_id = report_id_pattern.search(report_text).groups()[0]

        # extract percent match, words, and URL for each source in the report
        extract_info_pattern = re.compile(
            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
        extract_info_pattern = re.compile(r"\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ")
        results = extract_info_pattern.findall(report_text)

        return (report_id, results)
--- a/static/api.min.css
+++ b/static/api.min.css
@@ -1 +1 @@
 h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{padding:0 .25em;background-color:#eee}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{font-family:sans-serif;font-size:1.17em;color:#fff}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline}
 h1,h2{font-family:sans-serif}pre{white-space:pre-wrap}#help{margin:auto;max-width:1200px}.json{font-family:monospace}.indent{display:inline-block;padding-left:2em}.code{font-family:monospace}.resp-cond,.resp-desc,.resp-dtype{background-color:#eee;padding:0 .25em}.resp-dtype{color:#009}.resp-cond:before,.resp-dtype:before{content:"("}.resp-cond:after,.resp-dtype:after{content:")"}.resp-desc{color:#050}.resp-cond{color:#900;font-style:italic}.param-key{color:#009;font-weight:700}.param-val{color:#900;font-weight:700}.parameters{margin:1em 0}.parameters tr:first-child{color:#fff;font-family:sans-serif;font-size:1.17em}.parameters tr:first-child th{background-color:#369}.parameters td,.parameters th{padding:.2em .5em}.parameters th{background-color:#f0f0f0}.parameters td:first-child{font-family:monospace}.parameters tr:nth-child(2n+3){background-color:#e0e0e0}.parameters tr:nth-child(2n+4){background-color:#f0f0f0}a:link,a:visited{color:#373;text-decoration:none}a:hover{color:#040}a:active,a:hover{text-decoration:underline}a:active{color:#404}.no-color:link,.no-color:visited{color:#000;text-decoration:none}.no-color:active,.no-color:hover{color:#000;text-decoration:underline}
--- a/static/script.min.js
+++ b/static/script.min.js
@@ -1 +1 @@
 function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{data=JSON.parse(cookie);var width=data.width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,expires){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];expires=expires?"; expires="+expires.toUTCString():"",document.cookie=name+"="+value+expires+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return 
toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()});
 function update_screen_size(){var cache=cache_cookie(),data={width:window.screen.availWidth,height:window.screen.availHeight};cache&&cache.width==data.width&&cache.height==data.height||set_cookie("CopyviosScreenCache",JSON.stringify(data),1095)}function cache_cookie(){var cookie=get_cookie("CopyviosScreenCache");if(cookie)try{var width=(data=JSON.parse(cookie)).width,height=data.height;if(width&&height)return{width:width,height:height}}catch(SyntaxError){}return!1}function get_cookie(name){for(var nameEQ=name+"=",ca=document.cookie.split(";"),i=0;i<ca.length;i++){for(var c=ca[i];" "==c.charAt(0);)c=c.substring(1,c.length);if(0==c.indexOf(nameEQ)){var value=window.atob(c.substring(nameEQ.length,c.length));if(0==value.indexOf("--cpv2"))return value.substring("--cpv2".length,value.length)}}return null}function set_cookie_with_date(name,value,date){value=window.btoa("--cpv2"+value);var path=window.location.pathname.split("/",2)[1];date=date?"; expires="+date.toUTCString():"",document.cookie=name+"="+value+date+"; path=/"+path}function set_cookie(name,value,days){var date;days?((date=new Date).setTime(date.getTime()+24*days*60*60*1e3),set_cookie_with_date(name,value,date)):set_cookie_with_date(name,value)}function delete_cookie(name){set_cookie(name,"",-1)}function toggle_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");details.is(":hidden")?(details.show(),trigger.text("[hide]")):(details.hide(),trigger.text("[show]"))}function install_notice(){var details=$("#notice-collapse-box"),trigger=$("#notice-collapse-trigger");0<=details.length&&0<=trigger.length&&(trigger.replaceWith($("<a/>",{id:"notice-collapse-trigger",href:"#",text:"[show]",click:function(){return 
toggle_notice(),!1}})),details.hide())}$(document).ready(function(){$("#action-search").change(function(){$(".cv-search").prop("disabled",!1),$(".cv-compare").prop("disabled",!0),$(".cv-search-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),$("#action-compare").change(function(){$(".cv-search").prop("disabled",!0),$(".cv-compare").prop("disabled",!1),$(".cv-search-oo-ui").addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled"),$(".cv-compare-oo-ui").addClass("oo-ui-widget-enabled").removeClass("oo-ui-widget-disabled")}),$("#action-search").is(":checked")&&$("#action-search").change(),$("#action-compare").is(":checked")&&$("#action-compare").change(),$("#cv-form").submit(function(){$("#action-search").is(":checked")&&$.each([["engine","use_engine"],["links","use_links"],["turnitin","turnitin"]],function(i,val){$("#cv-cb-"+val[0]).is(":checked")&&$("#cv-form input[type='hidden'][name='"+val[1]+"']").prop("disabled",!0)}),$("#cv-form button[type='submit']").prop("disabled",!0).css("cursor","progress").parent().addClass("oo-ui-widget-disabled").removeClass("oo-ui-widget-enabled")}),0<=$("#cv-additional").length&&($("#cv-additional").css("display","block"),$(".source-default-hidden").css("display","none"),$("#show-additional-sources").click(function(){return $(".source-default-hidden").css("display",""),$("#cv-additional").css("display","none"),!1})),install_notice()});
--- a/static/style.min.css
+++ b/static/style.min.css