Python 3.11+ cleanup and bugfixes

9 months ago · 10bc4b3fd4
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,12 +1,14 @@
 v0.4 (unreleased):

 - Migrated to Python 3.
 - Migrated to Python 3 (3.11+).
 - Migrated from oursql to pymysql.
 - Copyvios: Configurable proxy support for specific domains.
 - Copyvios: Parser-directed URL redirection.
 - Copyvios: General parsing improvements.
 - Copyvios: URL exclusion improvements.
 - Copyvios: Removed long-deprecated Yahoo! BOSS search engine.
 - Wiki: Fixed not sending Content-Type header in POST requests.
 - IRC: Moved default server from Freenode to Libera.
 - IRC: Remember joined channels across restarts.
 - IRC: Added !listchans.
 - IRC > !stalk: Added modifiers to change message format or filter messages.
--- a/earwigbot/irc/watcher.py
+++ b/earwigbot/irc/watcher.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,12 +20,14 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 import imp
 import importlib.machinery
 import importlib.util

 from earwigbot.irc import IRCConnection, RC

 __all__ = ["Watcher"]


 class Watcher(IRCConnection):
    """
    **EarwigBot: IRC Watcher Component**
@@ -40,16 +42,23 @@ class Watcher(IRCConnection):
    def __init__(self, bot):
        self.bot = bot
        cf = bot.config.irc["watcher"]
        super().__init__(cf["host"], cf["port"], cf["nick"], cf["ident"],
                         cf["realname"], bot.logger.getChild("watcher"))
        super().__init__(
            cf["host"],
            cf["port"],
            cf["nick"],
            cf["ident"],
            cf["realname"],
            bot.logger.getChild("watcher"),
        )
        self._prepare_process_hook()
        self._connect()

    def __repr__(self):
        """Return the canonical string representation of the Watcher."""
        res = "Watcher(host={0!r}, port={1!r}, nick={2!r}, ident={3!r}, realname={4!r}, bot={5!r})"
        return res.format(self.host, self.port, self.nick, self.ident,
                          self.realname, self.bot)
        return res.format(
            self.host, self.port, self.nick, self.ident, self.realname, self.bot
        )

    def __str__(self):
        """Return a nice string representation of the Watcher."""
@@ -88,17 +97,11 @@ class Watcher(IRCConnection):
        self._process_hook = lambda bot, rc: ()

        path = self.bot.config.root_dir
        try:
            f, path, desc = imp.find_module("rules", [path])
        except ImportError:
        spec = importlib.machinery.PathFinder.find_spec("rules", [path])
        if spec is None or spec.loader is None:
            return
        try:
            module = imp.load_module("rules", f, path, desc)
        except Exception:
            return
        finally:
            f.close()

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        self._process_hook_module = module
        try:
            self._process_hook = module.process
--- a/earwigbot/lazy.py
+++ b/earwigbot/lazy.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,6 @@ manner, so that they can be referred to by name but are not actually loaded
 until they are used (i.e. their attributes are read or modified).
 """

 from imp import acquire_lock, release_lock
 import importlib
 import sys
 from threading import RLock
@@ -36,12 +35,16 @@ from types import ModuleType
 __all__ = ["LazyImporter"]

 _real_get = ModuleType.__getattribute__
 _lazy_init_lock = RLock()


 def _create_failing_get(exc):
    """Return a function that raises *exc* every time it is called.

    The returned function accepts ``(self, attr)`` and ignores both; each
    call raises the same exception object that was passed in here.

    NOTE(review): the ``(self, attr)`` signature matches
    ``__getattribute__``, so this is presumably installed on a lazy module
    to make every later attribute access fail -- the caller is outside this
    hunk, so confirm before relying on that.
    """

    def _fail(self, attr):
        # Arguments are unused; the captured exception is raised as-is.
        raise exc

    return _fail


 def _mock_get(self, attr):
    with _real_get(self, "_lock"):
        if _real_get(self, "_unloaded"):
@@ -59,14 +62,13 @@ def _mock_get(self, attr):

 class _LazyModule(type):
    def __new__(cls, name):
        acquire_lock()
        try:
        with _lazy_init_lock:
            if name not in sys.modules:
                attributes = {
                    "__name__": name,
                    "__getattribute__": _mock_get,
                    "_unloaded": True,
                    "_lock": RLock()
                    "_lock": RLock(),
                }
                parents = (ModuleType,)
                klass = type.__new__(cls, "module", parents, attributes)
@@ -74,8 +76,6 @@ class _LazyModule(type):
                if "." in name:  # Also ensure the parent exists
                    _LazyModule(name.rsplit(".", 1)[0])
            return sys.modules[name]
        finally:
            release_lock()


 class LazyImporter:
@@ -84,6 +84,7 @@ class LazyImporter:
    This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of
    :py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`).
    """

    def __init__(self):
        self._modules = {}
        sys.meta_path.append(self)
--- a/earwigbot/managers.py
+++ b/earwigbot/managers.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 import imp
 import importlib.machinery
 import importlib.util
 from os import listdir, path
 from re import sub
 from threading import RLock, Thread
@@ -32,6 +33,7 @@ from earwigbot.tasks import Task

 __all__ = ["CommandManager", "TaskManager"]


 class _ResourceManager:
    """
    **EarwigBot: Resource Manager**
@@ -48,6 +50,7 @@ class _ResourceManager:
    :py:meth:`load`, retrieving specific resources via :py:meth:`get`, and
    iterating over all resources via :py:meth:`__iter__`.
    """

    def __init__(self, bot, name, base):
        self.bot = bot
        self.logger = bot.logger.getChild(name)
@@ -60,8 +63,9 @@ class _ResourceManager:
    def __repr__(self):
        """Return the canonical string representation of the manager."""
        res = "{0}(bot={1!r}, name={2!r}, base={3!r})"
        return res.format(self.__class__.__name__, self.bot,
                          self._resource_name, self._resource_base)
        return res.format(
            self.__class__.__name__, self.bot, self._resource_name, self._resource_base
        )

    def __str__(self):
        """Return a nice string representation of the manager."""
@@ -100,22 +104,22 @@ class _ResourceManager:
    def _load_module(self, name, path):
        """Load a specific resource from a module, identified by name and path.

        We'll first try to import it using imp magic, and if that works, make
        instances of any classes inside that are subclasses of the base
        We'll first try to import it using importlib magic, and if that works,
        make instances of any classes inside that are subclasses of the base
        (:py:attr:`self._resource_base <_resource_base>`), add them to the
        resources dictionary with :py:meth:`self._load_resource()
        <_load_resource>`, and finally log the addition. Any problems along
        the way will either be ignored or logged.
        """
        f, path, desc = imp.find_module(name, [path])
        spec = importlib.machinery.PathFinder.find_spec(name, [path])
        try:
            module = imp.load_module(name, f, path, desc)
            assert spec is not None, "Spec must not be None"
            assert spec.loader is not None, "Loader must not be None"
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except Exception:
            e = "Couldn't load module '{0}' (from {1})"
            self.logger.exception(e.format(name, path))
            self.logger.exception(f"Couldn't load module {name!r} (from {path})")
            return
        finally:
            f.close()

        for obj in vars(module).values():
            if type(obj) is type:
@@ -132,7 +136,7 @@ class _ResourceManager:
                continue
            if name.startswith("_") or name.startswith("."):
                continue
            modname = sub("\.pyc?$", "", name)  # Remove extension
            modname = sub(r"\.pyc?$", "", name)  # Remove extension
            if modname in processed:
                continue
            processed.append(modname)
@@ -200,6 +204,7 @@ class CommandManager(_ResourceManager):
    """
    Manages (i.e., loads, reloads, and calls) IRC commands.
    """

    def __init__(self, bot):
        super().__init__(bot, "commands", Command)

@@ -234,8 +239,7 @@ class CommandManager(_ResourceManager):

        for command in self:
            if hook in command.hooks and self._wrap_check(command, data):
                thread = Thread(target=self._wrap_process,
                                args=(command, data))
                thread = Thread(target=self._wrap_process, args=(command, data))
                start_time = strftime("%b %d %H:%M:%S")
                thread.name = "irc:{0} ({1})".format(command.name, start_time)
                thread.daemon = True
@@ -247,6 +251,7 @@ class TaskManager(_ResourceManager):
    """
    Manages (i.e., loads, reloads, schedules, and runs) wiki bot tasks.
    """

    def __init__(self, bot):
        super().__init__(bot, "tasks", Task)

@@ -292,11 +297,12 @@ class TaskManager(_ResourceManager):
        if not now:
            now = gmtime()
        # Get list of tasks to run this turn:
        tasks = self.bot.config.schedule(now.tm_min, now.tm_hour, now.tm_mday,
                                         now.tm_mon, now.tm_wday)
        tasks = self.bot.config.schedule(
            now.tm_min, now.tm_hour, now.tm_mday, now.tm_mon, now.tm_wday
        )

        for task in tasks:
            if isinstance(task, list):          # They've specified kwargs,
            if isinstance(task, list):  # They've specified kwargs,
                self.start(task[0], **task[1])  # so pass those to start
            else:  # Otherwise, just pass task_name
                self.start(task)
--- a/earwigbot/tasks/__init__.py
+++ b/earwigbot/tasks/__init__.py
@@ -146,7 +146,7 @@ class Task:
        try:
            content = page.get()
        except exceptions.PageNotFoundError:
            return False
            return True
        if content == cfg.get("disabled", "run"):
            return False

--- a/earwigbot/tasks/wikiproject_tagger.py
+++ b/earwigbot/tasks/wikiproject_tagger.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ from earwigbot import exceptions
 from earwigbot.tasks import Task
 from earwigbot.wiki import constants


 class WikiProjectTagger(Task):
    """A task to tag talk pages with WikiProject banners.

@@ -76,28 +77,24 @@ class WikiProjectTagger(Task):
        edited

    """

    name = "wikiproject_tagger"

    # Regexes for template names that should always go above the banner, based
    # on [[Wikipedia:Talk page layout]]:
    TOP_TEMPS = [
        r"skip ?to ?(toc|talk|toctalk)$",

        r"ga ?nominee$",

        r"(user ?)?talk ?(header|page|page ?header)$",

        r"community ?article ?probation$",
        r"censor(-nudity)?$",
        r"blp(o| ?others?)?$",
        r"controvers(ial2?|y)$",

        r"(not ?(a ?)?)?forum$",
        r"tv(episode|series)talk$",
        r"recurring ?themes$",
        r"faq$",
        r"(round ?in ?)?circ(les|ular)$",

        r"ar(ti|it)cle ?(history|milestones)$",
        r"failed ?ga$",
        r"old ?prod( ?full)?$",
@@ -144,10 +141,18 @@ class WikiProjectTagger(Task):
        else:
            only_with = None

        job = _Job(banner=banner, names=names, summary=summary, update=update,
                   append=append, autoassess=autoassess, only_with=only_with,
                   nocreate=nocreate, tag_categories=tag_categories,
                   dry_run=dry_run)
        job = _Job(
            banner=banner,
            names=names,
            summary=summary,
            update=update,
            append=append,
            autoassess=autoassess,
            only_with=only_with,
            nocreate=nocreate,
            tag_categories=tag_categories,
            dry_run=dry_run,
        )

        try:
            self.run_job(kwargs, site, job, recursive)
@@ -165,7 +170,6 @@ class WikiProjectTagger(Task):
            with open(kwargs["file"], "r") as fileobj:
                for line in fileobj:
                    if line.strip():
                        line = line.decode("utf8")
                        if line.startswith("[[") and line.endswith("]]"):
                            line = line[2:-2]
                        page = site.get_page(line)
@@ -201,8 +205,13 @@ class WikiProjectTagger(Task):
            return banner, None

        names = {banner, title}
        result = site.api_query(action="query", list="backlinks", bllimit=500,
                                blfilterredir="redirects", bltitle=title)
        result = site.api_query(
            action="query",
            list="backlinks",
            bllimit=500,
            blfilterredir="redirects",
            bltitle=title,
        )
        for backlink in result["query"]["backlinks"]:
            names.add(backlink["title"])
            if backlink["ns"] == constants.NS_TEMPLATE:
@@ -215,8 +224,9 @@ class WikiProjectTagger(Task):
    def process_category(self, page, job, recursive):
        """Try to tag all pages in the given category."""
        if page.title in job.processed_cats:
            self.logger.debug("Skipping category, already processed: [[%s]]",
                              page.title)
            self.logger.debug(
                "Skipping category, already processed: [[%s]]", page.title
            )
            return
        self.logger.info("Processing category: [[%s]]", page.title)
        job.processed_cats.add(page.title)
@@ -243,8 +253,7 @@ class WikiProjectTagger(Task):
            page = page.toggle_talk()

        if page.title in job.processed_pages:
            self.logger.debug("Skipping page, already processed: [[%s]]",
                              page.title)
            self.logger.debug("Skipping page, already processed: [[%s]]", page.title)
            return
        job.processed_pages.add(page.title)

@@ -275,21 +284,22 @@ class WikiProjectTagger(Task):
                    return

        if job.only_with:
            if not any(template.name.matches(job.only_with)
                       for template in code.ifilter_templates(recursive=True)):
            if not any(
                template.name.matches(job.only_with)
                for template in code.ifilter_templates(recursive=True)
            ):
                log = "Skipping page: [[%s]]; fails only-with condition"
                self.logger.info(log, page.title)
                return

        if is_update:
            old_banner = str(banner)
            self.update_banner(banner, job, code)
            if banner == old_banner:
            updated = self.update_banner(banner, job, code)
            if not updated:
                log = "Skipping page: [[%s]]; already tagged and no updates"
                self.logger.info(log, page.title)
                return
            self.logger.info("Updating banner on page: [[%s]]", page.title)
            banner = banner.encode("utf8")
            banner = str(banner)
        else:
            self.logger.info("Tagging page: [[%s]]", page.title)
            banner = self.make_banner(job, code)
@@ -334,9 +344,11 @@ class WikiProjectTagger(Task):

    def update_banner(self, banner, job, code):
        """Update an existing *banner* based on a *job* and a page's *code*."""
        has = lambda key: (banner.has(key) and
                           banner.get(key).value.strip() not in ("", "?"))
        has = lambda key: (
            banner.has(key) and banner.get(key).value.strip() not in ("", "?")
        )

        updated = False
        if job.autoassess is not False:
            if not has("class"):
                assess, reason = self.get_autoassessment(code, job.autoassess)
@@ -349,6 +361,8 @@ class WikiProjectTagger(Task):
                key, value = param.split("=", 1)
                if not has(key):
                    banner.add(key, value)
                    updated = True
        return updated

    def get_autoassessment(self, code, only_classes=None):
        """Get an autoassessment for a page.
@@ -356,16 +370,27 @@ class WikiProjectTagger(Task):
        Return (assessed class as a string or None, assessment reason or None).
        """
        if only_classes is None or only_classes is True:
            classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga",
                          "list", "redirect", "start", "stub"]
            classnames = [
                "a",
                "b",
                "book",
                "c",
                "dab",
                "fa",
                "fl",
                "ga",
                "list",
                "redirect",
                "start",
                "stub",
            ]
        else:
            classnames = [klass.strip().lower()
                          for klass in only_classes.split(",")]
            classnames = [klass.strip().lower() for klass in only_classes.split(",")]

        classes = {klass: 0 for klass in classnames}
        for template in code.ifilter_templates(recursive=True):
            if template.has("class"):
                value = str(template.get("class").value).lower()
                value = str(template.get("class").value).strip().lower()
                if value in classes:
                    classes[value] += 1

@@ -429,6 +454,7 @@ class WikiProjectTagger(Task):
            self.logger.debug("Inserting banner at beginning")
            code.insert(0, banner + "\n")


 class _Job:
    """Represents a single wikiproject-tagging task.

@@ -436,6 +462,7 @@ class _Job:
    or not to autoassess and create new pages from scratch, and a counter of
    the number of pages edited.
    """

    def __init__(self, **kwargs):
        self.banner = kwargs["banner"]
        self.names = kwargs["names"]
@@ -456,4 +483,5 @@ class _Job:
 class _ShutoffEnabled(Exception):
    """Signal that shutoff is enabled and the task should stop.

    Raised by process_page() if shutoff is enabled. Caught by run(), which
    will then stop the task.
    """

    # No extra state or behavior: the exception type itself is the signal.
    pass
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -280,7 +280,7 @@ class Page(CopyvioMixIn):
            self._assert_existence()

    def _edit(self, params=None, text=None, summary=None, minor=None, bot=None,
              force=None, section=None, captcha_id=None, captcha_word=None):
              force=None, section=None, captcha_id=None, captcha_word=None, **kwargs):
        """Edit the page!

        If *params* is given, we'll use it as our API query parameters.
@@ -297,7 +297,7 @@ class Page(CopyvioMixIn):
        # Build our API query string:
        if not params:
            params = self._build_edit_params(text, summary, minor, bot, force,
                                             section, captcha_id, captcha_word)
                                             section, captcha_id, captcha_word, kwargs)
        else: # Make sure we have the right token:
            params["token"] = self.site.get_token()

@@ -320,7 +320,7 @@ class Page(CopyvioMixIn):
        raise exceptions.EditError(result["edit"])

    def _build_edit_params(self, text, summary, minor, bot, force, section,
                           captcha_id, captcha_word):
                           captcha_id, captcha_word, kwargs):
        """Given some keyword arguments, build an API edit query string."""
        unitxt = text.encode("utf8") if isinstance(text, str) else text
        hashed = md5(unitxt).hexdigest()  # Checksum to ensure text is correct
@@ -351,6 +351,11 @@ class Page(CopyvioMixIn):
        else:
            params["recreate"] = "true"

        for key, val in kwargs.items():
            if val is None:
                params.pop(key, None)
            else:
                params[key] = val
        return params

    def _handle_edit_errors(self, error, params, retry=True):
@@ -657,7 +662,7 @@ class Page(CopyvioMixIn):
        """
        return mwparserfromhell.parse(self.get())

    def edit(self, text, summary, minor=False, bot=True, force=False):
    def edit(self, text, summary, minor=False, bot=True, force=False, **kwargs):
        """Replace the page's content or creates a new page.

        *text* is the new page content, with *summary* as the edit summary.
@@ -670,9 +675,9 @@ class Page(CopyvioMixIn):
        editing our page. Be careful with this!
        """
        self._edit(text=text, summary=summary, minor=minor, bot=bot,
                   force=force)
                   force=force, **kwargs)

    def add_section(self, text, title, minor=False, bot=True, force=False):
    def add_section(self, text, title, minor=False, bot=True, force=False, **kwargs):
        """Add a new section to the bottom of the page.

        The arguments for this are the same as those for :py:meth:`edit`, but
@@ -683,7 +688,7 @@ class Page(CopyvioMixIn):
        new section as content.
        """
        self._edit(text=text, summary=title, minor=minor, bot=bot, force=force,
                   section="new")
                   section="new", **kwargs)

    def check_exclusion(self, username=None, optouts=None):
        """Check whether or not we are allowed to edit the page.
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -37,10 +37,11 @@ from earwigbot.wiki.category import Category
 from earwigbot.wiki.page import Page
 from earwigbot.wiki.user import User

 oursql = importer.new("oursql")
 pymysql = importer.new("pymysql")

 __all__ = ["Site"]


 class Site:
    """
    **EarwigBot: Wiki Toolset: Site**
@@ -80,18 +81,41 @@ class Site:
    - :py:meth:`get_user`:             returns a User object for the given name
    - :py:meth:`delegate`:             controls when the API or SQL is used
    """

    SERVICE_API = 1
    SERVICE_SQL = 2
    SPECIAL_TOKENS = ["createaccount", "deleteglobalaccount", "login",
                      "patrol", "rollback", "setglobalaccountstatus",
                      "userrights", "watch"]

    def __init__(self, name=None, project=None, lang=None, base_url=None,
                 article_path=None, script_path=None, sql=None,
                 namespaces=None, login=(None, None), oauth=None,
                 cookiejar=None, user_agent=None, use_https=True,
                 assert_edit=None, maxlag=None, wait_between_queries=1,
                 logger=None, search_config=None):
    SPECIAL_TOKENS = [
        "createaccount",
        "deleteglobalaccount",
        "login",
        "patrol",
        "rollback",
        "setglobalaccountstatus",
        "userrights",
        "watch",
    ]

    def __init__(
        self,
        name=None,
        project=None,
        lang=None,
        base_url=None,
        article_path=None,
        script_path=None,
        sql=None,
        namespaces=None,
        login=(None, None),
        oauth=None,
        cookiejar=None,
        user_agent=None,
        use_https=True,
        assert_edit=None,
        maxlag=None,
        wait_between_queries=1,
        logger=None,
        search_config=None,
    ):
        """Constructor for new Site instances.

        This probably isn't necessary to call yourself unless you're building a
@@ -160,8 +184,11 @@ class Site:
        self._session.headers["User-Agent"] = user_agent
        if oauth:
            self._session.auth = OAuth1(
                oauth["consumer_token"], oauth["consumer_secret"],
                oauth["access_token"], oauth["access_secret"])
                oauth["consumer_token"],
                oauth["consumer_secret"],
                oauth["access_token"],
                oauth["access_secret"],
            )

        # Set up our internal logger:
        if logger:
@@ -182,13 +209,24 @@ class Site:

    def __repr__(self):
        """Return the canonical string representation of the Site."""
        res = ", ".join((
            "Site(name={_name!r}", "project={_project!r}", "lang={_lang!r}",
            "base_url={_base_url!r}", "article_path={_article_path!r}",
            "script_path={_script_path!r}", "use_https={_use_https!r}",
            "assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}",
            "sql={_sql_data!r}", "login={0}", "oauth={1}", "user_agent={3!r}",
            "cookiejar={2})"))
        res = ", ".join(
            (
                "Site(name={_name!r}",
                "project={_project!r}",
                "lang={_lang!r}",
                "base_url={_base_url!r}",
                "article_path={_article_path!r}",
                "script_path={_script_path!r}",
                "use_https={_use_https!r}",
                "assert_edit={_assert_edit!r}",
                "maxlag={_maxlag!r}",
                "sql={_sql_data!r}",
                "login={0}",
                "oauth={1}",
                "user_agent={3!r}",
                "cookiejar={2})",
            )
        )
        name, password = self._login_info
        login = "({0}, {1})".format(repr(name), "hidden" if password else None)
        oauth = "hidden" if self._oauth else None
@@ -211,8 +249,15 @@ class Site:
            return value
        return str(value, encoding)

    def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False,
                   no_assert=False, ae_retry=True):
    def _api_query(
        self,
        params,
        tries=0,
        wait=5,
        ignore_maxlag=False,
        no_assert=False,
        ae_retry=True,
    ):
        """Do an API query with *params* as a dict of parameters.

        See the documentation for :py:meth:`api_query` for full implementation
@@ -348,8 +393,14 @@ class Site:
        """
        # All attributes to be loaded, except _namespaces, which is a special
        # case because it requires additional params in the API query:
        attrs = [self._name, self._project, self._lang, self._base_url,
                 self._article_path, self._script_path]
        attrs = [
            self._name,
            self._project,
            self._lang,
            self._base_url,
            self._article_path,
            self._script_path,
        ]

        params = {"action": "query", "meta": "siteinfo", "siprop": "general"}

@@ -359,7 +410,7 @@ class Site:
                result = self._api_query(params, no_assert=True)
            self._load_namespaces(result)
        elif all(attrs):  # Everything is already specified and we're not told
            return        # to force a reload, so do nothing
            return  # to force a reload, so do nothing
        else:  # We're only loading attributes other than _namespaces
            with self._api_lock:
                result = self._api_query(params, no_assert=True)
@@ -424,11 +475,11 @@ class Site:
        (for that, we'd do self._login_info[0]), but rather to get our current
        username without an unnecessary ?action=query&meta=userinfo API query.
        """
        name = ''.join((self._name, "Token"))
        name = "".join((self._name, "Token"))
        cookie = self._get_cookie(name, self.domain)

        if cookie:
            name = ''.join((self._name, "UserName"))
            name = "".join((self._name, "UserName"))
            user_name = self._get_cookie(name, self.domain)
            if user_name:
                return unquote_plus(user_name.value)
@@ -528,8 +579,12 @@ class Site:
        except KeyError:
            raise exceptions.LoginError("Couldn't get login token")

        params = {"action": "login", "lgname": name, "lgpassword": password,
                  "lgtoken": token}
        params = {
            "action": "login",
            "lgname": name,
            "lgpassword": password,
            "lgtoken": token,
        }
        with self._api_lock:
            result = self._api_query(params, no_assert=True)

@@ -564,18 +619,22 @@ class Site:
    def _sql_connect(self, **kwargs):
        """Attempt to establish a connection with this site's SQL database.

        oursql.connect() will be called with self._sql_data as its kwargs.
        pymysql.connect() will be called with self._sql_data as its kwargs.
        Any kwargs given to this function will be passed to connect() and will
        have precedence over the config file.

        Will raise SQLError() if the module "oursql" is not available. oursql
        may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot
        Will raise SQLError() if the module "pymysql" is not available. pymysql
        may raise its own exceptions (e.g. pymysql.InterfaceError) if it cannot
        establish a connection.
        """
        args = self._sql_data
        for key, value in kwargs.items():
            args[key] = value
        if "read_default_file" not in args and "user" not in args and "passwd" not in args:
        if (
            "read_default_file" not in args
            and "user" not in args
            and "passwd" not in args
        ):
            args["read_default_file"] = expanduser("~/.my.cnf")
        elif "read_default_file" in args:
            args["read_default_file"] = expanduser(args["read_default_file"])
@@ -585,9 +644,9 @@ class Site:
            args["autoreconnect"] = True

        try:
            self._sql_conn = oursql.connect(**args)
            self._sql_conn = pymysql.connect(**args)
        except ImportError:
            e = "SQL querying requires the 'oursql' package: https://pythonhosted.org/oursql/"
            e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/"
            raise exceptions.SQLError(e)

    def _get_service_order(self):
@@ -608,8 +667,11 @@ class Site:
        if now - self._sql_info_cache["lastcheck"] > 120:
            self._sql_info_cache["lastcheck"] = now
            try:
                self._sql_info_cache["replag"] = sqllag = self.get_replag()
            except (exceptions.SQLError, oursql.Error):
                try:
                    self._sql_info_cache["replag"] = sqllag = self.get_replag()
                except pymysql.Error as exc:
                    raise exceptions.SQLError(str(exc))
            except (exceptions.SQLError, ImportError):
                self._sql_info_cache["usable"] = False
                return [self.SERVICE_API]
            self._sql_info_cache["usable"] = True
@@ -705,24 +767,31 @@ class Site:
        with self._api_lock:
            return self._api_query(kwargs)

    def sql_query(self, query, params=(), plain_query=False, dict_cursor=False,
                  cursor_class=None, show_table=False, buffsize=1024):
    def sql_query(
        self,
        query,
        params=(),
        plain_query=False,
        dict_cursor=False,
        cursor_class=None,
        buffsize=1024,
    ):
        """Do an SQL query and yield its results.

        If *plain_query* is ``True``, we will force an unparameterized query.
        Specifying both *params* and *plain_query* will cause an error. If
        *dict_cursor* is ``True``, we will use :py:class:`oursql.DictCursor` as
        our cursor, otherwise the default :py:class:`oursql.Cursor`. If
        *cursor_class* is given, it will override this option. If *show_table*
        is True, the name of the table will be prepended to the name of the
        column. This will mainly affect an :py:class:`~oursql.DictCursor`.
        *dict_cursor* is ``True``, we will use
        :py:class:`pymysql.cursors.DictCursor` as our cursor, otherwise the
        default :py:class:`pymysql.cursors.Cursor`. If *cursor_class* is given,
        it will override this option.

        *buffsize* is the size of each memory-buffered group of results, to
        reduce the number of conversations with the database; it is passed to
        :py:meth:`cursor.fetchmany() <oursql.Cursor.fetchmany>`. If set to
        ``0```, all results will be buffered in memory at once (this uses
        :py:meth:`fetchall() <oursql.Cursor.fetchall>`). If set to ``1``, it is
        equivalent to using :py:meth:`fetchone() <oursql.Cursor.fetchone>`.
        :py:meth:`cursor.fetchmany() <pymysql.cursors.Cursor.fetchmany>`. If
        set to ``0``, all results will be buffered in memory at once (this
        uses :py:meth:`fetchall() <pymysql.cursors.Cursor.fetchall>`). If set
        to ``1``, it is equivalent to using
        :py:meth:`fetchone() <pymysql.cursors.Cursor.fetchone>`.

        Example usage::

@@ -736,25 +805,25 @@ class Site:
            {'user_id': 7418060, 'user_registration': '20080703215134'}

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
        :py:exc:`oursql.InterfaceError`, ...) if there were problems with the
        pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
        :py:exc:`pymysql.InterfaceError`, ...) if there were problems with the
        query.

        See :py:meth:`_sql_connect` for information on how a connection is
        acquired. Also relevant is `oursql's documentation
        <https://pythonhosted.org/oursql/>`_ for details on that package.
        acquired. Also relevant is `pymysql's documentation
        <https://pymysql.readthedocs.io/>`_ for details on that package.
        """
        if not cursor_class:
            if dict_cursor:
                cursor_class = oursql.DictCursor
                cursor_class = pymysql.cursors.DictCursor
            else:
                cursor_class = oursql.Cursor
                cursor_class = pymysql.cursors.Cursor
        klass = cursor_class

        with self._sql_lock:
            if not self._sql_conn:
                self._sql_connect()
            with self._sql_conn.cursor(klass, show_table=show_table) as cur:
            with self._sql_conn.cursor(klass) as cur:
                cur.execute(query, params, plain_query)
                if buffsize:
                    while True:
@@ -798,8 +867,8 @@ class Site:
        time from the timestamp of the latest recent changes event.

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
        :py:exc:`oursql.InterfaceError`, ...) if there were problems.
        pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
        :py:exc:`pymysql.InterfaceError`, ...) if there were problems.
        """
        query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM
                   recentchanges ORDER BY rc_timestamp DESC LIMIT 1"""
@@ -886,8 +955,7 @@ class Site:
        prefix = title.split(":", 1)[0]
        if prefix != title:  # Avoid a page that is simply "Category"
            if prefix in prefixes:
                return Category(self, title, follow_redirects, pageid,
                                self._logger)
                return Category(self, title, follow_redirects, pageid, self._logger)
        return Page(self, title, follow_redirects, pageid, self._logger)

    def get_category(self, catname, follow_redirects=False, pageid=None):
@@ -899,7 +967,7 @@ class Site:
        """
        catname = self._unicodeify(catname)
        prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
        pagename = ':'.join((prefix, catname))
        pagename = ":".join((prefix, catname))
        return Category(self, pagename, follow_redirects, pageid, self._logger)

    def get_user(self, username=None):
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -35,6 +35,7 @@ from earwigbot.wiki.site import Site

 __all__ = ["SitesDB"]


 class SitesDB:
    """
    **EarwigBot: Wiki Toolset: Sites Database Manager**
@@ -106,7 +107,7 @@ class SitesDB:
                # Create the file and restrict reading/writing only to the
                # owner, so others can't peek at our cookies:
                open(self._cookie_file, "w").close()
                chmod(self._cookie_file, stat.S_IRUSR|stat.S_IWUSR)
                chmod(self._cookie_file, stat.S_IRUSR | stat.S_IWUSR)
            else:
                raise

@@ -172,8 +173,16 @@ class SitesDB:
            except KeyError:
                namespaces[ns_id] = [ns_name]

        return (name, project, lang, base_url, article_path, script_path, sql,
                namespaces)
        return (
            name,
            project,
            lang,
            base_url,
            article_path,
            script_path,
            sql,
            namespaces,
        )

    def _make_site_object(self, name):
        """Return a Site object associated with the site *name* in our sitesdb.
@@ -182,8 +191,9 @@ class SitesDB:
        raised if the site is not in our sitesdb.
        """
        cookiejar = self._get_cookiejar()
        (name, project, lang, base_url, article_path, script_path, sql,
         namespaces) = self._load_site_from_sitesdb(name)
        (name, project, lang, base_url, article_path, script_path, sql, namespaces) = (
            self._load_site_from_sitesdb(name)
        )

        config = self.config
        login = (config.wiki.get("username"), config.wiki.get("password"))
@@ -211,13 +221,26 @@ class SitesDB:
                if isinstance(value, str) and "$1" in value:
                    sql[key] = value.replace("$1", name)

        return Site(name=name, project=project, lang=lang, base_url=base_url,
                    article_path=article_path, script_path=script_path,
                    sql=sql, namespaces=namespaces, login=login, oauth=oauth,
                    cookiejar=cookiejar, user_agent=user_agent,
                    use_https=use_https, assert_edit=assert_edit,
                    maxlag=maxlag, wait_between_queries=wait_between_queries,
                    logger=logger, search_config=search_config)
        return Site(
            name=name,
            project=project,
            lang=lang,
            base_url=base_url,
            article_path=article_path,
            script_path=script_path,
            sql=sql,
            namespaces=namespaces,
            login=login,
            oauth=oauth,
            cookiejar=cookiejar,
            user_agent=user_agent,
            use_https=use_https,
            assert_edit=assert_edit,
            maxlag=maxlag,
            wait_between_queries=wait_between_queries,
            logger=logger,
            search_config=search_config,
        )

    def _get_site_name_from_sitesdb(self, project, lang):
        """Return the name of the first site with the given project and lang.
@@ -255,8 +278,14 @@ class SitesDB:
        database. If the sitesdb doesn't exist, we'll create it first.
        """
        name = site.name
        sites_data = (name, site.project, site.lang, site._base_url,
                      site._article_path, site._script_path)
        sites_data = (
            name,
            site.project,
            site.lang,
            site._base_url,
            site._article_path,
            site._script_path,
        )
        sql_data = [(name, key, val) for key, val in site._sql_data.items()]
        ns_data = []
        for ns_id, ns_names in site._namespaces.items():
@@ -353,8 +382,9 @@ class SitesDB:
        e = "Site '{0}:{1}' not found in the sitesdb.".format(project, lang)
        raise SiteNotFoundError(e)

    def add_site(self, project=None, lang=None, base_url=None,
                 script_path="/w", sql=None):
    def add_site(
        self, project=None, lang=None, base_url=None, script_path="/w", sql=None
    ):
        """Add a site to the sitesdb so it can be retrieved with get_site().

        If only a project and a lang are given, we'll guess the *base_url* as
@@ -368,8 +398,8 @@ class SitesDB:
        your wiki is different, provide the script_path as an argument. SQL
        connection settings are guessed automatically using config's template
        value. If this is wrong or not specified, provide a dict of kwargs as
        *sql* and Site will pass it to :py:func:`oursql.connect(**sql)
        <oursql.connect>`, allowing you to make queries with
        *sql* and Site will pass it to :py:func:`pymysql.connect(**sql)
        <pymysql.connect>`, allowing you to make queries with
        :py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.

        Returns ``True`` if the site was added successfully or ``False`` if the
@@ -399,11 +429,19 @@ class SitesDB:
            user_agent = user_agent.replace("$2", python_version())

        # Create a Site object to log in and load the other attributes:
        site = Site(base_url=base_url, script_path=script_path, sql=sql,
                    login=login, oauth=oauth, cookiejar=cookiejar,
                    user_agent=user_agent, use_https=use_https,
                    assert_edit=assert_edit, maxlag=maxlag,
                    wait_between_queries=wait_between_queries)
        site = Site(
            base_url=base_url,
            script_path=script_path,
            sql=sql,
            login=login,
            oauth=oauth,
            cookiejar=cookiejar,
            user_agent=user_agent,
            use_https=use_https,
            assert_edit=assert_edit,
            maxlag=maxlag,
            wait_between_queries=wait_between_queries,
        )

        self._logger.info("Added site '{0}'".format(site.name))
        self._add_site_to_sitesdb(site)
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
 # Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -37,11 +37,11 @@ extra_deps = {
        "cryptography >= 3.4.7",  # Storing bot passwords + keys in the config file
    ],
    "sql": [
        "oursql3 >= 0.9.4",  # Interfacing with MediaWiki databases
        "pymysql >= 1.1.0",  # Interfacing with MediaWiki databases
    ],
    "copyvios": [
        "beautifulsoup4 >= 4.9.3",  # Parsing/scraping HTML
        "cchardet >= 2.1.7",  # Encoding detection for BeautifulSoup
        "charset_normalizer >= 3.3.2",  # Encoding detection for BeautifulSoup
        "lxml >= 4.6.3",  # Faster parser for BeautifulSoup
        "nltk >= 3.6.1",  # Parsing sentences to split article content
        "pdfminer >= 20191125",  # Extracting text from PDF files
@@ -58,21 +58,21 @@ with open("README.rst") as fp:
    long_docs = fp.read()

 setup(
    name = "earwigbot",
    packages = find_packages(exclude=("tests",)),
    entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
    install_requires = dependencies,
    test_suite = "tests",
    version = __version__,
    author = "Ben Kurtovic",
    author_email = "ben.kurtovic@gmail.com",
    url = "https://github.com/earwig/earwigbot",
    description = "EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.",
    long_description = long_docs,
    download_url = "https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__),
    keywords = "earwig earwigbot irc wikipedia wiki mediawiki",
    license = "MIT License",
    classifiers = [
    name="earwigbot",
    packages=find_packages(exclude=("tests",)),
    entry_points={"console_scripts": ["earwigbot = earwigbot.util:main"]},
    install_requires=dependencies,
    test_suite="tests",
    version=__version__,
    author="Ben Kurtovic",
    author_email="ben.kurtovic@gmail.com",
    url="https://github.com/earwig/earwigbot",
    description="EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.",
    long_description=long_docs,
    download_url="https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__),
    keywords="earwig earwigbot irc wikipedia wiki mediawiki",
    license="MIT License",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Environment :: Console",
        "Intended Audience :: Developers",
@@ -81,6 +81,6 @@ setup(
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Topic :: Communications :: Chat :: Internet Relay Chat",
        "Topic :: Internet :: WWW/HTTP"
        "Topic :: Internet :: WWW/HTTP",
    ],
 )