diff --git a/docs/api/earwigbot.config.rst b/docs/api/earwigbot.config.rst index d7da88c..4626ed5 100644 --- a/docs/api/earwigbot.config.rst +++ b/docs/api/earwigbot.config.rst @@ -23,14 +23,6 @@ config Package :members: :undoc-members: -:mod:`ordered_yaml` Module --------------------------- - -.. automodule:: earwigbot.config.ordered_yaml - :members: - :undoc-members: - :show-inheritance: - :mod:`permissions` Module ------------------------- diff --git a/docs/api/earwigbot.rst b/docs/api/earwigbot.rst index e94d678..a6179aa 100644 --- a/docs/api/earwigbot.rst +++ b/docs/api/earwigbot.rst @@ -30,13 +30,6 @@ earwigbot Package :undoc-members: :show-inheritance: -:mod:`lazy` Module ------------------- - -.. automodule:: earwigbot.lazy - :members: - :undoc-members: - :mod:`managers` Module ---------------------- diff --git a/pyproject.toml b/pyproject.toml index 384e3a1..caed9a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,9 +62,6 @@ build-backend = "setuptools.build_meta" exclude = [ # TODO "src/earwigbot/commands", - "src/earwigbot/config", - "src/earwigbot/lazy.py", - "src/earwigbot/irc", "src/earwigbot/tasks", "src/earwigbot/wiki/copyvios" ] diff --git a/src/earwigbot/__init__.py b/src/earwigbot/__init__.py index 0a21d1a..4a1dce1 100644 --- a/src/earwigbot/__init__.py +++ b/src/earwigbot/__init__.py @@ -26,8 +26,17 @@ See :file:`README.rst` for an overview, or the :file:`docs/` directory for detai This documentation is also available `online `_. """ -import typing - +__all__ = [ + "bot", + "cli", + "commands", + "config", + "exceptions", + "irc", + "managers", + "tasks", + "wiki", +] __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2009-2024 Ben Kurtovic" __license__ = "MIT License" @@ -54,30 +63,14 @@ if not __release__: finally: del _get_git_commit_id -from earwigbot import lazy - -importer = lazy.LazyImporter() - -if typing.TYPE_CHECKING: - from earwigbot import ( - bot, - cli, - commands, - config, - exceptions, - irc, - managers, - tasks, - wiki, - ) - -else: - bot = importer.new("earwigbot.bot") - cli = importer.new("earwigbot.cli") - commands = importer.new("earwigbot.commands") - config = importer.new("earwigbot.config") - exceptions = importer.new("earwigbot.exceptions") - irc = importer.new("earwigbot.irc") - managers = importer.new("earwigbot.managers") - tasks = importer.new("earwigbot.tasks") - wiki = importer.new("earwigbot.wiki") +from earwigbot import ( + bot, + cli, + commands, + config, + exceptions, + irc, + managers, + tasks, + wiki, +) diff --git a/src/earwigbot/commands/crypt.py b/src/earwigbot/commands/crypt.py index 6f3a497..54d962d 100644 --- a/src/earwigbot/commands/crypt.py +++ b/src/earwigbot/commands/crypt.py @@ -22,13 +22,8 @@ import base64 import hashlib import os -from earwigbot import importer from earwigbot.commands import Command -fernet = importer.new("cryptography.fernet") -hashes = importer.new("cryptography.hazmat.primitives.hashes") -pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2") - class Crypt(Command): """Provides hash functions with !hash (!hash list for supported algorithms) @@ -73,6 +68,16 @@ class Crypt(Command): return try: + from cryptography import fernet + from cryptography.hazmat.primitives import hashes + from cryptography.hazmat.primitives.kdf import pbkdf2 + except ModuleNotFoundError: + self.reply( + data, + "This command requires the 'cryptography' package: https://cryptography.io/", + ) + + try: if data.command == "encrypt": salt = os.urandom(saltlen) kdf = pbkdf2.PBKDF2HMAC( @@ -101,10 +106,5 @@ class Crypt(Command): base64.urlsafe_b64encode(kdf.derive(key.encode())) ) self.reply(data, f.decrypt(ciphertext).decode()) - except ImportError: - self.reply( - data, - "This command requires the 'cryptography' package: https://cryptography.io/", - ) except Exception as error: self.reply(data, f"{type(error).__name__}: {str(error)}") diff --git a/src/earwigbot/config/__init__.py b/src/earwigbot/config/__init__.py index da42559..24f794b 100644 --- a/src/earwigbot/config/__init__.py +++ b/src/earwigbot/config/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -28,18 +28,12 @@ from os import mkdir, path import yaml -from earwigbot import importer from earwigbot.config.formatter import BotFormatter from earwigbot.config.node import ConfigNode -from earwigbot.config.ordered_yaml import OrderedLoader from earwigbot.config.permissions import PermissionsDB from earwigbot.config.script import ConfigScript from earwigbot.exceptions import NoConfigError -fernet = importer.new("cryptography.fernet") -hashes = importer.new("cryptography.hazmat.primitives.hashes") -pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2") - __all__ = ["BotConfig"] @@ -128,12 +122,11 @@ class BotConfig: def _load(self): """Load data from our JSON config file (config.yml) into self._data.""" - filename = self._config_path - with open(filename) as fp: + with open(self._config_path) as fp: try: - self._data = yaml.load(fp, OrderedLoader) + self._data = yaml.load(fp, yaml.CSafeLoader) except yaml.YAMLError: - print(f"Error parsing config file {filename}:") + print(f"Error parsing config file {self._config_path}:") raise def _setup_logging(self): @@ -276,9 +269,7 @@ class BotConfig: if not path.exists(self._config_path): self._handle_missing_config() self._load() - if not self._data: - self._handle_missing_config() - self._load() + assert self._data is not None self.components._load(self._data.get("components", OrderedDict())) self.wiki._load(self._data.get("wiki", OrderedDict())) @@ -291,6 +282,10 @@ class BotConfig: if self.is_encrypted(): if not self._decryption_cipher: try: + from cryptography import fernet + from cryptography.hazmat.primitives import hashes + from cryptography.hazmat.primitives.kdf import pbkdf2 + salt = self.metadata["salt"] kdf = pbkdf2.PBKDF2HMAC( algorithm=hashes.SHA256(), @@ -298,7 +293,7 @@ class BotConfig: salt=salt, iterations=ConfigScript.PBKDF_ROUNDS, ) - except ImportError: + except ModuleNotFoundError: e = "Encryption requires the 'cryptography' package: https://cryptography.io/" raise NoConfigError(e) key = getpass("Enter key to decrypt bot passwords: ") @@ -352,6 +347,7 @@ class BotConfig: "week_day": week_day, } + assert self._data is not None data = self._data.get("schedule", []) for event in data: do = True diff --git a/src/earwigbot/config/node.py b/src/earwigbot/config/node.py index 86dd4dd..a308add 100644 --- a/src/earwigbot/config/node.py +++ b/src/earwigbot/config/node.py @@ -19,22 +19,21 @@ # SOFTWARE. import base64 -from collections import OrderedDict __all__ = ["ConfigNode"] class ConfigNode: def __init__(self): - self._data = OrderedDict() + self._data = {} - def __repr__(self): - return self._data + def __repr__(self) -> str: + return repr(self._data) - def __bool__(self): + def __bool__(self) -> bool: return bool(self._data) - def __len__(self): + def __len__(self) -> int: return len(self._data) def __getitem__(self, key): diff --git a/src/earwigbot/config/ordered_yaml.py b/src/earwigbot/config/ordered_yaml.py deleted file mode 100644 index e8ecfae..0000000 --- a/src/earwigbot/config/ordered_yaml.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (C) 2009-2015 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -""" -Based on: - - * https://gist.github.com/844388 - * https://pyyaml.org/attachment/ticket/161/use_ordered_dict.py - -with modifications. -""" - -from collections import OrderedDict - -import yaml - -__all__ = ["OrderedLoader", "OrderedDumper"] - - -class OrderedLoader(yaml.Loader): - """A YAML loader that loads mappings into ordered dictionaries.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - constructor = type(self).construct_yaml_map - self.add_constructor("tag:yaml.org,2002:map", constructor) - self.add_constructor("tag:yaml.org,2002:omap", constructor) - - def construct_yaml_map(self, node): - data = OrderedDict() - yield data - value = self.construct_mapping(node) - data.update(value) - - def construct_mapping(self, node, deep=False): - if isinstance(node, yaml.MappingNode): - self.flatten_mapping(node) - else: - raise yaml.constructor.ConstructorError( - None, - None, - f"expected a mapping node, but found {node.id}", - node.start_mark, - ) - - mapping = OrderedDict() - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) - try: - hash(key) - except TypeError as exc: - raise yaml.constructor.ConstructorError( - "while constructing a mapping", - node.start_mark, - f"found unacceptable key ({exc})", - key_node.start_mark, - ) - value = self.construct_object(value_node, deep=deep) - mapping[key] = value - return mapping - - -class OrderedDumper(yaml.SafeDumper): - """A YAML dumper that dumps ordered dictionaries into mappings.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.add_representer(OrderedDict, type(self).represent_dict) - - def represent_mapping(self, tag, mapping, flow_style=None): - value = [] - node = yaml.MappingNode(tag, value, flow_style=flow_style) - if self.alias_key is not None: - self.represented_objects[self.alias_key] = node - best_style = True - if hasattr(mapping, "items"): - mapping = list(mapping.items()) - for item_key, item_value in mapping: - node_key = self.represent_data(item_key) - node_value = self.represent_data(item_value) - if not (isinstance(node_key, yaml.ScalarNode) and not node_key.style): - best_style = False - if not (isinstance(node_value, yaml.ScalarNode) and not node_value.style): - best_style = False - value.append((node_key, node_value)) - if flow_style is None: - if self.default_flow_style is not None: - node.flow_style = self.default_flow_style - else: - node.flow_style = best_style - return node diff --git a/src/earwigbot/config/script.py b/src/earwigbot/config/script.py index cfb22ea..21dfcef 100644 --- a/src/earwigbot/config/script.py +++ b/src/earwigbot/config/script.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -19,36 +19,40 @@ # SOFTWARE. import base64 +import getpass import os +import os.path import re import stat import sys -from collections import OrderedDict -from getpass import getpass -from os import chmod, makedirs, mkdir, path -from textwrap import fill, wrap +import textwrap +import typing +from typing import Any, Literal import yaml -from earwigbot import exceptions, importer -from earwigbot.config.ordered_yaml import OrderedDumper - -fernet = importer.new("cryptography.fernet") -hashes = importer.new("cryptography.hazmat.primitives.hashes") -pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2") +from earwigbot import exceptions __all__ = ["ConfigScript"] -RULES_TEMPLATE = """# -*- coding: utf-8 -*- +RULES_TEMPLATE = """\ +from earwigbot.bot import Bot +from earwigbot.irc import RC + +def process(bot: Bot, rc: RC): + \"\"\" + Return a list of channels to report this event to. -def process(bot, rc): - \"\"\"Given a Bot() object and an RC() object, return a list of channels - to report this event to. Also, start any wiki bot tasks within this - function if necessary.\"\"\" + Also, start any wiki bot tasks within this function if necessary. + \"\"\" pass """ +class RetryError(Exception): + pass + + class ConfigScript: """A script to guide a user through the creation of a new config file.""" @@ -58,17 +62,15 @@ class ConfigScript: def __init__(self, config): self.config = config - self.data = OrderedDict( - [ - ("metadata", OrderedDict()), - ("components", OrderedDict()), - ("wiki", OrderedDict()), - ("irc", OrderedDict()), - ("commands", OrderedDict()), - ("tasks", OrderedDict()), - ("schedule", []), - ] - ) + self.data = { + "metadata": {}, + "components": {}, + "wiki": {}, + "irc": {}, + "commands": {}, + "tasks": {}, + "schedule": [], + } self._cipher = None self._wmf = False @@ -76,20 +78,28 @@ class ConfigScript: self._lang = None def _print(self, text): - print(fill(re.sub(r"\s\s+", " ", text), self.WIDTH)) + print(textwrap.fill(re.sub(r"\s\s+", " ", text), self.WIDTH)) def _print_no_nl(self, text): - sys.stdout.write(fill(re.sub(r"\s\s+", " ", text), self.WIDTH)) + sys.stdout.write(textwrap.fill(re.sub(r"\s\s+", " ", text), self.WIDTH)) sys.stdout.flush() def _pause(self): input(self.PROMPT + "Press enter to continue: ") - def _ask(self, text, default=None, require=True): + @typing.overload + def _ask(self, text, default=None, require: Literal[True] = True) -> str: ... + + @typing.overload + def _ask( + self, text, default=None, require: Literal[False] = False + ) -> str | None: ... + + def _ask(self, text, default=None, require=True) -> str | None: text = self.PROMPT + text if default: text += f" \x1b[33m[{default}]\x1b[0m" - lines = wrap(re.sub(r"\s\s+", " ", text), self.WIDTH) + lines = textwrap.wrap(re.sub(r"\s\s+", " ", text), self.WIDTH) if len(lines) > 1: print("\n".join(lines[:-1])) while True: @@ -103,7 +113,7 @@ class ConfigScript: text += " \x1b[33m[Y/n]\x1b[0m" else: text += " \x1b[33m[y/N]\x1b[0m" - lines = wrap(re.sub(r"\s\s+", " ", text), self.WIDTH) + lines = textwrap.wrap(re.sub(r"\s\s+", " ", text), self.WIDTH) if len(lines) > 1: print("\n".join(lines[:-1])) while True: @@ -116,7 +126,7 @@ class ConfigScript: return False def _ask_pass(self, text, encrypt=True): - password = getpass(self.PROMPT + text + " ") + password = getpass.getpass(self.PROMPT + text + " ") if encrypt: return self._encrypt(password) return password @@ -128,7 +138,7 @@ class ConfigScript: return password def _ask_list(self, text): - print(fill(re.sub(r"\s\s+", " ", self.PROMPT + text), self.WIDTH)) + print(textwrap.fill(re.sub(r"\s\s+", " ", self.PROMPT + text), self.WIDTH)) print("[one item per line; blank line to end]:") result = [] while True: @@ -140,18 +150,24 @@ class ConfigScript: def _set_metadata(self): print() - self.data["metadata"] = OrderedDict([("version", 1)]) - self._print("""I can encrypt passwords stored in your config file in - addition to preventing other users on your system from - reading the file. Encryption is recommended if the bot - is to run on a public server like Toolforge, but the - need to enter a key every time you start the bot may be - an inconvenience.""") - self.data["metadata"]["encryptPasswords"] = False + metadata: dict[str, Any] = {"version": 1} + self.data["metadata"] = metadata + self._print( + """I can encrypt passwords stored in your config file in addition to + preventing other users on your system from reading the file. Encryption is + recommended if the bot is to run on a public server like Toolforge, but the + need to enter a key every time you start the bot may be an + inconvenience.""" + ) + metadata["encryptPasswords"] = False if self._ask_bool("Encrypt stored passwords?"): - key = getpass(self.PROMPT + "Enter an encryption key: ") + key = getpass.getpass(self.PROMPT + "Enter an encryption key: ") self._print_no_nl("Generating key...") try: + from cryptography import fernet + from cryptography.hazmat.primitives import hashes + from cryptography.hazmat.primitives.kdf import pbkdf2 + salt = os.urandom(16) kdf = pbkdf2.PBKDF2HMAC( algorithm=hashes.SHA256(), @@ -162,44 +178,52 @@ class ConfigScript: self._cipher = fernet.Fernet( base64.urlsafe_b64encode(kdf.derive(key.encode())) ) - except ImportError: + except ModuleNotFoundError: print(" error!") - self._print("""Encryption requires the 'cryptography' package: - https://cryptography.io/""") - self._print("""I will disable encryption for now; restart - configuration after installing these packages if - you want it.""") + self._print( + "Encryption requires the 'cryptography' package: https://cryptography.io/" + ) + self._print( + """I will disable encryption for now; restart configuration after + installing these packages if you want it.""" + ) self._pause() else: - self.data["metadata"]["encryptPasswords"] = True - self.data["metadata"]["salt"] = base64.b64encode(salt).decode() + metadata["encryptPasswords"] = True + metadata["salt"] = base64.b64encode(salt).decode() print(" done.") print() - self._print("""The bot can temporarily store its logs in the logs/ - subdirectory. Error logs are kept for a month whereas - normal logs are kept for a week. If you disable this, - the bot will still print logs to stdout.""") + self._print( + """The bot can temporarily store its logs in the logs/ subdirectory. Error + logs are kept for a month whereas normal logs are kept for a week. If you + disable this, the bot will still print logs to stdout.""" + ) logging = self._ask_bool("Enable logging?") - self.data["metadata"]["enableLogging"] = logging + metadata["enableLogging"] = logging def _set_components(self): print() - self._print("""The bot contains three separate components that can run - independently of each other.""") - self._print("""- The IRC front-end runs on a normal IRC server, like - Libera, and expects users to interact with it through - commands.""") - self._print("""- The IRC watcher runs on a wiki recent-changes server, - like irc.wikimedia.org, and listens for edits. Users - cannot interact with this component. It can detect - specific events and report them to "feed" channels on - the front-end or start bot tasks.""") - self._print("""- The wiki task scheduler runs wiki-editing bot tasks in - separate threads at user-defined times through a - cron-like interface. Tasks which are not scheduled can - be started by the IRC watcher manually through the IRC - front-end.""") + self._print( + """The bot contains three separate components that can run independently of + each other.""" + ) + self._print( + """- The IRC front-end runs on a normal IRC server, like Libera, and + expects users to interact with it through commands.""" + ) + self._print( + """- The IRC watcher runs on a wiki recent-changes server, like + irc.wikimedia.org, and listens for edits. Users cannot interact with this + component. It can detect specific events and report them to "feed" channels + on the front-end or start bot tasks.""" + ) + self._print( + """- The wiki task scheduler runs wiki-editing bot tasks in separate + threads at user-defined times through a cron-like interface. Tasks which + are not scheduled can be started by the IRC watcher manually through the + IRC front-end.""" + ) frontend = self._ask_bool("Enable the IRC front-end?") watcher = self._ask_bool("Enable the IRC watcher?") scheduler = self._ask_bool("Enable the wiki task scheduler?") @@ -214,17 +238,17 @@ class ConfigScript: site = self.config.bot.wiki.add_site(**kwargs) except exceptions.APIError as exc: print(" API error!") - print("\x1b[31m" + exc.message + "\x1b[0m") + print(f"\x1b[31m{exc}\x1b[0m") question = "Would you like to re-enter the site information?" if self._ask_bool(question): - return self._set_wiki() + raise RetryError() question = "This will cancel the setup process. Are you sure?" if self._ask_bool(question, default=False): raise exceptions.NoConfigError() - return self._set_wiki() + raise RetryError() except exceptions.LoginError as exc: print(" login error!") - print("\x1b[31m" + exc.message + "\x1b[0m") + print(f"\x1b[31m{exc}\x1b[0m") question = "Would you like to re-enter your login information?" if self._ask_bool(question): self.data["wiki"]["username"] = self._ask("Bot username:") @@ -235,10 +259,12 @@ class ConfigScript: password = self.data["wiki"]["password"] question = "Would you like to re-enter the site information?" if self._ask_bool(question): - return self._set_wiki() + raise RetryError() print() - self._print("""Moving on. You can modify the login information - stored in the bot's config in the future.""") + self._print( + """Moving on. You can modify the login information stored in the bot's + config in the future.""" + ) self.data["wiki"]["password"] = None # Clear so we don't login self.config.wiki._load(self.data["wiki"]) self._print_no_nl("Trying to connect to the site...") @@ -255,8 +281,9 @@ class ConfigScript: def _set_wiki(self): print() - self._wmf = self._ask_bool("""Will this bot run on Wikimedia Foundation - wikis, like Wikipedia?""") + self._wmf = self._ask_bool( + "Will this bot run on Wikimedia Foundation wikis, like Wikipedia?" + ) if self._wmf: msg = "Site project (e.g. 'wikipedia', 'wiktionary', 'wikimedia'):" self._proj = project = self._ask(msg, "wikipedia").lower() @@ -288,39 +315,32 @@ class ConfigScript: msg = "Will this bot run from the Wikimedia Tool Labs?" labs = self._ask_bool(msg, default=False) if labs: - args = [ - ("host", "$1.labsdb"), - ("db", "$1_p"), - ("read_default_file", "~/replica.my.cnf"), - ] - self.data["wiki"]["sql"] = OrderedDict(args) - else: - msg = "Will this bot run from the Wikimedia Toolserver?" - toolserver = self._ask_bool(msg, default=False) - if toolserver: - args = [("host", "$1-p.rrdb.toolserver.org"), ("db", "$1_p")] - self.data["wiki"]["sql"] = OrderedDict(args) + self.data["wiki"]["sql"] = { + "host": "$1.labsdb", + "db": "$1_p", + "read_default_file": "~/replica.my.cnf", + } self.data["wiki"]["shutoff"] = {} msg = "Would you like to enable an automatic shutoff page for the bot?" if self._ask_bool(msg): print() - self._print("""The page title can contain two wildcards: $1 will be - substituted with the bot's username, and $2 with the - current task number. This can be used to implement a - separate shutoff page for each task.""") + self._print( + """The page title can contain two wildcards: $1 will be substituted + with the bot's username, and $2 with the current task number. This can + be used to implement a separate shutoff page for each task.""" + ) page = self._ask("Page title:", "User:$1/Shutoff") msg = "Page content to indicate the bot is *not* shut off:" disabled = self._ask(msg, "run") - args = [("page", page), ("disabled", disabled)] - self.data["wiki"]["shutoff"] = OrderedDict(args) + self.data["wiki"]["shutoff"] = {"page": page, "disabled": disabled} self.data["wiki"]["search"] = {} def _set_irc(self): if self.data["components"]["irc_frontend"]: print() - frontend = self.data["irc"]["frontend"] = OrderedDict() + frontend = self.data["irc"]["frontend"] = {} frontend["host"] = self._ask( "Hostname of the frontend's IRC server:", "irc.libera.chat" ) @@ -339,14 +359,14 @@ class ConfigScript: chan_question = "Frontend channels to join by default:" frontend["channels"] = self._ask_list(chan_question) print() - self._print("""The bot keeps a database of its admins (users who - can use certain sensitive commands) and owners - (users who can quit the bot and modify its access - list), identified by nick, ident, and/or hostname. - Hostname is the most secure option since it cannot - be easily spoofed. If you have a cloak, this will - probably look like 'wikipedia/Username' or - 'user/nickname'.""") + self._print( + """The bot keeps a database of its admins (users who can use certain + sensitive commands) and owners (users who can quit the bot and modify + its access list), identified by nick, ident, and/or hostname. Hostname + is the most secure option since it cannot be easily spoofed. If you + have a cloak, this will probably look like 'wikipedia/Username' or + 'user/nickname'.""" + ) host = self._ask("Your hostname on the frontend:", require=False) if host: permdb = self.config._permissions @@ -358,7 +378,7 @@ class ConfigScript: if self.data["components"]["irc_watcher"]: print() - watcher = self.data["irc"]["watcher"] = OrderedDict() + watcher = self.data["irc"]["watcher"] = {} if self._wmf: watcher["host"] = "irc.wikimedia.org" watcher["port"] = 6667 @@ -386,14 +406,14 @@ class ConfigScript: chan_question = "Watcher channels to join by default:" watcher["channels"] = self._ask_list(chan_question) print() - self._print("""I am now creating a blank 'rules.py' file, which - will determine how the bot handles messages received - from the IRC watcher. It contains a process() - function that takes a Bot object (allowing you to - start tasks) and an RC object (storing the message - from the watcher). See the documentation for - details.""") - with open(path.join(self.config.root_dir, "rules.py"), "w") as fp: + self._print( + """I am now creating a blank 'rules.py' file, which will determine how + the bot handles messages received from the IRC watcher. It contains a + process() function that takes a Bot object (allowing you to start + tasks) and an RC object (storing the message from the watcher). See the + documentation for details.""" + ) + with open(os.path.join(self.config.root_dir, "rules.py"), "w") as fp: fp.write(RULES_TEMPLATE) self._pause() @@ -403,47 +423,55 @@ class ConfigScript: def _set_commands(self): print() - msg = """Would you like to disable the default IRC commands? You can - fine-tune which commands are disabled later on.""" + msg = """Would you like to disable the default IRC commands? You can fine-tune + which commands are disabled later on.""" if not self.data["components"]["irc_frontend"] or self._ask_bool( msg, default=False ): self.data["commands"]["disable"] = True print() - self._print("""I am now creating the 'commands/' directory, where you - can place custom IRC commands and plugins. Creating your - own commands is described in the documentation.""") - mkdir(path.join(self.config.root_dir, "commands")) + self._print( + """I am now creating the 'commands/' directory, where you can place custom + IRC commands and plugins. Creating your own commands is described in the + documentation.""" + ) + os.mkdir(os.path.join(self.config.root_dir, "commands")) self._pause() def _set_tasks(self): print() - self._print("""I am now creating the 'tasks/' directory, where you can - place custom bot tasks and plugins. Creating your own - tasks is described in the documentation.""") - mkdir(path.join(self.config.root_dir, "tasks")) + self._print( + """I am now creating the 'tasks/' directory, where you can place custom bot + tasks and plugins. Creating your own tasks is described in the + documentation.""" + ) + os.mkdir(os.path.join(self.config.root_dir, "tasks")) self._pause() def _set_schedule(self): print() - self._print("""The final section of your config file, 'schedule', is a - list of bot tasks to be started by the wiki scheduler. - Each entry contains cron-like time quantifiers and a - list of tasks. For example, the following starts the - 'foobot' task every hour on the half-hour:""") + self._print( + """The final section of your config file, 'schedule', is a list of bot + tasks to be started by the wiki scheduler. Each entry contains cron-like + time quantifiers and a list of tasks. For example, the following starts the + 'foobot' task every hour on the half-hour:""" + ) print("\x1b[33mschedule:") print(" - minute: 30") print(" tasks:") print(" - foobot\x1b[0m") - self._print("""The following starts the 'barbot' task with the keyword - arguments 'action="baz"' every Monday at 05:00 UTC:""") + self._print( + """The following starts the 'barbot' task with the keyword arguments + 'action="baz"' every Monday at 05:00 UTC:""" + ) print("\x1b[33m - week_day: 1") print(" hour: 5") print(" tasks:") print(' - ["barbot", {"action": "baz"}]\x1b[0m') - self._print("""The full list of quantifiers is minute, hour, month_day, - month, and week_day. See the documentation for more - information.""") + self._print( + """The full list of quantifiers is minute, hour, month_day, month, and + week_day. See the documentation for more information.""" + ) self._pause() def _save(self): @@ -451,7 +479,7 @@ class ConfigScript: yaml.dump( self.data, stream, - OrderedDumper, + yaml.CSafeDumper, indent=4, allow_unicode=True, default_flow_style=False, @@ -460,19 +488,24 @@ class ConfigScript: def make_new(self): """Make a new config file based on the user's input.""" try: - makedirs(path.dirname(self.config.path)) + os.makedirs(os.path.dirname(self.config.path)) except OSError as exc: if exc.errno != 17: raise try: open(self.config.path, "w").close() - chmod(self.config.path, stat.S_IRUSR | stat.S_IWUSR) + os.chmod(self.config.path, stat.S_IRUSR | stat.S_IWUSR) except OSError: print("I can't seem to write to the config file:") raise self._set_metadata() self._set_components() - self._set_wiki() + while True: + try: + self._set_wiki() + break + except RetryError: + continue components = self.data["components"] if components["irc_frontend"] or components["irc_watcher"]: self._set_irc() @@ -481,12 +514,12 @@ class ConfigScript: if components["wiki_scheduler"]: self._set_schedule() print() - self._print("""I am now saving config.yml with your settings. YAML is a - relatively straightforward format and you should be able - to update these settings in the future when necessary. - I will start the bot at your signal. Feel free to - contact me at wikipedia.earwig@gmail.com if you have any - questions.""") + self._print( + """I am now saving config.yml with your settings. YAML is a relatively + straightforward format and you should be able to update these settings in + the future when necessary. I will start the bot at your signal. Feel free + to contact me at wikipedia.earwig@gmail.com if you have any questions.""" + ) self._save() if not self._ask_bool("Start the bot now?"): exit() diff --git a/src/earwigbot/irc/__init__.py b/src/earwigbot/irc/__init__.py index f4a2be5..b251451 100644 --- a/src/earwigbot/irc/__init__.py +++ b/src/earwigbot/irc/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,8 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from earwigbot.irc.connection import * -from earwigbot.irc.data import * -from earwigbot.irc.frontend import * -from earwigbot.irc.rc import * -from earwigbot.irc.watcher import * +__all__ = ["Data", "Frontend", "IRCConnection", "RC", "Watcher"] + +from earwigbot.irc.connection import IRCConnection +from earwigbot.irc.data import Data +from earwigbot.irc.frontend import Frontend +from earwigbot.irc.rc import RC +from earwigbot.irc.watcher import Watcher diff --git a/src/earwigbot/irc/connection.py b/src/earwigbot/irc/connection.py index 823becd..5fba7f1 100644 --- a/src/earwigbot/irc/connection.py +++ b/src/earwigbot/irc/connection.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,14 +18,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +__all__ = ["IRCConnection"] + import socket from threading import Lock from time import sleep, time from earwigbot.exceptions import BrokenSocketError -__all__ = ["IRCConnection"] - class IRCConnection: """Interface with an IRC server.""" diff --git a/src/earwigbot/irc/data.py b/src/earwigbot/irc/data.py index b150438..788716d 100644 --- a/src/earwigbot/irc/data.py +++ b/src/earwigbot/irc/data.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,10 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import re - __all__ = ["Data"] +import re + class Data: """Store data from an individual line received on IRC.""" @@ -78,6 +78,7 @@ class Data: bot's name); self.is_command will be set to True, and self.trigger will store the trigger string. Otherwise, is_command will be set to False. """ + assert self.msg is not None self._args = self.msg.strip().split() try: @@ -87,16 +88,16 @@ class Data: return # e.g. "!command>user arg1 arg2" - if ">" in self.command: + if ">" in self._command: command_uc, self._reply_nick = command_uc.split(">", 1) self._command = command_uc.lower() - if self.command.startswith("!") or self.command.startswith("."): + if self._command.startswith("!") or self._command.startswith("."): # e.g. "!command arg1 arg2" self._is_command = True - self._trigger = self.command[0] - self._command = self.command[1:] # Strip the "!" or "." - elif re.match(rf"{re.escape(self.my_nick)}\W*?$", self.command, re.U): + self._trigger = self._command[0] + self._command = self._command[1:] # Strip the "!" or "." + elif re.match(rf"{re.escape(self.my_nick)}\W*?$", self._command, re.U): # e.g. "EarwigBot, command arg1 arg2" self._is_command = True self._trigger = self.my_nick @@ -110,7 +111,7 @@ class Data: if self.args: self.args[-1] = self.args[-1][:-1] else: - self._command = self.command[:-1] + self._command = self._command[:-1] except IndexError: pass diff --git a/src/earwigbot/irc/frontend.py b/src/earwigbot/irc/frontend.py index 4bbe5c7..d7c9734 100644 --- a/src/earwigbot/irc/frontend.py +++ b/src/earwigbot/irc/frontend.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2021 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,11 +18,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from time import sleep +__all__ = ["Frontend"] -from earwigbot.irc import Data, IRCConnection +import time -__all__ = ["Frontend"] +from earwigbot.irc import Data, IRCConnection class Frontend(IRCConnection): @@ -121,10 +121,11 @@ class Frontend(IRCConnection): elif line[1] == "NOTICE": data = Data(self.nick, line, msgtype="NOTICE") if self._auth_wait and data.nick == self.NICK_SERVICES: + assert data.msg is not None if data.msg.startswith("This nickname is registered."): return self._auth_wait = False - sleep(2) # Wait for hostname change to propagate + time.sleep(2) # Wait for hostname change to propagate self._join_channels() elif line[1] == "KICK": diff --git a/src/earwigbot/irc/rc.py b/src/earwigbot/irc/rc.py index be29441..ffb8664 100644 --- a/src/earwigbot/irc/rc.py +++ b/src/earwigbot/irc/rc.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2021 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,19 +18,19 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import re - __all__ = ["RC"] +import re + class RC: """Store data from an event received from our IRC watcher.""" re_color = re.compile("\x03([0-9]{1,2}(,[0-9]{1,2})?)?") re_edit = re.compile( - "\A\[\[(.*?)\]\]\s(.*?)\s(https?://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z" + r"\A\[\[(.*?)\]\]\s(.*?)\s(https?://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z" ) - re_log = re.compile("\A\[\[(.*?)\]\]\s(.*?)\s\s\*\s(.*?)\s\*\s(.*?)\Z") + re_log = re.compile(r"\A\[\[(.*?)\]\]\s(.*?)\s\s\*\s(.*?)\s\*\s(.*?)\Z") pretty_edit = "\x02New {0}\x0f: \x0314[[\x0307{1}\x0314]]\x0306 * \x0303{2}\x0306 * \x0302{3}\x0306 * \x0310{4}" pretty_log = "\x02New {0}\x0f: \x0303{1}\x0306 * \x0302{2}\x0306 * \x0310{3}" diff --git a/src/earwigbot/lazy.py b/src/earwigbot/lazy.py deleted file mode 100644 index 21c0351..0000000 --- a/src/earwigbot/lazy.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (C) 2009-2024 Ben Kurtovic -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -""" -Implements a hierarchy of importing classes as defined in `PEP 302 -`_ to load modules in a safe yet lazy -manner, so that they can be referred to by name but are not actually loaded -until they are used (i.e. their attributes are read or modified). -""" - -import importlib -import sys -from threading import RLock -from types import ModuleType - -__all__ = ["LazyImporter"] - -_real_get = ModuleType.__getattribute__ -_lazy_init_lock = RLock() - - -def _create_failing_get(exc): - def _fail(self, attr): - raise exc - - return _fail - - -def _mock_get(self, attr): - with _real_get(self, "_lock"): - if _real_get(self, "_unloaded"): - type(self)._unloaded = False - try: - importlib.reload(self) - except ImportError as exc: - type(self).__getattribute__ = _create_failing_get(exc) - del type(self)._lock - raise - type(self).__getattribute__ = _real_get - del type(self)._lock - return _real_get(self, attr) - - -class _LazyModule(type): - def __new__(cls, name): - with _lazy_init_lock: - if name not in sys.modules: - attributes = { - "__name__": name, - "__getattribute__": _mock_get, - "_unloaded": True, - "_lock": RLock(), - } - parents = (ModuleType,) - klass = type.__new__(cls, "module", parents, attributes) - sys.modules[name] = klass(name) - if "." in name: # Also ensure the parent exists - _LazyModule(name.rsplit(".", 1)[0]) - return sys.modules[name] - - -class LazyImporter: - """An importer for modules that are loaded lazily. - - This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of - :py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`). - """ - - def __init__(self): - self._modules = {} - sys.meta_path.append(self) - - def new(self, name): - module = _LazyModule(name) - self._modules[name] = module - return module - - def find_module(self, fullname, path=None): - if fullname in self._modules and fullname not in sys.modules: - return self - - def load_module(self, fullname): - return self._modules.pop(fullname) diff --git a/src/earwigbot/wiki/copyvios/__init__.py b/src/earwigbot/wiki/copyvios/__init__.py index dd8eca8..4602e28 100644 --- a/src/earwigbot/wiki/copyvios/__init__.py +++ b/src/earwigbot/wiki/copyvios/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,7 +18,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from time import sleep +import time from urllib.request import build_opener from earwigbot import exceptions @@ -73,7 +73,7 @@ class CopyvioMixIn: for dep in klass.requirements(): try: __import__(dep).__name__ - except (ImportError, AttributeError): + except (ModuleNotFoundError, AttributeError): e = "Missing a required dependency ({}) for the {} engine" e = e.format(dep, engine) raise exceptions.UnsupportedSearchEngineError(e) @@ -173,7 +173,7 @@ class CopyvioMixIn: self._logger.debug(log.format(self.title, searcher.name, chunk)) workspace.enqueue(searcher.search(chunk)) num_queries += 1 - sleep(1) + time.sleep(1) workspace.wait() result = workspace.get_result(num_queries) diff --git a/src/earwigbot/wiki/copyvios/exclusions.py b/src/earwigbot/wiki/copyvios/exclusions.py index 8e8994a..6634cf0 100644 --- a/src/earwigbot/wiki/copyvios/exclusions.py +++ b/src/earwigbot/wiki/copyvios/exclusions.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -19,10 +19,10 @@ # SOFTWARE. import re -import sqlite3 as sqlite -from threading import Lock -from time import time -from urllib.parse import urlparse +import sqlite3 +import threading +import time +import urllib.parse from earwigbot import exceptions @@ -60,7 +60,7 @@ class ExclusionsDB: self._sitesdb = sitesdb self._dbfile = dbfile self._logger = logger - self._db_access_lock = Lock() + self._db_access_lock = threading.Lock() def __repr__(self): """Return the canonical string representation of the ExclusionsDB.""" @@ -84,7 +84,7 @@ class ExclusionsDB: for page in pages: sources.append((sitename, page)) - with sqlite.connect(self._dbfile) as conn: + with sqlite3.connect(self._dbfile) as conn: conn.executescript(script) conn.executemany(query, sources) @@ -139,7 +139,7 @@ class ExclusionsDB: site = self._sitesdb.get_site("enwiki") else: site = self._sitesdb.get_site(sitename) - with self._db_access_lock, sqlite.connect(self._dbfile) as conn: + with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: urls = set() for (source,) in conn.execute(query1, (sitename,)): urls |= self._load_source(site, source) @@ -150,17 +150,17 @@ class ExclusionsDB: conn.execute(query3, (sitename, url)) conn.executemany(query4, [(sitename, url) for url in urls]) if conn.execute(query5, (sitename,)).fetchone(): - conn.execute(query6, (int(time()), sitename)) + conn.execute(query6, (int(time.time()), sitename)) else: - conn.execute(query7, (sitename, int(time()))) + conn.execute(query7, (sitename, int(time.time()))) def _get_last_update(self, sitename): """Return the UNIX timestamp of the last time the db was updated.""" query = "SELECT update_time FROM updates WHERE update_sitename = ?" - with self._db_access_lock, sqlite.connect(self._dbfile) as conn: + with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: try: result = conn.execute(query, (sitename,)).fetchone() - except sqlite.OperationalError: + except sqlite3.OperationalError: self._create() return 0 return result[0] if result else 0 @@ -174,7 +174,7 @@ class ExclusionsDB: after 12 hours. """ max_staleness = 60 * 60 * (12 if sitename == "all" else 48) - time_since_update = int(time() - self._get_last_update(sitename)) + time_since_update = int(time.time() - self._get_last_update(sitename)) if force or time_since_update > max_staleness: log = "Updating stale database: {0} (last updated {1} seconds ago)" self._logger.info(log.format(sitename, time_since_update)) @@ -191,10 +191,10 @@ class ExclusionsDB: Return ``True`` if the URL is in the database, or ``False`` otherwise. """ normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower()) - parsed = urlparse(url.lower()) + parsed = urllib.parse.urlparse(url.lower()) query = """SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ? OR exclusion_sitename = ?""" - with self._db_access_lock, sqlite.connect(self._dbfile) as conn: + with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: for (excl,) in conn.execute(query, (sitename, "all")): excl = excl.lower() if excl.startswith("*."): @@ -231,7 +231,7 @@ class ExclusionsDB: certain HTML tag attributes (``"href"`` and ``"src"``). """ site = page.site - path = urlparse(page.url).path + path = urllib.parse.urlparse(page.url).path roots = [site.domain] scripts = ["index.php", "load.php", "api.php"] diff --git a/src/earwigbot/wiki/copyvios/markov.py b/src/earwigbot/wiki/copyvios/markov.py index 7b3c486..5cf7a7f 100644 --- a/src/earwigbot/wiki/copyvios/markov.py +++ b/src/earwigbot/wiki/copyvios/markov.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,7 +18,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from re import UNICODE, sub +import re __all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", "MarkovChainIntersection"] @@ -38,7 +38,7 @@ class MarkovChain: def _build(self): """Build and return the Markov chain from the input text.""" padding = self.degree - 1 - words = sub(r"[^\w\s-]", "", self.text.lower(), flags=UNICODE).split() + words = re.sub(r"[^\w\s-]", "", self.text.lower(), flags=re.UNICODE).split() words = ([self.START] * padding) + words + ([self.END] * padding) chain = {} diff --git a/src/earwigbot/wiki/copyvios/parsers.py b/src/earwigbot/wiki/copyvios/parsers.py index 4d3720c..09553e6 100644 --- a/src/earwigbot/wiki/copyvios/parsers.py +++ b/src/earwigbot/wiki/copyvios/parsers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2019 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,24 +18,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import io import json +import os.path import re import urllib.parse import urllib.request -from io import StringIO -from os import path import mwparserfromhell -from earwigbot import importer from earwigbot.exceptions import ParserExclusionError, ParserRedirectError -bs4 = importer.new("bs4") -nltk = importer.new("nltk") -converter = importer.new("pdfminer.converter") -pdfinterp = importer.new("pdfminer.pdfinterp") -pdfpage = importer.new("pdfminer.pdfpage") - __all__ = ["ArticleTextParser", "get_parser"] @@ -101,9 +94,10 @@ class ArticleTextParser(_BaseTextParser): def _get_tokenizer(self): """Return a NLTK punctuation tokenizer for the article's language.""" + import nltk def datafile(lang): - return "file:" + path.join( + return "file:" + os.path.join( self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle" ) @@ -213,11 +207,11 @@ class ArticleTextParser(_BaseTextParser): elif len(chunks) % 5 == 1: chunk = sentences.pop() # Pop from end elif len(chunks) % 5 == 2: - chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 + chunk = sentences.pop(len(sentences) // 2) # Pop from Q2 elif len(chunks) % 5 == 3: - chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 + chunk = sentences.pop(len(sentences) // 4) # Pop from Q1 else: - chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 + chunk = sentences.pop(3 * len(sentences) // 4) # Pop from Q3 chunks.append(chunk) return chunks @@ -256,6 +250,8 @@ class _HTMLParser(_BaseTextParser): @staticmethod def _get_soup(text): """Parse some text using BeautifulSoup.""" + import bs4 + try: return bs4.BeautifulSoup(text, "lxml") except ValueError: @@ -263,6 +259,7 @@ class _HTMLParser(_BaseTextParser): def _clean_soup(self, soup): """Clean a BeautifulSoup tree of invisible tags.""" + import bs4 def is_comment(text): return isinstance(text, bs4.element.Comment) @@ -353,21 +350,23 @@ class _PDFParser(_BaseTextParser): def parse(self): """Return extracted text from the PDF.""" - output = StringIO() + from pdfminer import converter, pdfinterp, pdfpage + + output = io.StringIO() manager = pdfinterp.PDFResourceManager() conv = converter.TextConverter(manager, output) interp = pdfinterp.PDFPageInterpreter(manager, conv) try: - pages = pdfpage.PDFPage.get_pages(StringIO(self.text)) + pages = pdfpage.PDFPage.get_pages(io.StringIO(self.text)) for page in pages: interp.process_page(page) except Exception: # pylint: disable=broad-except - return output.getvalue().decode("utf8") + return output.getvalue() finally: conv.close() - value = output.getvalue().decode("utf8") + value = output.getvalue() for orig, new in self.substitutions: value = value.replace(orig, new) return re.sub(r"\n\n+", "\n", value).strip() @@ -380,7 +379,9 @@ class _PlainTextParser(_BaseTextParser): def parse(self): """Unicode-ify and strip whitespace from the plain text document.""" - converted = bs4.UnicodeDammit(self.text).unicode_markup + from bs4.dammit import UnicodeDammit + + converted = UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/src/earwigbot/wiki/copyvios/result.py b/src/earwigbot/wiki/copyvios/result.py index 937bad3..75436c7 100644 --- a/src/earwigbot/wiki/copyvios/result.py +++ b/src/earwigbot/wiki/copyvios/result.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/src/earwigbot/wiki/copyvios/search.py b/src/earwigbot/wiki/copyvios/search.py index 1a7d47c..bc5f9fa 100644 --- a/src/earwigbot/wiki/copyvios/search.py +++ b/src/earwigbot/wiki/copyvios/search.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2024 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,18 +18,15 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import re from gzip import GzipFile from io import StringIO from json import loads -from re import sub as re_sub from urllib.error import URLError from urllib.parse import urlencode -from earwigbot import importer from earwigbot.exceptions import SearchQueryError -lxml = importer.new("lxml") - __all__ = [ "BingSearchEngine", "GoogleSearchEngine", @@ -104,7 +101,7 @@ class BingSearchEngine(_BaseSearchEngine): auth = (key + ":" + key).encode("base64").replace("\n", "") self.opener.addheaders.append(("Authorization", "Basic " + auth)) - def search(self, query): + def search(self, query: str) -> list[str]: """Do a Bing web search for *query*. Returns a list of URLs ranked by relevance (as determined by Bing). @@ -142,7 +139,7 @@ class GoogleSearchEngine(_BaseSearchEngine): name = "Google" - def search(self, query): + def search(self, query: str) -> list[str]: """Do a Google web search for *query*. Returns a list of URLs ranked by relevance (as determined by Google). @@ -153,7 +150,7 @@ class GoogleSearchEngine(_BaseSearchEngine): params = { "cx": self.cred["id"], "key": self.cred["key"], - "q": '"' + query.replace('"', "").encode("utf8") + '"', + "q": '"' + query.replace('"', "") + '"', "alt": "json", "num": str(self.count), "safe": "off", @@ -183,15 +180,17 @@ class YandexSearchEngine(_BaseSearchEngine): def requirements(): return ["lxml.etree"] - def search(self, query): + def search(self, query: str) -> list[str]: """Do a Yandex web search for *query*. Returns a list of URLs ranked by relevance (as determined by Yandex). Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ + import lxml.etree + domain = self.cred.get("proxy", "yandex.com") url = f"https://{domain}/search/xml?" - query = re_sub(r"[^a-zA-Z0-9 ]", "", query).encode("utf8") + query = re.sub(r"[^a-zA-Z0-9 ]", "", query) params = { "user": self.cred["user"], "key": self.cred["key"], @@ -205,7 +204,7 @@ class YandexSearchEngine(_BaseSearchEngine): result = self._open(url + urlencode(params)) try: - data = lxml.etree.fromstring(result) + data = lxml.etree.fromstring(result) # type: ignore return [elem.text for elem in data.xpath(".//url")] except lxml.etree.Error as exc: raise SearchQueryError("Yandex XML parse error: " + str(exc)) diff --git a/src/earwigbot/wiki/site.py b/src/earwigbot/wiki/site.py index e8ca6da..cdb5f62 100644 --- a/src/earwigbot/wiki/site.py +++ b/src/earwigbot/wiki/site.py @@ -35,7 +35,7 @@ import requests from requests.cookies import RequestsCookieJar from requests_oauthlib import OAuth1 -from earwigbot import exceptions, importer +from earwigbot import exceptions from earwigbot.wiki import constants from earwigbot.wiki.category import Category from earwigbot.wiki.constants import Service @@ -47,7 +47,11 @@ if typing.TYPE_CHECKING: import pymysql.cursors from pymysql.cursors import Cursor else: - pymysql = importer.new("pymysql") + try: + import pymysql + import pymysql.cursors + except ModuleNotFoundError: + pymysql = None __all__ = ["Site"] @@ -711,11 +715,11 @@ class Site: if "autoreconnect" not in args: args["autoreconnect"] = True - try: - return pymysql.connect(**args) - except ImportError: - e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/" - raise exceptions.SQLError(e) + if pymysql is None: + raise exceptions.SQLError( + "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/" + ) + return pymysql.connect(**args) def _get_service_order(self) -> list[Service]: """ @@ -731,6 +735,10 @@ class Site: lag is also very high. self.SERVICE_SQL will not be included in the list if we cannot form a proper SQL connection. """ + if pymysql is None: + self._sql_info_cache["usable"] = False + return [Service.API] + now = time.time() if now - self._sql_info_cache["lastcheck"] > 120: self._sql_info_cache["lastcheck"] = now @@ -739,7 +747,7 @@ class Site: self._sql_info_cache["replag"] = sqllag = self.get_replag() except pymysql.Error as exc: raise exceptions.SQLError(str(exc)) - except (exceptions.SQLError, ImportError): + except exceptions.SQLError: self._sql_info_cache["usable"] = False return [Service.API] self._sql_info_cache["usable"] = True