Browse Source

More Python 3 conversion

tags/v0.4
Ben Kurtovic 4 months ago
parent
commit
1343ccb0d5
22 changed files with 329 additions and 520 deletions
  1. +0
    -7
      docs/api/earwigbot.config.rst
  2. +0
    -6
      docs/api/earwigbot.rst
  3. +0
    -3
      pyproject.toml
  4. +22
    -29
      src/earwigbot/__init__.py
  5. +10
    -10
      src/earwigbot/commands/crypt.py
  6. +11
    -15
      src/earwigbot/config/__init__.py
  7. +5
    -6
      src/earwigbot/config/node.py
  8. +0
    -108
      src/earwigbot/config/ordered_yaml.py
  9. +179
    -146
      src/earwigbot/config/script.py
  10. +8
    -6
      src/earwigbot/irc/__init__.py
  11. +3
    -3
      src/earwigbot/irc/connection.py
  12. +10
    -9
      src/earwigbot/irc/data.py
  13. +6
    -5
      src/earwigbot/irc/frontend.py
  14. +5
    -5
      src/earwigbot/irc/rc.py
  15. +0
    -100
      src/earwigbot/lazy.py
  16. +4
    -4
      src/earwigbot/wiki/copyvios/__init__.py
  17. +16
    -16
      src/earwigbot/wiki/copyvios/exclusions.py
  18. +3
    -3
      src/earwigbot/wiki/copyvios/markov.py
  19. +20
    -19
      src/earwigbot/wiki/copyvios/parsers.py
  20. +1
    -1
      src/earwigbot/wiki/copyvios/result.py
  21. +10
    -11
      src/earwigbot/wiki/copyvios/search.py
  22. +16
    -8
      src/earwigbot/wiki/site.py

+ 0
- 7
docs/api/earwigbot.config.rst View File

@@ -23,14 +23,6 @@ config Package
:members:
:undoc-members:

:mod:`ordered_yaml` Module

.. automodule:: earwigbot.config.ordered_yaml
:members:
:undoc-members:
:show-inheritance:

:mod:`permissions` Module
-------------------------



+ 0
- 6
docs/api/earwigbot.rst View File

@@ -30,13 +30,6 @@ earwigbot Package
:undoc-members:
:show-inheritance:

:mod:`lazy` Module

.. automodule:: earwigbot.lazy
:members:
:undoc-members:

:mod:`managers` Module
----------------------



+ 0
- 3
pyproject.toml View File

@@ -62,9 +62,6 @@ build-backend = "setuptools.build_meta"
exclude = [
# TODO
"src/earwigbot/commands",
"src/earwigbot/config",
"src/earwigbot/lazy.py",
"src/earwigbot/irc",
"src/earwigbot/tasks",
"src/earwigbot/wiki/copyvios"
]


+ 22
- 29
src/earwigbot/__init__.py View File

@@ -26,8 +26,17 @@ See :file:`README.rst` for an overview, or the :file:`docs/` directory for detai
This documentation is also available `online <https://packages.python.org/earwigbot>`_.
"""

import typing

__all__ = [
"bot",
"cli",
"commands",
"config",
"exceptions",
"irc",
"managers",
"tasks",
"wiki",
]
__author__ = "Ben Kurtovic"
__copyright__ = "Copyright (C) 2009-2024 Ben Kurtovic"
__license__ = "MIT License"
@@ -54,30 +63,14 @@ if not __release__:
finally:
del _get_git_commit_id

from earwigbot import lazy

importer = lazy.LazyImporter()

if typing.TYPE_CHECKING:
from earwigbot import (
bot,
cli,
commands,
config,
exceptions,
irc,
managers,
tasks,
wiki,
)

else:
bot = importer.new("earwigbot.bot")
cli = importer.new("earwigbot.cli")
commands = importer.new("earwigbot.commands")
config = importer.new("earwigbot.config")
exceptions = importer.new("earwigbot.exceptions")
irc = importer.new("earwigbot.irc")
managers = importer.new("earwigbot.managers")
tasks = importer.new("earwigbot.tasks")
wiki = importer.new("earwigbot.wiki")
from earwigbot import (
bot,
cli,
commands,
config,
exceptions,
irc,
managers,
tasks,
wiki,
)

+ 10
- 10
src/earwigbot/commands/crypt.py View File

@@ -22,13 +22,8 @@ import base64
import hashlib
import os

from earwigbot import importer
from earwigbot.commands import Command

fernet = importer.new("cryptography.fernet")
hashes = importer.new("cryptography.hazmat.primitives.hashes")
pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2")


class Crypt(Command):
"""Provides hash functions with !hash (!hash list for supported algorithms)
@@ -73,6 +68,16 @@ class Crypt(Command):
return

try:
from cryptography import fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf import pbkdf2
except ModuleNotFoundError:
self.reply(
data,
"This command requires the 'cryptography' package: https://cryptography.io/",
)

try:
if data.command == "encrypt":
salt = os.urandom(saltlen)
kdf = pbkdf2.PBKDF2HMAC(
@@ -101,10 +106,5 @@ class Crypt(Command):
base64.urlsafe_b64encode(kdf.derive(key.encode()))
)
self.reply(data, f.decrypt(ciphertext).decode())
except ImportError:
self.reply(
data,
"This command requires the 'cryptography' package: https://cryptography.io/",
)
except Exception as error:
self.reply(data, f"{type(error).__name__}: {str(error)}")

+ 11
- 15
src/earwigbot/config/__init__.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -28,18 +28,12 @@ from os import mkdir, path

import yaml

from earwigbot import importer
from earwigbot.config.formatter import BotFormatter
from earwigbot.config.node import ConfigNode
from earwigbot.config.ordered_yaml import OrderedLoader
from earwigbot.config.permissions import PermissionsDB
from earwigbot.config.script import ConfigScript
from earwigbot.exceptions import NoConfigError

fernet = importer.new("cryptography.fernet")
hashes = importer.new("cryptography.hazmat.primitives.hashes")
pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2")

__all__ = ["BotConfig"]


@@ -128,12 +122,11 @@ class BotConfig:

def _load(self):
"""Load data from our YAML config file (config.yml) into self._data."""
filename = self._config_path
with open(filename) as fp:
with open(self._config_path) as fp:
try:
self._data = yaml.load(fp, OrderedLoader)
self._data = yaml.load(fp, yaml.CSafeLoader)
except yaml.YAMLError:
print(f"Error parsing config file {filename}:")
print(f"Error parsing config file {self._config_path}:")
raise

def _setup_logging(self):
@@ -276,9 +269,7 @@ class BotConfig:
if not path.exists(self._config_path):
self._handle_missing_config()
self._load()
if not self._data:
self._handle_missing_config()
self._load()
assert self._data is not None

self.components._load(self._data.get("components", OrderedDict()))
self.wiki._load(self._data.get("wiki", OrderedDict()))
@@ -291,6 +282,10 @@ class BotConfig:
if self.is_encrypted():
if not self._decryption_cipher:
try:
from cryptography import fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf import pbkdf2

salt = self.metadata["salt"]
kdf = pbkdf2.PBKDF2HMAC(
algorithm=hashes.SHA256(),
@@ -298,7 +293,7 @@ class BotConfig:
salt=salt,
iterations=ConfigScript.PBKDF_ROUNDS,
)
except ImportError:
except ModuleNotFoundError:
e = "Encryption requires the 'cryptography' package: https://cryptography.io/"
raise NoConfigError(e)
key = getpass("Enter key to decrypt bot passwords: ")
@@ -352,6 +347,7 @@ class BotConfig:
"week_day": week_day,
}

assert self._data is not None
data = self._data.get("schedule", [])
for event in data:
do = True


+ 5
- 6
src/earwigbot/config/node.py View File

@@ -19,22 +19,21 @@
# SOFTWARE.

import base64
from collections import OrderedDict

__all__ = ["ConfigNode"]


class ConfigNode:
def __init__(self):
self._data = OrderedDict()
self._data = {}

def __repr__(self):
return self._data
def __repr__(self) -> str:
return repr(self._data)

def __bool__(self):
def __bool__(self) -> bool:
return bool(self._data)

def __len__(self):
def __len__(self) -> int:
return len(self._data)

def __getitem__(self, key):


+ 0
- 108
src/earwigbot/config/ordered_yaml.py View File

@@ -1,108 +0,0 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Based on:

* https://gist.github.com/844388
* https://pyyaml.org/attachment/ticket/161/use_ordered_dict.py

with modifications.
"""

from collections import OrderedDict

import yaml

__all__ = ["OrderedLoader", "OrderedDumper"]


class OrderedLoader(yaml.Loader):
    """A YAML loader that loads mappings into ordered dictionaries.

    NOTE(review): this derives from ``yaml.Loader`` rather than
    ``yaml.SafeLoader`` -- confirm that only trusted documents are ever parsed
    with it.
    """

    def __init__(self, *args, **kwargs):
        """Create the loader and route ``map``/``omap`` tags to the ordered
        mapping constructor defined on this class."""
        super().__init__(*args, **kwargs)
        constructor = type(self).construct_yaml_map
        self.add_constructor("tag:yaml.org,2002:map", constructor)
        self.add_constructor("tag:yaml.org,2002:omap", constructor)

    def construct_yaml_map(self, node):
        """Yield an empty ``OrderedDict`` for *node*, then populate it.

        This is a generator: the empty mapping is yielded first, and it is
        only filled from :meth:`construct_mapping` once the generator is
        resumed.
        """
        data = OrderedDict()
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_mapping(self, node, deep=False):
        """Build an ``OrderedDict`` from the key/value pairs of *node*.

        Pairs are inserted in the order they appear in ``node.value``.

        Raises ``yaml.constructor.ConstructorError`` if *node* is not a
        mapping node, or if one of its keys is not hashable.
        """
        if not isinstance(node, yaml.MappingNode):
            raise yaml.constructor.ConstructorError(
                None,
                None,
                f"expected a mapping node, but found {node.id}",
                node.start_mark,
            )
        self.flatten_mapping(node)

        mapping = OrderedDict()
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            try:
                hash(key)
            except TypeError as exc:
                # Chain explicitly so the original TypeError is reported as
                # the direct cause of the constructor error.
                raise yaml.constructor.ConstructorError(
                    "while constructing a mapping",
                    node.start_mark,
                    f"found unacceptable key ({exc})",
                    key_node.start_mark,
                ) from exc
            mapping[key] = self.construct_object(value_node, deep=deep)
        return mapping


class OrderedDumper(yaml.SafeDumper):
    """A YAML dumper that dumps ordered dictionaries into mappings.

    Entries are emitted in the order the mapping yields them; no sorting is
    applied in this class.
    """

    def __init__(self, *args, **kwargs):
        """Create the dumper and register ``OrderedDict`` to be represented
        with this class's ``represent_dict``."""
        super().__init__(*args, **kwargs)
        cls = type(self)
        self.add_representer(OrderedDict, cls.represent_dict)

    def represent_mapping(self, tag, mapping, flow_style=None):
        """Return a ``yaml.MappingNode`` for *mapping* under *tag*.

        When *flow_style* is ``None`` the node's flow style falls back to
        ``self.default_flow_style`` or, if that is also ``None``, to whether
        every key and value was represented as an unstyled scalar node.
        """
        pairs = []
        node = yaml.MappingNode(tag, pairs, flow_style=flow_style)
        # The node is recorded before any child is represented; presumably so
        # nested references to the same object resolve to it -- TODO confirm.
        if self.alias_key is not None:
            self.represented_objects[self.alias_key] = node

        entries = list(mapping.items()) if hasattr(mapping, "items") else mapping

        all_plain = True
        for raw_key, raw_value in entries:
            key_node = self.represent_data(raw_key)
            value_node = self.represent_data(raw_value)
            for child in (key_node, value_node):
                if not isinstance(child, yaml.ScalarNode) or child.style:
                    all_plain = False
            pairs.append((key_node, value_node))

        if flow_style is None:
            default = self.default_flow_style
            node.flow_style = all_plain if default is None else default
        return node

+ 179
- 146
src/earwigbot/config/script.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -19,36 +19,40 @@
# SOFTWARE.

import base64
import getpass
import os
import os.path
import re
import stat
import sys
from collections import OrderedDict
from getpass import getpass
from os import chmod, makedirs, mkdir, path
from textwrap import fill, wrap
import textwrap
import typing
from typing import Any, Literal

import yaml

from earwigbot import exceptions, importer
from earwigbot.config.ordered_yaml import OrderedDumper

fernet = importer.new("cryptography.fernet")
hashes = importer.new("cryptography.hazmat.primitives.hashes")
pbkdf2 = importer.new("cryptography.hazmat.primitives.kdf.pbkdf2")
from earwigbot import exceptions

__all__ = ["ConfigScript"]

RULES_TEMPLATE = """# -*- coding: utf-8 -*-
RULES_TEMPLATE = """\
from earwigbot.bot import Bot
from earwigbot.irc import RC

def process(bot: Bot, rc: RC):
\"\"\"
Return a list of channels to report this event to.

def process(bot, rc):
\"\"\"Given a Bot() object and an RC() object, return a list of channels
to report this event to. Also, start any wiki bot tasks within this
function if necessary.\"\"\"
Also, start any wiki bot tasks within this function if necessary.
\"\"\"
pass
"""


class RetryError(Exception):
    """Signal that the wiki setup step should be run again from the start.

    Raised with no arguments by the setup helpers; ``make_new`` catches it and
    calls ``_set_wiki`` again.
    """


class ConfigScript:
"""A script to guide a user through the creation of a new config file."""

@@ -58,17 +62,15 @@ class ConfigScript:

def __init__(self, config):
self.config = config
self.data = OrderedDict(
[
("metadata", OrderedDict()),
("components", OrderedDict()),
("wiki", OrderedDict()),
("irc", OrderedDict()),
("commands", OrderedDict()),
("tasks", OrderedDict()),
("schedule", []),
]
)
self.data = {
"metadata": {},
"components": {},
"wiki": {},
"irc": {},
"commands": {},
"tasks": {},
"schedule": [],
}

self._cipher = None
self._wmf = False
@@ -76,20 +78,28 @@ class ConfigScript:
self._lang = None

def _print(self, text):
print(fill(re.sub(r"\s\s+", " ", text), self.WIDTH))
print(textwrap.fill(re.sub(r"\s\s+", " ", text), self.WIDTH))

def _print_no_nl(self, text):
sys.stdout.write(fill(re.sub(r"\s\s+", " ", text), self.WIDTH))
sys.stdout.write(textwrap.fill(re.sub(r"\s\s+", " ", text), self.WIDTH))
sys.stdout.flush()

def _pause(self):
input(self.PROMPT + "Press enter to continue: ")

def _ask(self, text, default=None, require=True):
@typing.overload
def _ask(self, text, default=None, require: Literal[True] = True) -> str: ...

@typing.overload
def _ask(
self, text, default=None, require: Literal[False] = False
) -> str | None: ...

def _ask(self, text, default=None, require=True) -> str | None:
text = self.PROMPT + text
if default:
text += f" \x1b[33m[{default}]\x1b[0m"
lines = wrap(re.sub(r"\s\s+", " ", text), self.WIDTH)
lines = textwrap.wrap(re.sub(r"\s\s+", " ", text), self.WIDTH)
if len(lines) > 1:
print("\n".join(lines[:-1]))
while True:
@@ -103,7 +113,7 @@ class ConfigScript:
text += " \x1b[33m[Y/n]\x1b[0m"
else:
text += " \x1b[33m[y/N]\x1b[0m"
lines = wrap(re.sub(r"\s\s+", " ", text), self.WIDTH)
lines = textwrap.wrap(re.sub(r"\s\s+", " ", text), self.WIDTH)
if len(lines) > 1:
print("\n".join(lines[:-1]))
while True:
@@ -116,7 +126,7 @@ class ConfigScript:
return False

def _ask_pass(self, text, encrypt=True):
password = getpass(self.PROMPT + text + " ")
password = getpass.getpass(self.PROMPT + text + " ")
if encrypt:
return self._encrypt(password)
return password
@@ -128,7 +138,7 @@ class ConfigScript:
return password

def _ask_list(self, text):
print(fill(re.sub(r"\s\s+", " ", self.PROMPT + text), self.WIDTH))
print(textwrap.fill(re.sub(r"\s\s+", " ", self.PROMPT + text), self.WIDTH))
print("[one item per line; blank line to end]:")
result = []
while True:
@@ -140,18 +150,24 @@ class ConfigScript:

def _set_metadata(self):
print()
self.data["metadata"] = OrderedDict([("version", 1)])
self._print("""I can encrypt passwords stored in your config file in
addition to preventing other users on your system from
reading the file. Encryption is recommended if the bot
is to run on a public server like Toolforge, but the
need to enter a key every time you start the bot may be
an inconvenience.""")
self.data["metadata"]["encryptPasswords"] = False
metadata: dict[str, Any] = {"version": 1}
self.data["metadata"] = metadata
self._print(
"""I can encrypt passwords stored in your config file in addition to
preventing other users on your system from reading the file. Encryption is
recommended if the bot is to run on a public server like Toolforge, but the
need to enter a key every time you start the bot may be an
inconvenience."""
)
metadata["encryptPasswords"] = False
if self._ask_bool("Encrypt stored passwords?"):
key = getpass(self.PROMPT + "Enter an encryption key: ")
key = getpass.getpass(self.PROMPT + "Enter an encryption key: ")
self._print_no_nl("Generating key...")
try:
from cryptography import fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf import pbkdf2

salt = os.urandom(16)
kdf = pbkdf2.PBKDF2HMAC(
algorithm=hashes.SHA256(),
@@ -162,44 +178,52 @@ class ConfigScript:
self._cipher = fernet.Fernet(
base64.urlsafe_b64encode(kdf.derive(key.encode()))
)
except ImportError:
except ModuleNotFoundError:
print(" error!")
self._print("""Encryption requires the 'cryptography' package:
https://cryptography.io/""")
self._print("""I will disable encryption for now; restart
configuration after installing these packages if
you want it.""")
self._print(
"Encryption requires the 'cryptography' package: https://cryptography.io/"
)
self._print(
"""I will disable encryption for now; restart configuration after
installing these packages if you want it."""
)
self._pause()
else:
self.data["metadata"]["encryptPasswords"] = True
self.data["metadata"]["salt"] = base64.b64encode(salt).decode()
metadata["encryptPasswords"] = True
metadata["salt"] = base64.b64encode(salt).decode()
print(" done.")

print()
self._print("""The bot can temporarily store its logs in the logs/
subdirectory. Error logs are kept for a month whereas
normal logs are kept for a week. If you disable this,
the bot will still print logs to stdout.""")
self._print(
"""The bot can temporarily store its logs in the logs/ subdirectory. Error
logs are kept for a month whereas normal logs are kept for a week. If you
disable this, the bot will still print logs to stdout."""
)
logging = self._ask_bool("Enable logging?")
self.data["metadata"]["enableLogging"] = logging
metadata["enableLogging"] = logging

def _set_components(self):
print()
self._print("""The bot contains three separate components that can run
independently of each other.""")
self._print("""- The IRC front-end runs on a normal IRC server, like
Libera, and expects users to interact with it through
commands.""")
self._print("""- The IRC watcher runs on a wiki recent-changes server,
like irc.wikimedia.org, and listens for edits. Users
cannot interact with this component. It can detect
specific events and report them to "feed" channels on
the front-end or start bot tasks.""")
self._print("""- The wiki task scheduler runs wiki-editing bot tasks in
separate threads at user-defined times through a
cron-like interface. Tasks which are not scheduled can
be started by the IRC watcher manually through the IRC
front-end.""")
self._print(
"""The bot contains three separate components that can run independently of
each other."""
)
self._print(
"""- The IRC front-end runs on a normal IRC server, like Libera, and
expects users to interact with it through commands."""
)
self._print(
"""- The IRC watcher runs on a wiki recent-changes server, like
irc.wikimedia.org, and listens for edits. Users cannot interact with this
component. It can detect specific events and report them to "feed" channels
on the front-end or start bot tasks."""
)
self._print(
"""- The wiki task scheduler runs wiki-editing bot tasks in separate
threads at user-defined times through a cron-like interface. Tasks which
are not scheduled can be started by the IRC watcher manually through the
IRC front-end."""
)
frontend = self._ask_bool("Enable the IRC front-end?")
watcher = self._ask_bool("Enable the IRC watcher?")
scheduler = self._ask_bool("Enable the wiki task scheduler?")
@@ -214,17 +238,17 @@ class ConfigScript:
site = self.config.bot.wiki.add_site(**kwargs)
except exceptions.APIError as exc:
print(" API error!")
print("\x1b[31m" + exc.message + "\x1b[0m")
print(f"\x1b[31m{exc}\x1b[0m")
question = "Would you like to re-enter the site information?"
if self._ask_bool(question):
return self._set_wiki()
raise RetryError()
question = "This will cancel the setup process. Are you sure?"
if self._ask_bool(question, default=False):
raise exceptions.NoConfigError()
return self._set_wiki()
raise RetryError()
except exceptions.LoginError as exc:
print(" login error!")
print("\x1b[31m" + exc.message + "\x1b[0m")
print(f"\x1b[31m{exc}\x1b[0m")
question = "Would you like to re-enter your login information?"
if self._ask_bool(question):
self.data["wiki"]["username"] = self._ask("Bot username:")
@@ -235,10 +259,12 @@ class ConfigScript:
password = self.data["wiki"]["password"]
question = "Would you like to re-enter the site information?"
if self._ask_bool(question):
return self._set_wiki()
raise RetryError()
print()
self._print("""Moving on. You can modify the login information
stored in the bot's config in the future.""")
self._print(
"""Moving on. You can modify the login information stored in the bot's
config in the future."""
)
self.data["wiki"]["password"] = None # Clear so we don't login
self.config.wiki._load(self.data["wiki"])
self._print_no_nl("Trying to connect to the site...")
@@ -255,8 +281,9 @@ class ConfigScript:

def _set_wiki(self):
print()
self._wmf = self._ask_bool("""Will this bot run on Wikimedia Foundation
wikis, like Wikipedia?""")
self._wmf = self._ask_bool(
"Will this bot run on Wikimedia Foundation wikis, like Wikipedia?"
)
if self._wmf:
msg = "Site project (e.g. 'wikipedia', 'wiktionary', 'wikimedia'):"
self._proj = project = self._ask(msg, "wikipedia").lower()
@@ -288,39 +315,32 @@ class ConfigScript:
msg = "Will this bot run from the Wikimedia Tool Labs?"
labs = self._ask_bool(msg, default=False)
if labs:
args = [
("host", "$1.labsdb"),
("db", "$1_p"),
("read_default_file", "~/replica.my.cnf"),
]
self.data["wiki"]["sql"] = OrderedDict(args)
else:
msg = "Will this bot run from the Wikimedia Toolserver?"
toolserver = self._ask_bool(msg, default=False)
if toolserver:
args = [("host", "$1-p.rrdb.toolserver.org"), ("db", "$1_p")]
self.data["wiki"]["sql"] = OrderedDict(args)
self.data["wiki"]["sql"] = {
"host": "$1.labsdb",
"db": "$1_p",
"read_default_file": "~/replica.my.cnf",
}

self.data["wiki"]["shutoff"] = {}
msg = "Would you like to enable an automatic shutoff page for the bot?"
if self._ask_bool(msg):
print()
self._print("""The page title can contain two wildcards: $1 will be
substituted with the bot's username, and $2 with the
current task number. This can be used to implement a
separate shutoff page for each task.""")
self._print(
"""The page title can contain two wildcards: $1 will be substituted
with the bot's username, and $2 with the current task number. This can
be used to implement a separate shutoff page for each task."""
)
page = self._ask("Page title:", "User:$1/Shutoff")
msg = "Page content to indicate the bot is *not* shut off:"
disabled = self._ask(msg, "run")
args = [("page", page), ("disabled", disabled)]
self.data["wiki"]["shutoff"] = OrderedDict(args)
self.data["wiki"]["shutoff"] = {"page": page, "disabled": disabled}

self.data["wiki"]["search"] = {}

def _set_irc(self):
if self.data["components"]["irc_frontend"]:
print()
frontend = self.data["irc"]["frontend"] = OrderedDict()
frontend = self.data["irc"]["frontend"] = {}
frontend["host"] = self._ask(
"Hostname of the frontend's IRC server:", "irc.libera.chat"
)
@@ -339,14 +359,14 @@ class ConfigScript:
chan_question = "Frontend channels to join by default:"
frontend["channels"] = self._ask_list(chan_question)
print()
self._print("""The bot keeps a database of its admins (users who
can use certain sensitive commands) and owners
(users who can quit the bot and modify its access
list), identified by nick, ident, and/or hostname.
Hostname is the most secure option since it cannot
be easily spoofed. If you have a cloak, this will
probably look like 'wikipedia/Username' or
'user/nickname'.""")
self._print(
"""The bot keeps a database of its admins (users who can use certain
sensitive commands) and owners (users who can quit the bot and modify
its access list), identified by nick, ident, and/or hostname. Hostname
is the most secure option since it cannot be easily spoofed. If you
have a cloak, this will probably look like 'wikipedia/Username' or
'user/nickname'."""
)
host = self._ask("Your hostname on the frontend:", require=False)
if host:
permdb = self.config._permissions
@@ -358,7 +378,7 @@ class ConfigScript:

if self.data["components"]["irc_watcher"]:
print()
watcher = self.data["irc"]["watcher"] = OrderedDict()
watcher = self.data["irc"]["watcher"] = {}
if self._wmf:
watcher["host"] = "irc.wikimedia.org"
watcher["port"] = 6667
@@ -386,14 +406,14 @@ class ConfigScript:
chan_question = "Watcher channels to join by default:"
watcher["channels"] = self._ask_list(chan_question)
print()
self._print("""I am now creating a blank 'rules.py' file, which
will determine how the bot handles messages received
from the IRC watcher. It contains a process()
function that takes a Bot object (allowing you to
start tasks) and an RC object (storing the message
from the watcher). See the documentation for
details.""")
with open(path.join(self.config.root_dir, "rules.py"), "w") as fp:
self._print(
"""I am now creating a blank 'rules.py' file, which will determine how
the bot handles messages received from the IRC watcher. It contains a
process() function that takes a Bot object (allowing you to start
tasks) and an RC object (storing the message from the watcher). See the
documentation for details."""
)
with open(os.path.join(self.config.root_dir, "rules.py"), "w") as fp:
fp.write(RULES_TEMPLATE)
self._pause()

@@ -403,47 +423,55 @@ class ConfigScript:

def _set_commands(self):
print()
msg = """Would you like to disable the default IRC commands? You can
fine-tune which commands are disabled later on."""
msg = """Would you like to disable the default IRC commands? You can fine-tune
which commands are disabled later on."""
if not self.data["components"]["irc_frontend"] or self._ask_bool(
msg, default=False
):
self.data["commands"]["disable"] = True
print()
self._print("""I am now creating the 'commands/' directory, where you
can place custom IRC commands and plugins. Creating your
own commands is described in the documentation.""")
mkdir(path.join(self.config.root_dir, "commands"))
self._print(
"""I am now creating the 'commands/' directory, where you can place custom
IRC commands and plugins. Creating your own commands is described in the
documentation."""
)
os.mkdir(os.path.join(self.config.root_dir, "commands"))
self._pause()

def _set_tasks(self):
print()
self._print("""I am now creating the 'tasks/' directory, where you can
place custom bot tasks and plugins. Creating your own
tasks is described in the documentation.""")
mkdir(path.join(self.config.root_dir, "tasks"))
self._print(
"""I am now creating the 'tasks/' directory, where you can place custom bot
tasks and plugins. Creating your own tasks is described in the
documentation."""
)
os.mkdir(os.path.join(self.config.root_dir, "tasks"))
self._pause()

def _set_schedule(self):
print()
self._print("""The final section of your config file, 'schedule', is a
list of bot tasks to be started by the wiki scheduler.
Each entry contains cron-like time quantifiers and a
list of tasks. For example, the following starts the
'foobot' task every hour on the half-hour:""")
self._print(
"""The final section of your config file, 'schedule', is a list of bot
tasks to be started by the wiki scheduler. Each entry contains cron-like
time quantifiers and a list of tasks. For example, the following starts the
'foobot' task every hour on the half-hour:"""
)
print("\x1b[33mschedule:")
print(" - minute: 30")
print(" tasks:")
print(" - foobot\x1b[0m")
self._print("""The following starts the 'barbot' task with the keyword
arguments 'action="baz"' every Monday at 05:00 UTC:""")
self._print(
"""The following starts the 'barbot' task with the keyword arguments
'action="baz"' every Monday at 05:00 UTC:"""
)
print("\x1b[33m - week_day: 1")
print(" hour: 5")
print(" tasks:")
print(' - ["barbot", {"action": "baz"}]\x1b[0m')
self._print("""The full list of quantifiers is minute, hour, month_day,
month, and week_day. See the documentation for more
information.""")
self._print(
"""The full list of quantifiers is minute, hour, month_day, month, and
week_day. See the documentation for more information."""
)
self._pause()

def _save(self):
@@ -451,7 +479,7 @@ class ConfigScript:
yaml.dump(
self.data,
stream,
OrderedDumper,
yaml.CSafeDumper,
indent=4,
allow_unicode=True,
default_flow_style=False,
@@ -460,19 +488,24 @@ class ConfigScript:
def make_new(self):
"""Make a new config file based on the user's input."""
try:
makedirs(path.dirname(self.config.path))
os.makedirs(os.path.dirname(self.config.path))
except OSError as exc:
if exc.errno != 17:
raise
try:
open(self.config.path, "w").close()
chmod(self.config.path, stat.S_IRUSR | stat.S_IWUSR)
os.chmod(self.config.path, stat.S_IRUSR | stat.S_IWUSR)
except OSError:
print("I can't seem to write to the config file:")
raise
self._set_metadata()
self._set_components()
self._set_wiki()
while True:
try:
self._set_wiki()
break
except RetryError:
continue
components = self.data["components"]
if components["irc_frontend"] or components["irc_watcher"]:
self._set_irc()
@@ -481,12 +514,12 @@ class ConfigScript:
if components["wiki_scheduler"]:
self._set_schedule()
print()
self._print("""I am now saving config.yml with your settings. YAML is a
relatively straightforward format and you should be able
to update these settings in the future when necessary.
I will start the bot at your signal. Feel free to
contact me at wikipedia.earwig@gmail.com if you have any
questions.""")
self._print(
"""I am now saving config.yml with your settings. YAML is a relatively
straightforward format and you should be able to update these settings in
the future when necessary. I will start the bot at your signal. Feel free
to contact me at wikipedia.earwig@gmail.com if you have any questions."""
)
self._save()
if not self._ask_bool("Start the bot now?"):
exit()

+ 8
- 6
src/earwigbot/irc/__init__.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,8 +18,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from earwigbot.irc.connection import *
from earwigbot.irc.data import *
from earwigbot.irc.frontend import *
from earwigbot.irc.rc import *
from earwigbot.irc.watcher import *
__all__ = ["Data", "Frontend", "IRCConnection", "RC", "Watcher"]

from earwigbot.irc.connection import IRCConnection
from earwigbot.irc.data import Data
from earwigbot.irc.frontend import Frontend
from earwigbot.irc.rc import RC
from earwigbot.irc.watcher import Watcher

+ 3
- 3
src/earwigbot/irc/connection.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,14 +18,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["IRCConnection"]

import socket
from threading import Lock
from time import sleep, time

from earwigbot.exceptions import BrokenSocketError

__all__ = ["IRCConnection"]


class IRCConnection:
"""Interface with an IRC server."""


+ 10
- 9
src/earwigbot/irc/data.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,10 +18,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re

__all__ = ["Data"]

import re


class Data:
"""Store data from an individual line received on IRC."""
@@ -78,6 +78,7 @@ class Data:
bot's name); self.is_command will be set to True, and self.trigger will
store the trigger string. Otherwise, is_command will be set to False.
"""
assert self.msg is not None
self._args = self.msg.strip().split()

try:
@@ -87,16 +88,16 @@ class Data:
return

# e.g. "!command>user arg1 arg2"
if ">" in self.command:
if ">" in self._command:
command_uc, self._reply_nick = command_uc.split(">", 1)
self._command = command_uc.lower()

if self.command.startswith("!") or self.command.startswith("."):
if self._command.startswith("!") or self._command.startswith("."):
# e.g. "!command arg1 arg2"
self._is_command = True
self._trigger = self.command[0]
self._command = self.command[1:] # Strip the "!" or "."
elif re.match(rf"{re.escape(self.my_nick)}\W*?$", self.command, re.U):
self._trigger = self._command[0]
self._command = self._command[1:] # Strip the "!" or "."
elif re.match(rf"{re.escape(self.my_nick)}\W*?$", self._command, re.U):
# e.g. "EarwigBot, command arg1 arg2"
self._is_command = True
self._trigger = self.my_nick
@@ -110,7 +111,7 @@ class Data:
if self.args:
self.args[-1] = self.args[-1][:-1]
else:
self._command = self.command[:-1]
self._command = self._command[:-1]
except IndexError:
pass



+ 6
- 5
src/earwigbot/irc/frontend.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,11 +18,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from time import sleep
__all__ = ["Frontend"]

from earwigbot.irc import Data, IRCConnection
import time

__all__ = ["Frontend"]
from earwigbot.irc import Data, IRCConnection


class Frontend(IRCConnection):
@@ -121,10 +121,11 @@ class Frontend(IRCConnection):
elif line[1] == "NOTICE":
data = Data(self.nick, line, msgtype="NOTICE")
if self._auth_wait and data.nick == self.NICK_SERVICES:
assert data.msg is not None
if data.msg.startswith("This nickname is registered."):
return
self._auth_wait = False
sleep(2) # Wait for hostname change to propagate
time.sleep(2) # Wait for hostname change to propagate
self._join_channels()

elif line[1] == "KICK":


+ 5
- 5
src/earwigbot/irc/rc.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,19 +18,19 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re

__all__ = ["RC"]

import re


class RC:
"""Store data from an event received from our IRC watcher."""

re_color = re.compile("\x03([0-9]{1,2}(,[0-9]{1,2})?)?")
re_edit = re.compile(
"\A\[\[(.*?)\]\]\s(.*?)\s(https?://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z"
r"\A\[\[(.*?)\]\]\s(.*?)\s(https?://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z"
)
re_log = re.compile("\A\[\[(.*?)\]\]\s(.*?)\s\s\*\s(.*?)\s\*\s(.*?)\Z")
re_log = re.compile(r"\A\[\[(.*?)\]\]\s(.*?)\s\s\*\s(.*?)\s\*\s(.*?)\Z")

pretty_edit = "\x02New {0}\x0f: \x0314[[\x0307{1}\x0314]]\x0306 * \x0303{2}\x0306 * \x0302{3}\x0306 * \x0310{4}"
pretty_log = "\x02New {0}\x0f: \x0303{1}\x0306 * \x0302{2}\x0306 * \x0310{3}"


+ 0
- 100
src/earwigbot/lazy.py View File

@@ -1,100 +0,0 @@
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Implements a hierarchy of importing classes as defined in `PEP 302
<https://www.python.org/dev/peps/pep-0302/>`_ to load modules in a safe yet lazy
manner, so that they can be referred to by name but are not actually loaded
until they are used (i.e. their attributes are read or modified).
"""

import importlib
import sys
from threading import RLock
from types import ModuleType

__all__ = ["LazyImporter"]

# Unpatched module attribute lookup. _mock_get uses it to read a lazy module's
# own attributes without going back through the overridden __getattribute__.
_real_get = ModuleType.__getattribute__
# Re-entrant lock held by _LazyModule.__new__ while it checks and populates
# sys.modules; re-entrant because that method calls itself for parent packages.
_lazy_init_lock = RLock()


def _create_failing_get(exc):
def _fail(self, attr):
raise exc

return _fail


def _mock_get(self, attr):
    """Stand-in ``__getattribute__`` that loads the real module on first use.

    :py:class:`_LazyModule` installs this on the per-module type it builds.
    While the module is still flagged as unloaded, the first access reloads it
    with :py:func:`importlib.reload` and then restores the normal module
    attribute lookup on the type. If the reload raises :py:exc:`ImportError`,
    the type is instead given a getter that re-raises that same error on every
    later access, and the error propagates to the caller.

    Returns the value of *attr* looked up with the unpatched getter.
    """
    with _real_get(self, "_lock"):
        if _real_get(self, "_unloaded"):
            # Clear the flag before reloading. NOTE(review): presumably so that
            # attribute reads made during the reload itself skip this branch
            # instead of recursing -- confirm against importlib.reload.
            type(self)._unloaded = False
            try:
                importlib.reload(self)
            except ImportError as exc:
                # Make the failure permanent: every later access re-raises exc.
                type(self).__getattribute__ = _create_failing_get(exc)
                del type(self)._lock
                raise
            # Loaded successfully: drop the hook and the now-unneeded lock.
            type(self).__getattribute__ = _real_get
            del type(self)._lock
        return _real_get(self, attr)


class _LazyModule(type):
    """Factory (written as a metaclass) for placeholder modules.

    Calling ``_LazyModule(name)`` does not return a class. It returns the
    entry for *name* in :py:data:`sys.modules`, first registering a
    placeholder module there (and placeholders for any missing parent
    packages) when the name is absent.
    """

    def __new__(cls, name):
        with _lazy_init_lock:
            if name in sys.modules:
                return sys.modules[name]

            # Each placeholder gets its own type, so the attribute hook, the
            # unloaded flag and the lock are per-module rather than shared.
            namespace = {
                "__name__": name,
                "__getattribute__": _mock_get,
                "_unloaded": True,
                "_lock": RLock(),
            }
            module_type = type.__new__(cls, "module", (ModuleType,), namespace)
            sys.modules[name] = module_type(name)

            package, dot, _ = name.rpartition(".")
            if dot:  # Dotted name: make sure the parent package exists too
                _LazyModule(package)
            return sys.modules[name]


class LazyImporter:
    r"""An importer for modules that are loaded lazily.

    This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of
    :py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`).

    NOTE(review): only the legacy ``find_module``/``load_module`` finder
    protocol is implemented; check whether the targeted Python versions still
    consult it on meta path entries.
    """

    def __init__(self):
        """Create an empty registry and append this object to the meta path."""
        self._modules = {}
        sys.meta_path.append(self)

    def new(self, name):
        """Create (or fetch) the lazy module *name*, remember it, and return it."""
        module = _LazyModule(name)
        self._modules[name] = module
        return module

    def find_module(self, fullname, path=None):
        """Return this importer if it should supply *fullname*, else ``None``.

        A name is claimed only when it was registered through :py:meth:`new`
        and is currently absent from :py:data:`sys.modules`. *path* is
        accepted for protocol compatibility but is not used.
        """
        if fullname in self._modules and fullname not in sys.modules:
            return self
        return None

    def load_module(self, fullname):
        """Remove and return the registered module for *fullname*.

        Raises :py:exc:`KeyError` if *fullname* is not in the registry.
        """
        return self._modules.pop(fullname)

+ 4
- 4
src/earwigbot/wiki/copyvios/__init__.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,7 +18,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from time import sleep
import time
from urllib.request import build_opener

from earwigbot import exceptions
@@ -73,7 +73,7 @@ class CopyvioMixIn:
for dep in klass.requirements():
try:
__import__(dep).__name__
except (ImportError, AttributeError):
except (ModuleNotFoundError, AttributeError):
e = "Missing a required dependency ({}) for the {} engine"
e = e.format(dep, engine)
raise exceptions.UnsupportedSearchEngineError(e)
@@ -173,7 +173,7 @@ class CopyvioMixIn:
self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk))
num_queries += 1
sleep(1)
time.sleep(1)

workspace.wait()
result = workspace.get_result(num_queries)


+ 16
- 16
src/earwigbot/wiki/copyvios/exclusions.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -19,10 +19,10 @@
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time
from urllib.parse import urlparse
import sqlite3
import threading
import time
import urllib.parse

from earwigbot import exceptions

@@ -60,7 +60,7 @@ class ExclusionsDB:
self._sitesdb = sitesdb
self._dbfile = dbfile
self._logger = logger
self._db_access_lock = Lock()
self._db_access_lock = threading.Lock()

def __repr__(self):
"""Return the canonical string representation of the ExclusionsDB."""
@@ -84,7 +84,7 @@ class ExclusionsDB:
for page in pages:
sources.append((sitename, page))

with sqlite.connect(self._dbfile) as conn:
with sqlite3.connect(self._dbfile) as conn:
conn.executescript(script)
conn.executemany(query, sources)

@@ -139,7 +139,7 @@ class ExclusionsDB:
site = self._sitesdb.get_site("enwiki")
else:
site = self._sitesdb.get_site(sitename)
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn:
urls = set()
for (source,) in conn.execute(query1, (sitename,)):
urls |= self._load_source(site, source)
@@ -150,17 +150,17 @@ class ExclusionsDB:
conn.execute(query3, (sitename, url))
conn.executemany(query4, [(sitename, url) for url in urls])
if conn.execute(query5, (sitename,)).fetchone():
conn.execute(query6, (int(time()), sitename))
conn.execute(query6, (int(time.time()), sitename))
else:
conn.execute(query7, (sitename, int(time())))
conn.execute(query7, (sitename, int(time.time())))

def _get_last_update(self, sitename):
"""Return the UNIX timestamp of the last time the db was updated."""
query = "SELECT update_time FROM updates WHERE update_sitename = ?"
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn:
try:
result = conn.execute(query, (sitename,)).fetchone()
except sqlite.OperationalError:
except sqlite3.OperationalError:
self._create()
return 0
return result[0] if result else 0
@@ -174,7 +174,7 @@ class ExclusionsDB:
after 12 hours.
"""
max_staleness = 60 * 60 * (12 if sitename == "all" else 48)
time_since_update = int(time() - self._get_last_update(sitename))
time_since_update = int(time.time() - self._get_last_update(sitename))
if force or time_since_update > max_staleness:
log = "Updating stale database: {0} (last updated {1} seconds ago)"
self._logger.info(log.format(sitename, time_since_update))
@@ -191,10 +191,10 @@ class ExclusionsDB:
Return ``True`` if the URL is in the database, or ``False`` otherwise.
"""
normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower())
parsed = urlparse(url.lower())
parsed = urllib.parse.urlparse(url.lower())
query = """SELECT exclusion_url FROM exclusions
WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn:
for (excl,) in conn.execute(query, (sitename, "all")):
excl = excl.lower()
if excl.startswith("*."):
@@ -231,7 +231,7 @@ class ExclusionsDB:
certain HTML tag attributes (``"href"`` and ``"src"``).
"""
site = page.site
path = urlparse(page.url).path
path = urllib.parse.urlparse(page.url).path
roots = [site.domain]
scripts = ["index.php", "load.php", "api.php"]



+ 3
- 3
src/earwigbot/wiki/copyvios/markov.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,7 +18,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from re import UNICODE, sub
import re

__all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", "MarkovChainIntersection"]

@@ -38,7 +38,7 @@ class MarkovChain:
def _build(self):
"""Build and return the Markov chain from the input text."""
padding = self.degree - 1
words = sub(r"[^\w\s-]", "", self.text.lower(), flags=UNICODE).split()
words = re.sub(r"[^\w\s-]", "", self.text.lower(), flags=re.UNICODE).split()
words = ([self.START] * padding) + words + ([self.END] * padding)
chain = {}



+ 20
- 19
src/earwigbot/wiki/copyvios/parsers.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,24 +18,17 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import io
import json
import os.path
import re
import urllib.parse
import urllib.request
from io import StringIO
from os import path

import mwparserfromhell

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError

bs4 = importer.new("bs4")
nltk = importer.new("nltk")
converter = importer.new("pdfminer.converter")
pdfinterp = importer.new("pdfminer.pdfinterp")
pdfpage = importer.new("pdfminer.pdfpage")

__all__ = ["ArticleTextParser", "get_parser"]


@@ -101,9 +94,10 @@ class ArticleTextParser(_BaseTextParser):

def _get_tokenizer(self):
"""Return a NLTK punctuation tokenizer for the article's language."""
import nltk

def datafile(lang):
return "file:" + path.join(
return "file:" + os.path.join(
self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle"
)

@@ -213,11 +207,11 @@ class ArticleTextParser(_BaseTextParser):
elif len(chunks) % 5 == 1:
chunk = sentences.pop() # Pop from end
elif len(chunks) % 5 == 2:
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2
chunk = sentences.pop(len(sentences) // 2) # Pop from Q2
elif len(chunks) % 5 == 3:
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1
chunk = sentences.pop(len(sentences) // 4) # Pop from Q1
else:
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3
chunk = sentences.pop(3 * len(sentences) // 4) # Pop from Q3
chunks.append(chunk)
return chunks

@@ -256,6 +250,8 @@ class _HTMLParser(_BaseTextParser):
@staticmethod
def _get_soup(text):
"""Parse some text using BeautifulSoup."""
import bs4

try:
return bs4.BeautifulSoup(text, "lxml")
except ValueError:
@@ -263,6 +259,7 @@ class _HTMLParser(_BaseTextParser):

def _clean_soup(self, soup):
"""Clean a BeautifulSoup tree of invisible tags."""
import bs4

def is_comment(text):
return isinstance(text, bs4.element.Comment)
@@ -353,21 +350,23 @@ class _PDFParser(_BaseTextParser):

def parse(self):
"""Return extracted text from the PDF."""
output = StringIO()
from pdfminer import converter, pdfinterp, pdfpage

output = io.StringIO()
manager = pdfinterp.PDFResourceManager()
conv = converter.TextConverter(manager, output)
interp = pdfinterp.PDFPageInterpreter(manager, conv)

try:
pages = pdfpage.PDFPage.get_pages(StringIO(self.text))
pages = pdfpage.PDFPage.get_pages(io.StringIO(self.text))
for page in pages:
interp.process_page(page)
except Exception: # pylint: disable=broad-except
return output.getvalue().decode("utf8")
return output.getvalue()
finally:
conv.close()

value = output.getvalue().decode("utf8")
value = output.getvalue()
for orig, new in self.substitutions:
value = value.replace(orig, new)
return re.sub(r"\n\n+", "\n", value).strip()
@@ -380,7 +379,9 @@ class _PlainTextParser(_BaseTextParser):

def parse(self):
"""Unicode-ify and strip whitespace from the plain text document."""
converted = bs4.UnicodeDammit(self.text).unicode_markup
from bs4.dammit import UnicodeDammit

converted = UnicodeDammit(self.text).unicode_markup
return converted.strip() if converted else ""




+ 1
- 1
src/earwigbot/wiki/copyvios/result.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal


+ 10
- 11
src/earwigbot/wiki/copyvios/search.py View File

@@ -1,4 +1,4 @@
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -18,18 +18,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
from gzip import GzipFile
from io import StringIO
from json import loads
from re import sub as re_sub
from urllib.error import URLError
from urllib.parse import urlencode

from earwigbot import importer
from earwigbot.exceptions import SearchQueryError

lxml = importer.new("lxml")

__all__ = [
"BingSearchEngine",
"GoogleSearchEngine",
@@ -104,7 +101,7 @@ class BingSearchEngine(_BaseSearchEngine):
auth = (key + ":" + key).encode("base64").replace("\n", "")
self.opener.addheaders.append(("Authorization", "Basic " + auth))

def search(self, query):
def search(self, query: str) -> list[str]:
"""Do a Bing web search for *query*.

Returns a list of URLs ranked by relevance (as determined by Bing).
@@ -142,7 +139,7 @@ class GoogleSearchEngine(_BaseSearchEngine):

name = "Google"

def search(self, query):
def search(self, query: str) -> list[str]:
"""Do a Google web search for *query*.

Returns a list of URLs ranked by relevance (as determined by Google).
@@ -153,7 +150,7 @@ class GoogleSearchEngine(_BaseSearchEngine):
params = {
"cx": self.cred["id"],
"key": self.cred["key"],
"q": '"' + query.replace('"', "").encode("utf8") + '"',
"q": '"' + query.replace('"', "") + '"',
"alt": "json",
"num": str(self.count),
"safe": "off",
@@ -183,15 +180,17 @@ class YandexSearchEngine(_BaseSearchEngine):
def requirements():
return ["lxml.etree"]

def search(self, query):
def search(self, query: str) -> list[str]:
"""Do a Yandex web search for *query*.

Returns a list of URLs ranked by relevance (as determined by Yandex).
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
"""
import lxml.etree

domain = self.cred.get("proxy", "yandex.com")
url = f"https://{domain}/search/xml?"
query = re_sub(r"[^a-zA-Z0-9 ]", "", query).encode("utf8")
query = re.sub(r"[^a-zA-Z0-9 ]", "", query)
params = {
"user": self.cred["user"],
"key": self.cred["key"],
@@ -205,7 +204,7 @@ class YandexSearchEngine(_BaseSearchEngine):
result = self._open(url + urlencode(params))

try:
data = lxml.etree.fromstring(result)
data = lxml.etree.fromstring(result) # type: ignore
return [elem.text for elem in data.xpath(".//url")]
except lxml.etree.Error as exc:
raise SearchQueryError("Yandex XML parse error: " + str(exc))


+ 16
- 8
src/earwigbot/wiki/site.py View File

@@ -35,7 +35,7 @@ import requests
from requests.cookies import RequestsCookieJar
from requests_oauthlib import OAuth1

from earwigbot import exceptions, importer
from earwigbot import exceptions
from earwigbot.wiki import constants
from earwigbot.wiki.category import Category
from earwigbot.wiki.constants import Service
@@ -47,7 +47,11 @@ if typing.TYPE_CHECKING:
import pymysql.cursors
from pymysql.cursors import Cursor
else:
pymysql = importer.new("pymysql")
try:
import pymysql
import pymysql.cursors
except ModuleNotFoundError:
pymysql = None

__all__ = ["Site"]

@@ -711,11 +715,11 @@ class Site:
if "autoreconnect" not in args:
args["autoreconnect"] = True

try:
return pymysql.connect(**args)
except ImportError:
e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/"
raise exceptions.SQLError(e)
if pymysql is None:
raise exceptions.SQLError(
"SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/"
)
return pymysql.connect(**args)

def _get_service_order(self) -> list[Service]:
"""
@@ -731,6 +735,10 @@ class Site:
lag is also very high. self.SERVICE_SQL will not be included in the list if we
cannot form a proper SQL connection.
"""
if pymysql is None:
self._sql_info_cache["usable"] = False
return [Service.API]

now = time.time()
if now - self._sql_info_cache["lastcheck"] > 120:
self._sql_info_cache["lastcheck"] = now
@@ -739,7 +747,7 @@ class Site:
self._sql_info_cache["replag"] = sqllag = self.get_replag()
except pymysql.Error as exc:
raise exceptions.SQLError(str(exc))
except (exceptions.SQLError, ImportError):
except exceptions.SQLError:
self._sql_info_cache["usable"] = False
return [Service.API]
self._sql_info_cache["usable"] = True


Loading…
Cancel
Save