@@ -1,12 +1,14 @@ | |||
v0.4 (unreleased): | |||
- Migrated to Python 3. | |||
- Migrated to Python 3 (3.11+). | |||
- Migrated from oursql to pymysql. | |||
- Copyvios: Configurable proxy support for specific domains. | |||
- Copyvios: Parser-directed URL redirection. | |||
- Copyvios: General parsing improvements. | |||
- Copyvios: URL exclusion improvements. | |||
- Copyvios: Removed long-deprecated Yahoo! BOSS search engine. | |||
- Wiki: Fixed missing Content-Type header in POST requests. | |||
- IRC: Moved default server from Freenode to Libera. | |||
- IRC: Remember joined channels across restarts. | |||
- IRC: Added !listchans. | |||
- IRC > !stalk: Added modifiers to change message format or filter messages. | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,12 +20,14 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import imp | |||
import importlib.machinery | |||
import importlib.util | |||
from earwigbot.irc import IRCConnection, RC | |||
__all__ = ["Watcher"] | |||
class Watcher(IRCConnection): | |||
""" | |||
**EarwigBot: IRC Watcher Component** | |||
@@ -40,16 +42,23 @@ class Watcher(IRCConnection): | |||
def __init__(self, bot): | |||
self.bot = bot | |||
cf = bot.config.irc["watcher"] | |||
super().__init__(cf["host"], cf["port"], cf["nick"], cf["ident"], | |||
cf["realname"], bot.logger.getChild("watcher")) | |||
super().__init__( | |||
cf["host"], | |||
cf["port"], | |||
cf["nick"], | |||
cf["ident"], | |||
cf["realname"], | |||
bot.logger.getChild("watcher"), | |||
) | |||
self._prepare_process_hook() | |||
self._connect() | |||
def __repr__(self): | |||
"""Return the canonical string representation of the Watcher.""" | |||
res = "Watcher(host={0!r}, port={1!r}, nick={2!r}, ident={3!r}, realname={4!r}, bot={5!r})" | |||
return res.format(self.host, self.port, self.nick, self.ident, | |||
self.realname, self.bot) | |||
return res.format( | |||
self.host, self.port, self.nick, self.ident, self.realname, self.bot | |||
) | |||
def __str__(self): | |||
"""Return a nice string representation of the Watcher.""" | |||
@@ -88,17 +97,11 @@ class Watcher(IRCConnection): | |||
self._process_hook = lambda bot, rc: () | |||
path = self.bot.config.root_dir | |||
try: | |||
f, path, desc = imp.find_module("rules", [path]) | |||
except ImportError: | |||
spec = importlib.machinery.PathFinder.find_spec("rules", [path]) | |||
if spec is None or spec.loader is None: | |||
return | |||
try: | |||
module = imp.load_module("rules", f, path, desc) | |||
except Exception: | |||
return | |||
finally: | |||
f.close() | |||
module = importlib.util.module_from_spec(spec) | |||
spec.loader.exec_module(module) | |||
self._process_hook_module = module | |||
try: | |||
self._process_hook = module.process | |||
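For reference, the importlib-based loading used above follows a general pattern that replaces the removed imp.find_module()/imp.load_module() pair. A minimal standalone sketch (the function name and arguments are illustrative, not part of the codebase):

    import importlib.machinery
    import importlib.util

    def load_module_from_dir(name, directory):
        """Load a module by name from one directory, or return None if absent."""
        spec = importlib.machinery.PathFinder.find_spec(name, [directory])
        if spec is None or spec.loader is None:
            return None  # corresponds to the old ImportError case
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)  # runs the module body, like imp.load_module()
        return module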
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -27,7 +27,6 @@ manner, so that they can be referred to by name but are not actually loaded | |||
until they are used (i.e. their attributes are read or modified). | |||
""" | |||
from imp import acquire_lock, release_lock | |||
import importlib | |||
import sys | |||
from threading import RLock | |||
@@ -36,12 +35,16 @@ from types import ModuleType | |||
__all__ = ["LazyImporter"] | |||
_real_get = ModuleType.__getattribute__ | |||
_lazy_init_lock = RLock() | |||
def _create_failing_get(exc): | |||
def _fail(self, attr): | |||
raise exc | |||
return _fail | |||
def _mock_get(self, attr): | |||
with _real_get(self, "_lock"): | |||
if _real_get(self, "_unloaded"): | |||
@@ -59,14 +62,13 @@ def _mock_get(self, attr): | |||
class _LazyModule(type): | |||
def __new__(cls, name): | |||
acquire_lock() | |||
try: | |||
with _lazy_init_lock: | |||
if name not in sys.modules: | |||
attributes = { | |||
"__name__": name, | |||
"__getattribute__": _mock_get, | |||
"_unloaded": True, | |||
"_lock": RLock() | |||
"_lock": RLock(), | |||
} | |||
parents = (ModuleType,) | |||
klass = type.__new__(cls, "module", parents, attributes) | |||
@@ -74,8 +76,6 @@ class _LazyModule(type): | |||
if "." in name: # Also ensure the parent exists | |||
_LazyModule(name.rsplit(".", 1)[0]) | |||
return sys.modules[name] | |||
finally: | |||
release_lock() | |||
class LazyImporter: | |||
@@ -84,6 +84,7 @@ class LazyImporter: | |||
This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of | |||
:py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`). | |||
""" | |||
def __init__(self): | |||
self._modules = {} | |||
sys.meta_path.append(self) | |||
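In use, the importer registers module names up front and defers the real import until an attribute is first read; a minimal usage sketch, assuming a shared LazyImporter instance like the one earwigbot/wiki/site.py relies on (the exact import path of LazyImporter is not shown in this diff):

    # Hypothetical setup; in the real package a single instance is created
    # at import time and reused everywhere.
    importer = LazyImporter()

    pymysql = importer.new("pymysql")  # registered, but not imported yet

    def open_connection(**kwargs):
        # The first attribute access triggers the actual import; an
        # ImportError only surfaces here if pymysql is truly missing.
        return pymysql.connect(**kwargs)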
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -21,7 +21,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import imp | |||
import importlib.machinery | |||
import importlib.util | |||
from os import listdir, path | |||
from re import sub | |||
from threading import RLock, Thread | |||
@@ -32,6 +33,7 @@ from earwigbot.tasks import Task | |||
__all__ = ["CommandManager", "TaskManager"] | |||
class _ResourceManager: | |||
""" | |||
**EarwigBot: Resource Manager** | |||
@@ -48,6 +50,7 @@ class _ResourceManager: | |||
:py:meth:`load`, retrieving specific resources via :py:meth:`get`, and | |||
iterating over all resources via :py:meth:`__iter__`. | |||
""" | |||
def __init__(self, bot, name, base): | |||
self.bot = bot | |||
self.logger = bot.logger.getChild(name) | |||
@@ -60,8 +63,9 @@ class _ResourceManager: | |||
def __repr__(self): | |||
"""Return the canonical string representation of the manager.""" | |||
res = "{0}(bot={1!r}, name={2!r}, base={3!r})" | |||
return res.format(self.__class__.__name__, self.bot, | |||
self._resource_name, self._resource_base) | |||
return res.format( | |||
self.__class__.__name__, self.bot, self._resource_name, self._resource_base | |||
) | |||
def __str__(self): | |||
"""Return a nice string representation of the manager.""" | |||
@@ -100,22 +104,22 @@ class _ResourceManager: | |||
def _load_module(self, name, path): | |||
"""Load a specific resource from a module, identified by name and path. | |||
We'll first try to import it using imp magic, and if that works, make | |||
instances of any classes inside that are subclasses of the base | |||
We'll first try to import it using importlib magic, and if that works, | |||
make instances of any classes inside that are subclasses of the base | |||
(:py:attr:`self._resource_base <_resource_base>`), add them to the | |||
resources dictionary with :py:meth:`self._load_resource() | |||
<_load_resource>`, and finally log the addition. Any problems along | |||
the way will either be ignored or logged. | |||
""" | |||
f, path, desc = imp.find_module(name, [path]) | |||
spec = importlib.machinery.PathFinder.find_spec(name, [path]) | |||
try: | |||
module = imp.load_module(name, f, path, desc) | |||
assert spec is not None, "Spec must not be None" | |||
assert spec.loader is not None, "Loader must not be None" | |||
module = importlib.util.module_from_spec(spec) | |||
spec.loader.exec_module(module) | |||
except Exception: | |||
e = "Couldn't load module '{0}' (from {1})" | |||
self.logger.exception(e.format(name, path)) | |||
self.logger.exception(f"Couldn't load module {name!r} (from {path})") | |||
return | |||
finally: | |||
f.close() | |||
for obj in vars(module).values(): | |||
if type(obj) is type: | |||
@@ -132,7 +136,7 @@ class _ResourceManager: | |||
continue | |||
if name.startswith("_") or name.startswith("."): | |||
continue | |||
modname = sub("\.pyc?$", "", name) # Remove extension | |||
modname = sub(r"\.pyc?$", "", name) # Remove extension | |||
if modname in processed: | |||
continue | |||
processed.append(modname) | |||
@@ -200,6 +204,7 @@ class CommandManager(_ResourceManager): | |||
""" | |||
Manages (i.e., loads, reloads, and calls) IRC commands. | |||
""" | |||
def __init__(self, bot): | |||
super().__init__(bot, "commands", Command) | |||
@@ -234,8 +239,7 @@ class CommandManager(_ResourceManager): | |||
for command in self: | |||
if hook in command.hooks and self._wrap_check(command, data): | |||
thread = Thread(target=self._wrap_process, | |||
args=(command, data)) | |||
thread = Thread(target=self._wrap_process, args=(command, data)) | |||
start_time = strftime("%b %d %H:%M:%S") | |||
thread.name = "irc:{0} ({1})".format(command.name, start_time) | |||
thread.daemon = True | |||
@@ -247,6 +251,7 @@ class TaskManager(_ResourceManager): | |||
""" | |||
Manages (i.e., loads, reloads, schedules, and runs) wiki bot tasks. | |||
""" | |||
def __init__(self, bot): | |||
super().__init__(bot, "tasks", Task) | |||
@@ -292,11 +297,12 @@ class TaskManager(_ResourceManager): | |||
if not now: | |||
now = gmtime() | |||
# Get list of tasks to run this turn: | |||
tasks = self.bot.config.schedule(now.tm_min, now.tm_hour, now.tm_mday, | |||
now.tm_mon, now.tm_wday) | |||
tasks = self.bot.config.schedule( | |||
now.tm_min, now.tm_hour, now.tm_mday, now.tm_mon, now.tm_wday | |||
) | |||
for task in tasks: | |||
if isinstance(task, list): # They've specified kwargs, | |||
if isinstance(task, list): # They've specified kwargs, | |||
self.start(task[0], **task[1]) # so pass those to start | |||
else: # Otherwise, just pass task_name | |||
self.start(task) |
@@ -146,7 +146,7 @@ class Task: | |||
try: | |||
content = page.get() | |||
except exceptions.PageNotFoundError: | |||
return False | |||
return True | |||
if content == cfg.get("disabled", "run"): | |||
return False | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -26,6 +26,7 @@ from earwigbot import exceptions | |||
from earwigbot.tasks import Task | |||
from earwigbot.wiki import constants | |||
class WikiProjectTagger(Task): | |||
"""A task to tag talk pages with WikiProject banners. | |||
@@ -76,28 +77,24 @@ class WikiProjectTagger(Task): | |||
edited | |||
""" | |||
name = "wikiproject_tagger" | |||
# Regexes for template names that should always go above the banner, based | |||
# on [[Wikipedia:Talk page layout]]: | |||
TOP_TEMPS = [ | |||
r"skip ?to ?(toc|talk|toctalk)$", | |||
r"ga ?nominee$", | |||
r"(user ?)?talk ?(header|page|page ?header)$", | |||
r"community ?article ?probation$", | |||
r"censor(-nudity)?$", | |||
r"blp(o| ?others?)?$", | |||
r"controvers(ial2?|y)$", | |||
r"(not ?(a ?)?)?forum$", | |||
r"tv(episode|series)talk$", | |||
r"recurring ?themes$", | |||
r"faq$", | |||
r"(round ?in ?)?circ(les|ular)$", | |||
r"ar(ti|it)cle ?(history|milestones)$", | |||
r"failed ?ga$", | |||
r"old ?prod( ?full)?$", | |||
@@ -144,10 +141,18 @@ class WikiProjectTagger(Task): | |||
else: | |||
only_with = None | |||
job = _Job(banner=banner, names=names, summary=summary, update=update, | |||
append=append, autoassess=autoassess, only_with=only_with, | |||
nocreate=nocreate, tag_categories=tag_categories, | |||
dry_run=dry_run) | |||
job = _Job( | |||
banner=banner, | |||
names=names, | |||
summary=summary, | |||
update=update, | |||
append=append, | |||
autoassess=autoassess, | |||
only_with=only_with, | |||
nocreate=nocreate, | |||
tag_categories=tag_categories, | |||
dry_run=dry_run, | |||
) | |||
try: | |||
self.run_job(kwargs, site, job, recursive) | |||
@@ -165,7 +170,6 @@ class WikiProjectTagger(Task): | |||
with open(kwargs["file"], "r") as fileobj: | |||
for line in fileobj: | |||
if line.strip(): | |||
line = line.decode("utf8") | |||
if line.startswith("[[") and line.endswith("]]"): | |||
line = line[2:-2] | |||
page = site.get_page(line) | |||
@@ -201,8 +205,13 @@ class WikiProjectTagger(Task): | |||
return banner, None | |||
names = {banner, title} | |||
result = site.api_query(action="query", list="backlinks", bllimit=500, | |||
blfilterredir="redirects", bltitle=title) | |||
result = site.api_query( | |||
action="query", | |||
list="backlinks", | |||
bllimit=500, | |||
blfilterredir="redirects", | |||
bltitle=title, | |||
) | |||
for backlink in result["query"]["backlinks"]: | |||
names.add(backlink["title"]) | |||
if backlink["ns"] == constants.NS_TEMPLATE: | |||
@@ -215,8 +224,9 @@ class WikiProjectTagger(Task): | |||
def process_category(self, page, job, recursive): | |||
"""Try to tag all pages in the given category.""" | |||
if page.title in job.processed_cats: | |||
self.logger.debug("Skipping category, already processed: [[%s]]", | |||
page.title) | |||
self.logger.debug( | |||
"Skipping category, already processed: [[%s]]", page.title | |||
) | |||
return | |||
self.logger.info("Processing category: [[%s]]", page.title) | |||
job.processed_cats.add(page.title) | |||
@@ -243,8 +253,7 @@ class WikiProjectTagger(Task): | |||
page = page.toggle_talk() | |||
if page.title in job.processed_pages: | |||
self.logger.debug("Skipping page, already processed: [[%s]]", | |||
page.title) | |||
self.logger.debug("Skipping page, already processed: [[%s]]", page.title) | |||
return | |||
job.processed_pages.add(page.title) | |||
@@ -275,21 +284,22 @@ class WikiProjectTagger(Task): | |||
return | |||
if job.only_with: | |||
if not any(template.name.matches(job.only_with) | |||
for template in code.ifilter_templates(recursive=True)): | |||
if not any( | |||
template.name.matches(job.only_with) | |||
for template in code.ifilter_templates(recursive=True) | |||
): | |||
log = "Skipping page: [[%s]]; fails only-with condition" | |||
self.logger.info(log, page.title) | |||
return | |||
if is_update: | |||
old_banner = str(banner) | |||
self.update_banner(banner, job, code) | |||
if banner == old_banner: | |||
updated = self.update_banner(banner, job, code) | |||
if not updated: | |||
log = "Skipping page: [[%s]]; already tagged and no updates" | |||
self.logger.info(log, page.title) | |||
return | |||
self.logger.info("Updating banner on page: [[%s]]", page.title) | |||
banner = banner.encode("utf8") | |||
banner = str(banner) | |||
else: | |||
self.logger.info("Tagging page: [[%s]]", page.title) | |||
banner = self.make_banner(job, code) | |||
@@ -334,9 +344,11 @@ class WikiProjectTagger(Task): | |||
def update_banner(self, banner, job, code): | |||
"""Update an existing *banner* based on a *job* and a page's *code*.""" | |||
has = lambda key: (banner.has(key) and | |||
banner.get(key).value.strip() not in ("", "?")) | |||
has = lambda key: ( | |||
banner.has(key) and banner.get(key).value.strip() not in ("", "?") | |||
) | |||
updated = False | |||
if job.autoassess is not False: | |||
if not has("class"): | |||
assess, reason = self.get_autoassessment(code, job.autoassess) | |||
@@ -349,6 +361,8 @@ class WikiProjectTagger(Task): | |||
key, value = param.split("=", 1) | |||
if not has(key): | |||
banner.add(key, value) | |||
updated = True | |||
return updated | |||
def get_autoassessment(self, code, only_classes=None): | |||
"""Get an autoassessment for a page. | |||
@@ -356,16 +370,27 @@ class WikiProjectTagger(Task): | |||
Return (assessed class as a string or None, assessment reason or None). | |||
""" | |||
if only_classes is None or only_classes is True: | |||
classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga", | |||
"list", "redirect", "start", "stub"] | |||
classnames = [ | |||
"a", | |||
"b", | |||
"book", | |||
"c", | |||
"dab", | |||
"fa", | |||
"fl", | |||
"ga", | |||
"list", | |||
"redirect", | |||
"start", | |||
"stub", | |||
] | |||
else: | |||
classnames = [klass.strip().lower() | |||
for klass in only_classes.split(",")] | |||
classnames = [klass.strip().lower() for klass in only_classes.split(",")] | |||
classes = {klass: 0 for klass in classnames} | |||
for template in code.ifilter_templates(recursive=True): | |||
if template.has("class"): | |||
value = str(template.get("class").value).lower() | |||
value = str(template.get("class").value).strip().lower() | |||
if value in classes: | |||
classes[value] += 1 | |||
@@ -429,6 +454,7 @@ class WikiProjectTagger(Task): | |||
self.logger.debug("Inserting banner at beginning") | |||
code.insert(0, banner + "\n") | |||
class _Job: | |||
"""Represents a single wikiproject-tagging task. | |||
@@ -436,6 +462,7 @@ class _Job: | |||
or not to autoassess and create new pages from scratch, and a counter of | |||
the number of pages edited. | |||
""" | |||
def __init__(self, **kwargs): | |||
self.banner = kwargs["banner"] | |||
self.names = kwargs["names"] | |||
@@ -456,4 +483,5 @@ class _Job: | |||
class _ShutoffEnabled(Exception): | |||
"""Raised by process_page() if shutoff is enabled. Caught by run(), which | |||
will then stop the task.""" | |||
pass |
@@ -280,7 +280,7 @@ class Page(CopyvioMixIn): | |||
self._assert_existence() | |||
def _edit(self, params=None, text=None, summary=None, minor=None, bot=None, | |||
force=None, section=None, captcha_id=None, captcha_word=None): | |||
force=None, section=None, captcha_id=None, captcha_word=None, **kwargs): | |||
"""Edit the page! | |||
If *params* is given, we'll use it as our API query parameters. | |||
@@ -297,7 +297,7 @@ class Page(CopyvioMixIn): | |||
# Build our API query string: | |||
if not params: | |||
params = self._build_edit_params(text, summary, minor, bot, force, | |||
section, captcha_id, captcha_word) | |||
section, captcha_id, captcha_word, kwargs) | |||
else: # Make sure we have the right token: | |||
params["token"] = self.site.get_token() | |||
@@ -320,7 +320,7 @@ class Page(CopyvioMixIn): | |||
raise exceptions.EditError(result["edit"]) | |||
def _build_edit_params(self, text, summary, minor, bot, force, section, | |||
captcha_id, captcha_word): | |||
captcha_id, captcha_word, kwargs): | |||
"""Given some keyword arguments, build an API edit query string.""" | |||
unitxt = text.encode("utf8") if isinstance(text, str) else text | |||
hashed = md5(unitxt).hexdigest() # Checksum to ensure text is correct | |||
@@ -351,6 +351,11 @@ class Page(CopyvioMixIn): | |||
else: | |||
params["recreate"] = "true" | |||
for key, val in kwargs.items(): | |||
if val is None: | |||
params.pop(key, None) | |||
else: | |||
params[key] = val | |||
return params | |||
def _handle_edit_errors(self, error, params, retry=True): | |||
@@ -657,7 +662,7 @@ class Page(CopyvioMixIn): | |||
""" | |||
return mwparserfromhell.parse(self.get()) | |||
def edit(self, text, summary, minor=False, bot=True, force=False): | |||
def edit(self, text, summary, minor=False, bot=True, force=False, **kwargs): | |||
"""Replace the page's content or creates a new page. | |||
*text* is the new page content, with *summary* as the edit summary. | |||
@@ -670,9 +675,9 @@ class Page(CopyvioMixIn): | |||
editing our page. Be careful with this! | |||
""" | |||
self._edit(text=text, summary=summary, minor=minor, bot=bot, | |||
force=force) | |||
force=force, **kwargs) | |||
def add_section(self, text, title, minor=False, bot=True, force=False): | |||
def add_section(self, text, title, minor=False, bot=True, force=False, **kwargs): | |||
"""Add a new section to the bottom of the page. | |||
The arguments for this are the same as those for :py:meth:`edit`, but | |||
@@ -683,7 +688,7 @@ class Page(CopyvioMixIn): | |||
new section as content. | |||
""" | |||
self._edit(text=text, summary=title, minor=minor, bot=bot, force=force, | |||
section="new") | |||
section="new", **kwargs) | |||
def check_exclusion(self, username=None, optouts=None): | |||
"""Check whether or not we are allowed to edit the page. | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -37,10 +37,11 @@ from earwigbot.wiki.category import Category | |||
from earwigbot.wiki.page import Page | |||
from earwigbot.wiki.user import User | |||
oursql = importer.new("oursql") | |||
pymysql = importer.new("pymysql") | |||
__all__ = ["Site"] | |||
class Site: | |||
""" | |||
**EarwigBot: Wiki Toolset: Site** | |||
@@ -80,18 +81,41 @@ class Site: | |||
- :py:meth:`get_user`: returns a User object for the given name | |||
- :py:meth:`delegate`: controls when the API or SQL is used | |||
""" | |||
SERVICE_API = 1 | |||
SERVICE_SQL = 2 | |||
SPECIAL_TOKENS = ["createaccount", "deleteglobalaccount", "login", | |||
"patrol", "rollback", "setglobalaccountstatus", | |||
"userrights", "watch"] | |||
def __init__(self, name=None, project=None, lang=None, base_url=None, | |||
article_path=None, script_path=None, sql=None, | |||
namespaces=None, login=(None, None), oauth=None, | |||
cookiejar=None, user_agent=None, use_https=True, | |||
assert_edit=None, maxlag=None, wait_between_queries=1, | |||
logger=None, search_config=None): | |||
SPECIAL_TOKENS = [ | |||
"createaccount", | |||
"deleteglobalaccount", | |||
"login", | |||
"patrol", | |||
"rollback", | |||
"setglobalaccountstatus", | |||
"userrights", | |||
"watch", | |||
] | |||
def __init__( | |||
self, | |||
name=None, | |||
project=None, | |||
lang=None, | |||
base_url=None, | |||
article_path=None, | |||
script_path=None, | |||
sql=None, | |||
namespaces=None, | |||
login=(None, None), | |||
oauth=None, | |||
cookiejar=None, | |||
user_agent=None, | |||
use_https=True, | |||
assert_edit=None, | |||
maxlag=None, | |||
wait_between_queries=1, | |||
logger=None, | |||
search_config=None, | |||
): | |||
"""Constructor for new Site instances. | |||
This probably isn't necessary to call yourself unless you're building a | |||
@@ -160,8 +184,11 @@ class Site: | |||
self._session.headers["User-Agent"] = user_agent | |||
if oauth: | |||
self._session.auth = OAuth1( | |||
oauth["consumer_token"], oauth["consumer_secret"], | |||
oauth["access_token"], oauth["access_secret"]) | |||
oauth["consumer_token"], | |||
oauth["consumer_secret"], | |||
oauth["access_token"], | |||
oauth["access_secret"], | |||
) | |||
# Set up our internal logger: | |||
if logger: | |||
@@ -182,13 +209,24 @@ class Site: | |||
def __repr__(self): | |||
"""Return the canonical string representation of the Site.""" | |||
res = ", ".join(( | |||
"Site(name={_name!r}", "project={_project!r}", "lang={_lang!r}", | |||
"base_url={_base_url!r}", "article_path={_article_path!r}", | |||
"script_path={_script_path!r}", "use_https={_use_https!r}", | |||
"assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}", | |||
"sql={_sql_data!r}", "login={0}", "oauth={1}", "user_agent={3!r}", | |||
"cookiejar={2})")) | |||
res = ", ".join( | |||
( | |||
"Site(name={_name!r}", | |||
"project={_project!r}", | |||
"lang={_lang!r}", | |||
"base_url={_base_url!r}", | |||
"article_path={_article_path!r}", | |||
"script_path={_script_path!r}", | |||
"use_https={_use_https!r}", | |||
"assert_edit={_assert_edit!r}", | |||
"maxlag={_maxlag!r}", | |||
"sql={_sql_data!r}", | |||
"login={0}", | |||
"oauth={1}", | |||
"user_agent={3!r}", | |||
"cookiejar={2})", | |||
) | |||
) | |||
name, password = self._login_info | |||
login = "({0}, {1})".format(repr(name), "hidden" if password else None) | |||
oauth = "hidden" if self._oauth else None | |||
@@ -211,8 +249,15 @@ class Site: | |||
return value | |||
return str(value, encoding) | |||
def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False, | |||
no_assert=False, ae_retry=True): | |||
def _api_query( | |||
self, | |||
params, | |||
tries=0, | |||
wait=5, | |||
ignore_maxlag=False, | |||
no_assert=False, | |||
ae_retry=True, | |||
): | |||
"""Do an API query with *params* as a dict of parameters. | |||
See the documentation for :py:meth:`api_query` for full implementation | |||
@@ -348,8 +393,14 @@ class Site: | |||
""" | |||
# All attributes to be loaded, except _namespaces, which is a special | |||
# case because it requires additional params in the API query: | |||
attrs = [self._name, self._project, self._lang, self._base_url, | |||
self._article_path, self._script_path] | |||
attrs = [ | |||
self._name, | |||
self._project, | |||
self._lang, | |||
self._base_url, | |||
self._article_path, | |||
self._script_path, | |||
] | |||
params = {"action": "query", "meta": "siteinfo", "siprop": "general"} | |||
@@ -359,7 +410,7 @@ class Site: | |||
result = self._api_query(params, no_assert=True) | |||
self._load_namespaces(result) | |||
elif all(attrs): # Everything is already specified and we're not told | |||
return # to force a reload, so do nothing | |||
return # to force a reload, so do nothing | |||
else: # We're only loading attributes other than _namespaces | |||
with self._api_lock: | |||
result = self._api_query(params, no_assert=True) | |||
@@ -424,11 +475,11 @@ class Site: | |||
(for that, we'd do self._login_info[0]), but rather to get our current | |||
username without an unnecessary ?action=query&meta=userinfo API query. | |||
""" | |||
name = ''.join((self._name, "Token")) | |||
name = "".join((self._name, "Token")) | |||
cookie = self._get_cookie(name, self.domain) | |||
if cookie: | |||
name = ''.join((self._name, "UserName")) | |||
name = "".join((self._name, "UserName")) | |||
user_name = self._get_cookie(name, self.domain) | |||
if user_name: | |||
return unquote_plus(user_name.value) | |||
@@ -528,8 +579,12 @@ class Site: | |||
except KeyError: | |||
raise exceptions.LoginError("Couldn't get login token") | |||
params = {"action": "login", "lgname": name, "lgpassword": password, | |||
"lgtoken": token} | |||
params = { | |||
"action": "login", | |||
"lgname": name, | |||
"lgpassword": password, | |||
"lgtoken": token, | |||
} | |||
with self._api_lock: | |||
result = self._api_query(params, no_assert=True) | |||
@@ -564,18 +619,22 @@ class Site: | |||
def _sql_connect(self, **kwargs): | |||
"""Attempt to establish a connection with this site's SQL database. | |||
oursql.connect() will be called with self._sql_data as its kwargs. | |||
pymysql.connect() will be called with self._sql_data as its kwargs. | |||
Any kwargs given to this function will be passed to connect() and will | |||
have precedence over the config file. | |||
Will raise SQLError() if the module "oursql" is not available. oursql | |||
may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot | |||
Will raise SQLError() if the module "pymysql" is not available. pymysql | |||
may raise its own exceptions (e.g. pymysql.InterfaceError) if it cannot | |||
establish a connection. | |||
""" | |||
args = self._sql_data | |||
for key, value in kwargs.items(): | |||
args[key] = value | |||
if "read_default_file" not in args and "user" not in args and "passwd" not in args: | |||
if ( | |||
"read_default_file" not in args | |||
and "user" not in args | |||
and "passwd" not in args | |||
): | |||
args["read_default_file"] = expanduser("~/.my.cnf") | |||
elif "read_default_file" in args: | |||
args["read_default_file"] = expanduser(args["read_default_file"]) | |||
@@ -585,9 +644,9 @@ class Site: | |||
args["autoreconnect"] = True | |||
try: | |||
self._sql_conn = oursql.connect(**args) | |||
self._sql_conn = pymysql.connect(**args) | |||
except ImportError: | |||
e = "SQL querying requires the 'oursql' package: https://pythonhosted.org/oursql/" | |||
e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/" | |||
raise exceptions.SQLError(e) | |||
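The settings that _sql_connect() assembles ultimately feed pymysql.connect(); a rough equivalent written as a direct call, with illustrative connection values (the keys a given deployment uses may differ):

    from os.path import expanduser

    import pymysql
    import pymysql.cursors

    conn = pymysql.connect(
        db="enwiki_p",                                   # illustrative database
        host="enwiki.analytics.db.svc.wikimedia.cloud",  # illustrative host
        read_default_file=expanduser("~/.my.cnf"),       # credentials, as above
        cursorclass=pymysql.cursors.DictCursor,
    )
    with conn.cursor() as cur:
        cur.execute("SELECT 1")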
def _get_service_order(self): | |||
@@ -608,8 +667,11 @@ class Site: | |||
if now - self._sql_info_cache["lastcheck"] > 120: | |||
self._sql_info_cache["lastcheck"] = now | |||
try: | |||
self._sql_info_cache["replag"] = sqllag = self.get_replag() | |||
except (exceptions.SQLError, oursql.Error): | |||
try: | |||
self._sql_info_cache["replag"] = sqllag = self.get_replag() | |||
except pymysql.Error as exc: | |||
raise exceptions.SQLError(str(exc)) | |||
except (exceptions.SQLError, ImportError): | |||
self._sql_info_cache["usable"] = False | |||
return [self.SERVICE_API] | |||
self._sql_info_cache["usable"] = True | |||
@@ -705,24 +767,31 @@ class Site: | |||
with self._api_lock: | |||
return self._api_query(kwargs) | |||
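For comparison with the SQL path below, a typical api_query() call passes standard MediaWiki API parameters as keyword arguments; a brief hedged sketch:

    site = bot.wiki.get_site()  # assumed: default site from the bot's config

    # Keyword arguments become the API query; these are standard
    # action=query parameters, not EarwigBot-specific options.
    result = site.api_query(action="query", meta="userinfo")
    print(result["query"]["userinfo"]["name"])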
def sql_query(self, query, params=(), plain_query=False, dict_cursor=False, | |||
cursor_class=None, show_table=False, buffsize=1024): | |||
def sql_query( | |||
self, | |||
query, | |||
params=(), | |||
plain_query=False, | |||
dict_cursor=False, | |||
cursor_class=None, | |||
buffsize=1024, | |||
): | |||
"""Do an SQL query and yield its results. | |||
If *plain_query* is ``True``, we will force an unparameterized query. | |||
Specifying both *params* and *plain_query* will cause an error. If | |||
*dict_cursor* is ``True``, we will use :py:class:`oursql.DictCursor` as | |||
our cursor, otherwise the default :py:class:`oursql.Cursor`. If | |||
*cursor_class* is given, it will override this option. If *show_table* | |||
is True, the name of the table will be prepended to the name of the | |||
column. This will mainly affect an :py:class:`~oursql.DictCursor`. | |||
*dict_cursor* is ``True``, we will use | |||
:py:class:`pymysql.cursors.DictCursor` as our cursor, otherwise the | |||
default :py:class:`pymysql.cursors.Cursor`. If *cursor_class* is given, | |||
it will override this option. | |||
*buffsize* is the size of each memory-buffered group of results, to | |||
reduce the number of conversations with the database; it is passed to | |||
:py:meth:`cursor.fetchmany() <oursql.Cursor.fetchmany>`. If set to | |||
``0```, all results will be buffered in memory at once (this uses | |||
:py:meth:`fetchall() <oursql.Cursor.fetchall>`). If set to ``1``, it is | |||
equivalent to using :py:meth:`fetchone() <oursql.Cursor.fetchone>`. | |||
:py:meth:`cursor.fetchmany() <pymysql.cursors.Cursor.fetchmany>`. If | |||
set to ``0``, all results will be buffered in memory at once (this | |||
uses :py:meth:`fetchall() <pymysql.cursors.Cursor.fetchall>`). If set | |||
to ``1``, it is equivalent to using | |||
:py:meth:`fetchone() <pymysql.cursors.Cursor.fetchone>`. | |||
Example usage:: | |||
@@ -736,25 +805,25 @@ class Site: | |||
{'user_id': 7418060, 'user_registration': '20080703215134'} | |||
This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of | |||
oursql's exceptions (:py:exc:`oursql.ProgrammingError`, | |||
:py:exc:`oursql.InterfaceError`, ...) if there were problems with the | |||
pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`, | |||
:py:exc:`pymysql.InterfaceError`, ...) if there were problems with the | |||
query. | |||
See :py:meth:`_sql_connect` for information on how a connection is | |||
acquired. Also relevant is `oursql's documentation | |||
<https://pythonhosted.org/oursql/>`_ for details on that package. | |||
acquired. Also relevant is `pymysql's documentation | |||
<https://pymysql.readthedocs.io/>`_ for details on that package. | |||
""" | |||
if not cursor_class: | |||
if dict_cursor: | |||
cursor_class = oursql.DictCursor | |||
cursor_class = pymysql.cursors.DictCursor | |||
else: | |||
cursor_class = oursql.Cursor | |||
cursor_class = pymysql.cursors.Cursor | |||
klass = cursor_class | |||
with self._sql_lock: | |||
if not self._sql_conn: | |||
self._sql_connect() | |||
with self._sql_conn.cursor(klass, show_table=show_table) as cur: | |||
with self._sql_conn.cursor(klass) as cur: | |||
cur.execute(query, params, plain_query) | |||
if buffsize: | |||
while True: | |||
@@ -798,8 +867,8 @@ class Site: | |||
time from the timestamp of the latest recent changes event. | |||
This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of | |||
oursql's exceptions (:py:exc:`oursql.ProgrammingError`, | |||
:py:exc:`oursql.InterfaceError`, ...) if there were problems. | |||
pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`, | |||
:py:exc:`pymysql.InterfaceError`, ...) if there were problems. | |||
""" | |||
query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM | |||
recentchanges ORDER BY rc_timestamp DESC LIMIT 1""" | |||
@@ -886,8 +955,7 @@ class Site: | |||
prefix = title.split(":", 1)[0] | |||
if prefix != title: # Avoid a page that is simply "Category" | |||
if prefix in prefixes: | |||
return Category(self, title, follow_redirects, pageid, | |||
self._logger) | |||
return Category(self, title, follow_redirects, pageid, self._logger) | |||
return Page(self, title, follow_redirects, pageid, self._logger) | |||
def get_category(self, catname, follow_redirects=False, pageid=None): | |||
@@ -899,7 +967,7 @@ class Site: | |||
""" | |||
catname = self._unicodeify(catname) | |||
prefix = self.namespace_id_to_name(constants.NS_CATEGORY) | |||
pagename = ':'.join((prefix, catname)) | |||
pagename = ":".join((prefix, catname)) | |||
return Category(self, pagename, follow_redirects, pageid, self._logger) | |||
def get_user(self, username=None): | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -35,6 +35,7 @@ from earwigbot.wiki.site import Site | |||
__all__ = ["SitesDB"] | |||
class SitesDB: | |||
""" | |||
**EarwigBot: Wiki Toolset: Sites Database Manager** | |||
@@ -106,7 +107,7 @@ class SitesDB: | |||
# Create the file and restrict reading/writing only to the | |||
# owner, so others can't peek at our cookies: | |||
open(self._cookie_file, "w").close() | |||
chmod(self._cookie_file, stat.S_IRUSR|stat.S_IWUSR) | |||
chmod(self._cookie_file, stat.S_IRUSR | stat.S_IWUSR) | |||
else: | |||
raise | |||
@@ -172,8 +173,16 @@ class SitesDB: | |||
except KeyError: | |||
namespaces[ns_id] = [ns_name] | |||
return (name, project, lang, base_url, article_path, script_path, sql, | |||
namespaces) | |||
return ( | |||
name, | |||
project, | |||
lang, | |||
base_url, | |||
article_path, | |||
script_path, | |||
sql, | |||
namespaces, | |||
) | |||
def _make_site_object(self, name): | |||
"""Return a Site object associated with the site *name* in our sitesdb. | |||
@@ -182,8 +191,9 @@ class SitesDB: | |||
raised if the site is not in our sitesdb. | |||
""" | |||
cookiejar = self._get_cookiejar() | |||
(name, project, lang, base_url, article_path, script_path, sql, | |||
namespaces) = self._load_site_from_sitesdb(name) | |||
(name, project, lang, base_url, article_path, script_path, sql, namespaces) = ( | |||
self._load_site_from_sitesdb(name) | |||
) | |||
config = self.config | |||
login = (config.wiki.get("username"), config.wiki.get("password")) | |||
@@ -211,13 +221,26 @@ class SitesDB: | |||
if isinstance(value, str) and "$1" in value: | |||
sql[key] = value.replace("$1", name) | |||
return Site(name=name, project=project, lang=lang, base_url=base_url, | |||
article_path=article_path, script_path=script_path, | |||
sql=sql, namespaces=namespaces, login=login, oauth=oauth, | |||
cookiejar=cookiejar, user_agent=user_agent, | |||
use_https=use_https, assert_edit=assert_edit, | |||
maxlag=maxlag, wait_between_queries=wait_between_queries, | |||
logger=logger, search_config=search_config) | |||
return Site( | |||
name=name, | |||
project=project, | |||
lang=lang, | |||
base_url=base_url, | |||
article_path=article_path, | |||
script_path=script_path, | |||
sql=sql, | |||
namespaces=namespaces, | |||
login=login, | |||
oauth=oauth, | |||
cookiejar=cookiejar, | |||
user_agent=user_agent, | |||
use_https=use_https, | |||
assert_edit=assert_edit, | |||
maxlag=maxlag, | |||
wait_between_queries=wait_between_queries, | |||
logger=logger, | |||
search_config=search_config, | |||
) | |||
def _get_site_name_from_sitesdb(self, project, lang): | |||
"""Return the name of the first site with the given project and lang. | |||
@@ -255,8 +278,14 @@ class SitesDB: | |||
database. If the sitesdb doesn't exist, we'll create it first. | |||
""" | |||
name = site.name | |||
sites_data = (name, site.project, site.lang, site._base_url, | |||
site._article_path, site._script_path) | |||
sites_data = ( | |||
name, | |||
site.project, | |||
site.lang, | |||
site._base_url, | |||
site._article_path, | |||
site._script_path, | |||
) | |||
sql_data = [(name, key, val) for key, val in site._sql_data.items()] | |||
ns_data = [] | |||
for ns_id, ns_names in site._namespaces.items(): | |||
@@ -353,8 +382,9 @@ class SitesDB: | |||
e = "Site '{0}:{1}' not found in the sitesdb.".format(project, lang) | |||
raise SiteNotFoundError(e) | |||
def add_site(self, project=None, lang=None, base_url=None, | |||
script_path="/w", sql=None): | |||
def add_site( | |||
self, project=None, lang=None, base_url=None, script_path="/w", sql=None | |||
): | |||
"""Add a site to the sitesdb so it can be retrieved with get_site(). | |||
If only a project and a lang are given, we'll guess the *base_url* as | |||
@@ -368,8 +398,8 @@ class SitesDB: | |||
your wiki is different, provide the script_path as an argument. SQL | |||
connection settings are guessed automatically using config's template | |||
value. If this is wrong or not specified, provide a dict of kwargs as | |||
*sql* and Site will pass it to :py:func:`oursql.connect(**sql) | |||
<oursql.connect>`, allowing you to make queries with | |||
*sql* and Site will pass it to :py:func:`pymysql.connect(**sql) | |||
<pymysql.connect>`, allowing you to make queries with | |||
:py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`. | |||
Returns ``True`` if the site was added successfully or ``False`` if the | |||
@@ -399,11 +429,19 @@ class SitesDB: | |||
user_agent = user_agent.replace("$2", python_version()) | |||
# Create a Site object to log in and load the other attributes: | |||
site = Site(base_url=base_url, script_path=script_path, sql=sql, | |||
login=login, oauth=oauth, cookiejar=cookiejar, | |||
user_agent=user_agent, use_https=use_https, | |||
assert_edit=assert_edit, maxlag=maxlag, | |||
wait_between_queries=wait_between_queries) | |||
site = Site( | |||
base_url=base_url, | |||
script_path=script_path, | |||
sql=sql, | |||
login=login, | |||
oauth=oauth, | |||
cookiejar=cookiejar, | |||
user_agent=user_agent, | |||
use_https=use_https, | |||
assert_edit=assert_edit, | |||
maxlag=maxlag, | |||
wait_between_queries=wait_between_queries, | |||
) | |||
self._logger.info("Added site '{0}'".format(site.name)) | |||
self._add_site_to_sitesdb(site) | |||
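A hedged usage sketch of add_site() together with get_site(), assuming an English Wikipedia deployment (the site name reported by the wiki is typically "enwiki"; other wikis will differ):

    # Register the site once; afterwards it can be fetched by name or by
    # project/lang pair.
    bot.wiki.add_site(project="wikipedia", lang="en")

    site = bot.wiki.get_site(project="wikipedia", lang="en")
    print(site.name)  # e.g. "enwiki"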
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -37,11 +37,11 @@ extra_deps = { | |||
"cryptography >= 3.4.7", # Storing bot passwords + keys in the config file | |||
], | |||
"sql": [ | |||
"oursql3 >= 0.9.4", # Interfacing with MediaWiki databases | |||
"pymysql >= 1.1.0", # Interfacing with MediaWiki databases | |||
], | |||
"copyvios": [ | |||
"beautifulsoup4 >= 4.9.3", # Parsing/scraping HTML | |||
"cchardet >= 2.1.7", # Encoding detection for BeautifulSoup | |||
"charset_normalizer >= 3.3.2", # Encoding detection for BeautifulSoup | |||
"lxml >= 4.6.3", # Faster parser for BeautifulSoup | |||
"nltk >= 3.6.1", # Parsing sentences to split article content | |||
"pdfminer >= 20191125", # Extracting text from PDF files | |||
@@ -58,21 +58,21 @@ with open("README.rst") as fp: | |||
long_docs = fp.read() | |||
setup( | |||
name = "earwigbot", | |||
packages = find_packages(exclude=("tests",)), | |||
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, | |||
install_requires = dependencies, | |||
test_suite = "tests", | |||
version = __version__, | |||
author = "Ben Kurtovic", | |||
author_email = "ben.kurtovic@gmail.com", | |||
url = "https://github.com/earwig/earwigbot", | |||
description = "EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.", | |||
long_description = long_docs, | |||
download_url = "https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__), | |||
keywords = "earwig earwigbot irc wikipedia wiki mediawiki", | |||
license = "MIT License", | |||
classifiers = [ | |||
name="earwigbot", | |||
packages=find_packages(exclude=("tests",)), | |||
entry_points={"console_scripts": ["earwigbot = earwigbot.util:main"]}, | |||
install_requires=dependencies, | |||
test_suite="tests", | |||
version=__version__, | |||
author="Ben Kurtovic", | |||
author_email="ben.kurtovic@gmail.com", | |||
url="https://github.com/earwig/earwigbot", | |||
description="EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.", | |||
long_description=long_docs, | |||
download_url="https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__), | |||
keywords="earwig earwigbot irc wikipedia wiki mediawiki", | |||
license="MIT License", | |||
classifiers=[ | |||
"Development Status :: 3 - Alpha", | |||
"Environment :: Console", | |||
"Intended Audience :: Developers", | |||
@@ -81,6 +81,6 @@ setup( | |||
"Operating System :: OS Independent", | |||
"Programming Language :: Python :: 3", | |||
"Topic :: Communications :: Chat :: Internet Relay Chat", | |||
"Topic :: Internet :: WWW/HTTP" | |||
"Topic :: Internet :: WWW/HTTP", | |||
], | |||
) |