Просмотр исходного кода

Python 3.11+ cleanup and bugfixes

tags/v0.4
Ben Kurtovic 7 месяцев назад
Родитель
Сommit
10bc4b3fd4
10 измененных файлов: 335 добавлений и 184 удалений
  1. +3
    -1
      CHANGELOG
  2. +19
    -16
      earwigbot/irc/watcher.py
  3. +8
    -7
      earwigbot/lazy.py
  4. +24
    -18
      earwigbot/managers.py
  5. +1
    -1
      earwigbot/tasks/__init__.py
  6. +58
    -30
      earwigbot/tasks/wikiproject_tagger.py
  7. +12
    -7
      earwigbot/wiki/page.py
  8. +129
    -61
      earwigbot/wiki/site.py
  9. +62
    -24
      earwigbot/wiki/sitesdb.py
  10. +19
    -19
      setup.py

+ 3
- 1
CHANGELOG Просмотреть файл

@@ -1,12 +1,14 @@
v0.4 (unreleased):

- Migrated to Python 3.
- Migrated to Python 3 (3.11+).
- Migrated from oursql to pymysql.
- Copyvios: Configurable proxy support for specific domains.
- Copyvios: Parser-directed URL redirection.
- Copyvios: General parsing improvements.
- Copyvios: URL exclusion improvements.
- Copyvios: Removed long-deprecated Yahoo! BOSS search engine.
- Wiki: Fixed not sending Content-Type header in POST requests.
- IRC: Moved default server from Freenode to Libera.
- IRC: Remember joined channels across restarts.
- IRC: Added !listchans.
- IRC > !stalk: Added modifiers to change message format or filter messages.


+ 19
- 16
earwigbot/irc/watcher.py Просмотреть файл

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -20,12 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import imp
import importlib.machinery
import importlib.util

from earwigbot.irc import IRCConnection, RC

__all__ = ["Watcher"]


class Watcher(IRCConnection):
"""
**EarwigBot: IRC Watcher Component**
@@ -40,16 +42,23 @@ class Watcher(IRCConnection):
def __init__(self, bot):
self.bot = bot
cf = bot.config.irc["watcher"]
super().__init__(cf["host"], cf["port"], cf["nick"], cf["ident"],
cf["realname"], bot.logger.getChild("watcher"))
super().__init__(
cf["host"],
cf["port"],
cf["nick"],
cf["ident"],
cf["realname"],
bot.logger.getChild("watcher"),
)
self._prepare_process_hook()
self._connect()

def __repr__(self):
"""Return the canonical string representation of the Watcher."""
res = "Watcher(host={0!r}, port={1!r}, nick={2!r}, ident={3!r}, realname={4!r}, bot={5!r})"
return res.format(self.host, self.port, self.nick, self.ident,
self.realname, self.bot)
return res.format(
self.host, self.port, self.nick, self.ident, self.realname, self.bot
)

def __str__(self):
"""Return a nice string representation of the Watcher."""
@@ -88,17 +97,11 @@ class Watcher(IRCConnection):
self._process_hook = lambda bot, rc: ()

path = self.bot.config.root_dir
try:
f, path, desc = imp.find_module("rules", [path])
except ImportError:
spec = importlib.machinery.PathFinder.find_spec("rules", [path])
if spec is None or spec.loader is None:
return
try:
module = imp.load_module("rules", f, path, desc)
except Exception:
return
finally:
f.close()

module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
self._process_hook_module = module
try:
self._process_hook = module.process


+ 8
- 7
earwigbot/lazy.py Просмотреть файл

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,6 @@ manner, so that they can be referred to by name but are not actually loaded
until they are used (i.e. their attributes are read or modified).
"""

from imp import acquire_lock, release_lock
import importlib
import sys
from threading import RLock
@@ -36,12 +35,16 @@ from types import ModuleType
__all__ = ["LazyImporter"]

_real_get = ModuleType.__getattribute__
_lazy_init_lock = RLock()


def _create_failing_get(exc):
def _fail(self, attr):
raise exc

return _fail


def _mock_get(self, attr):
with _real_get(self, "_lock"):
if _real_get(self, "_unloaded"):
@@ -59,14 +62,13 @@ def _mock_get(self, attr):

class _LazyModule(type):
def __new__(cls, name):
acquire_lock()
try:
with _lazy_init_lock:
if name not in sys.modules:
attributes = {
"__name__": name,
"__getattribute__": _mock_get,
"_unloaded": True,
"_lock": RLock()
"_lock": RLock(),
}
parents = (ModuleType,)
klass = type.__new__(cls, "module", parents, attributes)
@@ -74,8 +76,6 @@ class _LazyModule(type):
if "." in name: # Also ensure the parent exists
_LazyModule(name.rsplit(".", 1)[0])
return sys.modules[name]
finally:
release_lock()


class LazyImporter:
@@ -84,6 +84,7 @@ class LazyImporter:
This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of
:py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`).
"""

def __init__(self):
self._modules = {}
sys.meta_path.append(self)


+ 24
- 18
earwigbot/managers.py Просмотреть файл

@@ -1,7 +1,7 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import imp
import importlib.machinery
import importlib.util
from os import listdir, path
from re import sub
from threading import RLock, Thread
@@ -32,6 +33,7 @@ from earwigbot.tasks import Task

__all__ = ["CommandManager", "TaskManager"]


class _ResourceManager:
"""
**EarwigBot: Resource Manager**
@@ -48,6 +50,7 @@ class _ResourceManager:
:py:meth:`load`, retrieving specific resources via :py:meth:`get`, and
iterating over all resources via :py:meth:`__iter__`.
"""

def __init__(self, bot, name, base):
self.bot = bot
self.logger = bot.logger.getChild(name)
@@ -60,8 +63,9 @@ class _ResourceManager:
def __repr__(self):
"""Return the canonical string representation of the manager."""
res = "{0}(bot={1!r}, name={2!r}, base={3!r})"
return res.format(self.__class__.__name__, self.bot,
self._resource_name, self._resource_base)
return res.format(
self.__class__.__name__, self.bot, self._resource_name, self._resource_base
)

def __str__(self):
"""Return a nice string representation of the manager."""
@@ -100,22 +104,22 @@ class _ResourceManager:
def _load_module(self, name, path):
"""Load a specific resource from a module, identified by name and path.

We'll first try to import it using imp magic, and if that works, make
instances of any classes inside that are subclasses of the base
We'll first try to import it using importlib magic, and if that works,
make instances of any classes inside that are subclasses of the base
(:py:attr:`self._resource_base <_resource_base>`), add them to the
resources dictionary with :py:meth:`self._load_resource()
<_load_resource>`, and finally log the addition. Any problems along
the way will either be ignored or logged.
"""
f, path, desc = imp.find_module(name, [path])
spec = importlib.machinery.PathFinder.find_spec(name, [path])
try:
module = imp.load_module(name, f, path, desc)
assert spec is not None, "Spec must not be None"
assert spec.loader is not None, "Loader must not be None"
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
except Exception:
e = "Couldn't load module '{0}' (from {1})"
self.logger.exception(e.format(name, path))
self.logger.exception(f"Couldn't load module {name!r} (from {path})")
return
finally:
f.close()

for obj in vars(module).values():
if type(obj) is type:
@@ -132,7 +136,7 @@ class _ResourceManager:
continue
if name.startswith("_") or name.startswith("."):
continue
modname = sub("\.pyc?$", "", name) # Remove extension
modname = sub(r"\.pyc?$", "", name) # Remove extension
if modname in processed:
continue
processed.append(modname)
@@ -200,6 +204,7 @@ class CommandManager(_ResourceManager):
"""
Manages (i.e., loads, reloads, and calls) IRC commands.
"""

def __init__(self, bot):
super().__init__(bot, "commands", Command)

@@ -234,8 +239,7 @@ class CommandManager(_ResourceManager):

for command in self:
if hook in command.hooks and self._wrap_check(command, data):
thread = Thread(target=self._wrap_process,
args=(command, data))
thread = Thread(target=self._wrap_process, args=(command, data))
start_time = strftime("%b %d %H:%M:%S")
thread.name = "irc:{0} ({1})".format(command.name, start_time)
thread.daemon = True
@@ -247,6 +251,7 @@ class TaskManager(_ResourceManager):
"""
Manages (i.e., loads, reloads, schedules, and runs) wiki bot tasks.
"""

def __init__(self, bot):
super().__init__(bot, "tasks", Task)

@@ -292,11 +297,12 @@ class TaskManager(_ResourceManager):
if not now:
now = gmtime()
# Get list of tasks to run this turn:
tasks = self.bot.config.schedule(now.tm_min, now.tm_hour, now.tm_mday,
now.tm_mon, now.tm_wday)
tasks = self.bot.config.schedule(
now.tm_min, now.tm_hour, now.tm_mday, now.tm_mon, now.tm_wday
)

for task in tasks:
if isinstance(task, list): # They've specified kwargs,
if isinstance(task, list): # They've specified kwargs,
self.start(task[0], **task[1]) # so pass those to start
else: # Otherwise, just pass task_name
self.start(task)

+ 1
- 1
earwigbot/tasks/__init__.py Просмотреть файл

@@ -146,7 +146,7 @@ class Task:
try:
content = page.get()
except exceptions.PageNotFoundError:
return False
return True
if content == cfg.get("disabled", "run"):
return False



+ 58
- 30
earwigbot/tasks/wikiproject_tagger.py Просмотреть файл

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ from earwigbot import exceptions
from earwigbot.tasks import Task
from earwigbot.wiki import constants


class WikiProjectTagger(Task):
"""A task to tag talk pages with WikiProject banners.

@@ -76,28 +77,24 @@ class WikiProjectTagger(Task):
edited

"""

name = "wikiproject_tagger"

# Regexes for template names that should always go above the banner, based
# on [[Wikipedia:Talk page layout]]:
TOP_TEMPS = [
r"skip ?to ?(toc|talk|toctalk)$",

r"ga ?nominee$",

r"(user ?)?talk ?(header|page|page ?header)$",

r"community ?article ?probation$",
r"censor(-nudity)?$",
r"blp(o| ?others?)?$",
r"controvers(ial2?|y)$",

r"(not ?(a ?)?)?forum$",
r"tv(episode|series)talk$",
r"recurring ?themes$",
r"faq$",
r"(round ?in ?)?circ(les|ular)$",

r"ar(ti|it)cle ?(history|milestones)$",
r"failed ?ga$",
r"old ?prod( ?full)?$",
@@ -144,10 +141,18 @@ class WikiProjectTagger(Task):
else:
only_with = None

job = _Job(banner=banner, names=names, summary=summary, update=update,
append=append, autoassess=autoassess, only_with=only_with,
nocreate=nocreate, tag_categories=tag_categories,
dry_run=dry_run)
job = _Job(
banner=banner,
names=names,
summary=summary,
update=update,
append=append,
autoassess=autoassess,
only_with=only_with,
nocreate=nocreate,
tag_categories=tag_categories,
dry_run=dry_run,
)

try:
self.run_job(kwargs, site, job, recursive)
@@ -165,7 +170,6 @@ class WikiProjectTagger(Task):
with open(kwargs["file"], "r") as fileobj:
for line in fileobj:
if line.strip():
line = line.decode("utf8")
if line.startswith("[[") and line.endswith("]]"):
line = line[2:-2]
page = site.get_page(line)
@@ -201,8 +205,13 @@ class WikiProjectTagger(Task):
return banner, None

names = {banner, title}
result = site.api_query(action="query", list="backlinks", bllimit=500,
blfilterredir="redirects", bltitle=title)
result = site.api_query(
action="query",
list="backlinks",
bllimit=500,
blfilterredir="redirects",
bltitle=title,
)
for backlink in result["query"]["backlinks"]:
names.add(backlink["title"])
if backlink["ns"] == constants.NS_TEMPLATE:
@@ -215,8 +224,9 @@ class WikiProjectTagger(Task):
def process_category(self, page, job, recursive):
"""Try to tag all pages in the given category."""
if page.title in job.processed_cats:
self.logger.debug("Skipping category, already processed: [[%s]]",
page.title)
self.logger.debug(
"Skipping category, already processed: [[%s]]", page.title
)
return
self.logger.info("Processing category: [[%s]]", page.title)
job.processed_cats.add(page.title)
@@ -243,8 +253,7 @@ class WikiProjectTagger(Task):
page = page.toggle_talk()

if page.title in job.processed_pages:
self.logger.debug("Skipping page, already processed: [[%s]]",
page.title)
self.logger.debug("Skipping page, already processed: [[%s]]", page.title)
return
job.processed_pages.add(page.title)

@@ -275,21 +284,22 @@ class WikiProjectTagger(Task):
return

if job.only_with:
if not any(template.name.matches(job.only_with)
for template in code.ifilter_templates(recursive=True)):
if not any(
template.name.matches(job.only_with)
for template in code.ifilter_templates(recursive=True)
):
log = "Skipping page: [[%s]]; fails only-with condition"
self.logger.info(log, page.title)
return

if is_update:
old_banner = str(banner)
self.update_banner(banner, job, code)
if banner == old_banner:
updated = self.update_banner(banner, job, code)
if not updated:
log = "Skipping page: [[%s]]; already tagged and no updates"
self.logger.info(log, page.title)
return
self.logger.info("Updating banner on page: [[%s]]", page.title)
banner = banner.encode("utf8")
banner = str(banner)
else:
self.logger.info("Tagging page: [[%s]]", page.title)
banner = self.make_banner(job, code)
@@ -334,9 +344,11 @@ class WikiProjectTagger(Task):

def update_banner(self, banner, job, code):
"""Update an existing *banner* based on a *job* and a page's *code*."""
has = lambda key: (banner.has(key) and
banner.get(key).value.strip() not in ("", "?"))
has = lambda key: (
banner.has(key) and banner.get(key).value.strip() not in ("", "?")
)

updated = False
if job.autoassess is not False:
if not has("class"):
assess, reason = self.get_autoassessment(code, job.autoassess)
@@ -349,6 +361,8 @@ class WikiProjectTagger(Task):
key, value = param.split("=", 1)
if not has(key):
banner.add(key, value)
updated = True
return updated

def get_autoassessment(self, code, only_classes=None):
"""Get an autoassessment for a page.
@@ -356,16 +370,27 @@ class WikiProjectTagger(Task):
Return (assessed class as a string or None, assessment reason or None).
"""
if only_classes is None or only_classes is True:
classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga",
"list", "redirect", "start", "stub"]
classnames = [
"a",
"b",
"book",
"c",
"dab",
"fa",
"fl",
"ga",
"list",
"redirect",
"start",
"stub",
]
else:
classnames = [klass.strip().lower()
for klass in only_classes.split(",")]
classnames = [klass.strip().lower() for klass in only_classes.split(",")]

classes = {klass: 0 for klass in classnames}
for template in code.ifilter_templates(recursive=True):
if template.has("class"):
value = str(template.get("class").value).lower()
value = str(template.get("class").value).strip().lower()
if value in classes:
classes[value] += 1

@@ -429,6 +454,7 @@ class WikiProjectTagger(Task):
self.logger.debug("Inserting banner at beginning")
code.insert(0, banner + "\n")


class _Job:
"""Represents a single wikiproject-tagging task.

@@ -436,6 +462,7 @@ class _Job:
or not to autoassess and create new pages from scratch, and a counter of
the number of pages edited.
"""

def __init__(self, **kwargs):
self.banner = kwargs["banner"]
self.names = kwargs["names"]
@@ -456,4 +483,5 @@ class _Job:
class _ShutoffEnabled(Exception):
"""Raised by process_page() if shutoff is enabled. Caught by run(), which
will then stop the task."""

pass

+ 12
- 7
earwigbot/wiki/page.py Просмотреть файл

@@ -280,7 +280,7 @@ class Page(CopyvioMixIn):
self._assert_existence()

def _edit(self, params=None, text=None, summary=None, minor=None, bot=None,
force=None, section=None, captcha_id=None, captcha_word=None):
force=None, section=None, captcha_id=None, captcha_word=None, **kwargs):
"""Edit the page!

If *params* is given, we'll use it as our API query parameters.
@@ -297,7 +297,7 @@ class Page(CopyvioMixIn):
# Build our API query string:
if not params:
params = self._build_edit_params(text, summary, minor, bot, force,
section, captcha_id, captcha_word)
section, captcha_id, captcha_word, kwargs)
else: # Make sure we have the right token:
params["token"] = self.site.get_token()

@@ -320,7 +320,7 @@ class Page(CopyvioMixIn):
raise exceptions.EditError(result["edit"])

def _build_edit_params(self, text, summary, minor, bot, force, section,
captcha_id, captcha_word):
captcha_id, captcha_word, kwargs):
"""Given some keyword arguments, build an API edit query string."""
unitxt = text.encode("utf8") if isinstance(text, str) else text
hashed = md5(unitxt).hexdigest() # Checksum to ensure text is correct
@@ -351,6 +351,11 @@ class Page(CopyvioMixIn):
else:
params["recreate"] = "true"

for key, val in kwargs.items():
if val is None:
params.pop(key, None)
else:
params[key] = val
return params

def _handle_edit_errors(self, error, params, retry=True):
@@ -657,7 +662,7 @@ class Page(CopyvioMixIn):
"""
return mwparserfromhell.parse(self.get())

def edit(self, text, summary, minor=False, bot=True, force=False):
def edit(self, text, summary, minor=False, bot=True, force=False, **kwargs):
"""Replace the page's content or creates a new page.

*text* is the new page content, with *summary* as the edit summary.
@@ -670,9 +675,9 @@ class Page(CopyvioMixIn):
editing our page. Be careful with this!
"""
self._edit(text=text, summary=summary, minor=minor, bot=bot,
force=force)
force=force, **kwargs)

def add_section(self, text, title, minor=False, bot=True, force=False):
def add_section(self, text, title, minor=False, bot=True, force=False, **kwargs):
"""Add a new section to the bottom of the page.

The arguments for this are the same as those for :py:meth:`edit`, but
@@ -683,7 +688,7 @@ class Page(CopyvioMixIn):
new section as content.
"""
self._edit(text=text, summary=title, minor=minor, bot=bot, force=force,
section="new")
section="new", **kwargs)

def check_exclusion(self, username=None, optouts=None):
"""Check whether or not we are allowed to edit the page.


+ 129
- 61
earwigbot/wiki/site.py Просмотреть файл

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -37,10 +37,11 @@ from earwigbot.wiki.category import Category
from earwigbot.wiki.page import Page
from earwigbot.wiki.user import User

oursql = importer.new("oursql")
pymysql = importer.new("pymysql")

__all__ = ["Site"]


class Site:
"""
**EarwigBot: Wiki Toolset: Site**
@@ -80,18 +81,41 @@ class Site:
- :py:meth:`get_user`: returns a User object for the given name
- :py:meth:`delegate`: controls when the API or SQL is used
"""

SERVICE_API = 1
SERVICE_SQL = 2
SPECIAL_TOKENS = ["createaccount", "deleteglobalaccount", "login",
"patrol", "rollback", "setglobalaccountstatus",
"userrights", "watch"]

def __init__(self, name=None, project=None, lang=None, base_url=None,
article_path=None, script_path=None, sql=None,
namespaces=None, login=(None, None), oauth=None,
cookiejar=None, user_agent=None, use_https=True,
assert_edit=None, maxlag=None, wait_between_queries=1,
logger=None, search_config=None):
SPECIAL_TOKENS = [
"createaccount",
"deleteglobalaccount",
"login",
"patrol",
"rollback",
"setglobalaccountstatus",
"userrights",
"watch",
]

def __init__(
self,
name=None,
project=None,
lang=None,
base_url=None,
article_path=None,
script_path=None,
sql=None,
namespaces=None,
login=(None, None),
oauth=None,
cookiejar=None,
user_agent=None,
use_https=True,
assert_edit=None,
maxlag=None,
wait_between_queries=1,
logger=None,
search_config=None,
):
"""Constructor for new Site instances.

This probably isn't necessary to call yourself unless you're building a
@@ -160,8 +184,11 @@ class Site:
self._session.headers["User-Agent"] = user_agent
if oauth:
self._session.auth = OAuth1(
oauth["consumer_token"], oauth["consumer_secret"],
oauth["access_token"], oauth["access_secret"])
oauth["consumer_token"],
oauth["consumer_secret"],
oauth["access_token"],
oauth["access_secret"],
)

# Set up our internal logger:
if logger:
@@ -182,13 +209,24 @@ class Site:

def __repr__(self):
"""Return the canonical string representation of the Site."""
res = ", ".join((
"Site(name={_name!r}", "project={_project!r}", "lang={_lang!r}",
"base_url={_base_url!r}", "article_path={_article_path!r}",
"script_path={_script_path!r}", "use_https={_use_https!r}",
"assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}",
"sql={_sql_data!r}", "login={0}", "oauth={1}", "user_agent={3!r}",
"cookiejar={2})"))
res = ", ".join(
(
"Site(name={_name!r}",
"project={_project!r}",
"lang={_lang!r}",
"base_url={_base_url!r}",
"article_path={_article_path!r}",
"script_path={_script_path!r}",
"use_https={_use_https!r}",
"assert_edit={_assert_edit!r}",
"maxlag={_maxlag!r}",
"sql={_sql_data!r}",
"login={0}",
"oauth={1}",
"user_agent={3!r}",
"cookiejar={2})",
)
)
name, password = self._login_info
login = "({0}, {1})".format(repr(name), "hidden" if password else None)
oauth = "hidden" if self._oauth else None
@@ -211,8 +249,15 @@ class Site:
return value
return str(value, encoding)

def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False,
no_assert=False, ae_retry=True):
def _api_query(
self,
params,
tries=0,
wait=5,
ignore_maxlag=False,
no_assert=False,
ae_retry=True,
):
"""Do an API query with *params* as a dict of parameters.

See the documentation for :py:meth:`api_query` for full implementation
@@ -348,8 +393,14 @@ class Site:
"""
# All attributes to be loaded, except _namespaces, which is a special
# case because it requires additional params in the API query:
attrs = [self._name, self._project, self._lang, self._base_url,
self._article_path, self._script_path]
attrs = [
self._name,
self._project,
self._lang,
self._base_url,
self._article_path,
self._script_path,
]

params = {"action": "query", "meta": "siteinfo", "siprop": "general"}

@@ -359,7 +410,7 @@ class Site:
result = self._api_query(params, no_assert=True)
self._load_namespaces(result)
elif all(attrs): # Everything is already specified and we're not told
return # to force a reload, so do nothing
return # to force a reload, so do nothing
else: # We're only loading attributes other than _namespaces
with self._api_lock:
result = self._api_query(params, no_assert=True)
@@ -424,11 +475,11 @@ class Site:
(for that, we'd do self._login_info[0]), but rather to get our current
username without an unnecessary ?action=query&meta=userinfo API query.
"""
name = ''.join((self._name, "Token"))
name = "".join((self._name, "Token"))
cookie = self._get_cookie(name, self.domain)

if cookie:
name = ''.join((self._name, "UserName"))
name = "".join((self._name, "UserName"))
user_name = self._get_cookie(name, self.domain)
if user_name:
return unquote_plus(user_name.value)
@@ -528,8 +579,12 @@ class Site:
except KeyError:
raise exceptions.LoginError("Couldn't get login token")

params = {"action": "login", "lgname": name, "lgpassword": password,
"lgtoken": token}
params = {
"action": "login",
"lgname": name,
"lgpassword": password,
"lgtoken": token,
}
with self._api_lock:
result = self._api_query(params, no_assert=True)

@@ -564,18 +619,22 @@ class Site:
def _sql_connect(self, **kwargs):
"""Attempt to establish a connection with this site's SQL database.

oursql.connect() will be called with self._sql_data as its kwargs.
pymysql.connect() will be called with self._sql_data as its kwargs.
Any kwargs given to this function will be passed to connect() and will
have precedence over the config file.

Will raise SQLError() if the module "oursql" is not available. oursql
may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot
Will raise SQLError() if the module "pymysql" is not available. pymysql
may raise its own exceptions (e.g. pymysql.InterfaceError) if it cannot
establish a connection.
"""
args = self._sql_data
for key, value in kwargs.items():
args[key] = value
if "read_default_file" not in args and "user" not in args and "passwd" not in args:
if (
"read_default_file" not in args
and "user" not in args
and "passwd" not in args
):
args["read_default_file"] = expanduser("~/.my.cnf")
elif "read_default_file" in args:
args["read_default_file"] = expanduser(args["read_default_file"])
@@ -585,9 +644,9 @@ class Site:
args["autoreconnect"] = True

try:
self._sql_conn = oursql.connect(**args)
self._sql_conn = pymysql.connect(**args)
except ImportError:
e = "SQL querying requires the 'oursql' package: https://pythonhosted.org/oursql/"
e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/"
raise exceptions.SQLError(e)

def _get_service_order(self):
@@ -608,8 +667,11 @@ class Site:
if now - self._sql_info_cache["lastcheck"] > 120:
self._sql_info_cache["lastcheck"] = now
try:
self._sql_info_cache["replag"] = sqllag = self.get_replag()
except (exceptions.SQLError, oursql.Error):
try:
self._sql_info_cache["replag"] = sqllag = self.get_replag()
except pymysql.Error as exc:
raise exceptions.SQLError(str(exc))
except (exceptions.SQLError, ImportError):
self._sql_info_cache["usable"] = False
return [self.SERVICE_API]
self._sql_info_cache["usable"] = True
@@ -705,24 +767,31 @@ class Site:
with self._api_lock:
return self._api_query(kwargs)

def sql_query(self, query, params=(), plain_query=False, dict_cursor=False,
cursor_class=None, show_table=False, buffsize=1024):
def sql_query(
self,
query,
params=(),
plain_query=False,
dict_cursor=False,
cursor_class=None,
buffsize=1024,
):
"""Do an SQL query and yield its results.

If *plain_query* is ``True``, we will force an unparameterized query.
Specifying both *params* and *plain_query* will cause an error. If
*dict_cursor* is ``True``, we will use :py:class:`oursql.DictCursor` as
our cursor, otherwise the default :py:class:`oursql.Cursor`. If
*cursor_class* is given, it will override this option. If *show_table*
is True, the name of the table will be prepended to the name of the
column. This will mainly affect an :py:class:`~oursql.DictCursor`.
*dict_cursor* is ``True``, we will use
:py:class:`pymysql.cursors.DictCursor` as our cursor, otherwise the
default :py:class:`pymysql.cursors.Cursor`. If *cursor_class* is given,
it will override this option.

*buffsize* is the size of each memory-buffered group of results, to
reduce the number of conversations with the database; it is passed to
:py:meth:`cursor.fetchmany() <oursql.Cursor.fetchmany>`. If set to
``0```, all results will be buffered in memory at once (this uses
:py:meth:`fetchall() <oursql.Cursor.fetchall>`). If set to ``1``, it is
equivalent to using :py:meth:`fetchone() <oursql.Cursor.fetchone>`.
:py:meth:`cursor.fetchmany() <pymysql.cursors.Cursor.fetchmany>`. If
set to ``0``, all results will be buffered in memory at once (this
uses :py:meth:`fetchall() <pymysql.cursors.Cursor.fetchall>`). If set
to ``1``, it is equivalent to using
:py:meth:`fetchone() <pymysql.cursors.Cursor.fetchone>`.

Example usage::

@@ -736,25 +805,25 @@ class Site:
{'user_id': 7418060, 'user_registration': '20080703215134'}

This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
:py:exc:`oursql.InterfaceError`, ...) if there were problems with the
pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
:py:exc:`pymysql.InterfaceError`, ...) if there were problems with the
query.

See :py:meth:`_sql_connect` for information on how a connection is
acquired. Also relevant is `oursql's documentation
<https://pythonhosted.org/oursql/>`_ for details on that package.
acquired. Also relevant is `pymysql's documentation
<https://pymysql.readthedocs.io/>`_ for details on that package.
"""
if not cursor_class:
if dict_cursor:
cursor_class = oursql.DictCursor
cursor_class = pymysql.cursors.DictCursor
else:
cursor_class = oursql.Cursor
cursor_class = pymysql.cursors.Cursor
klass = cursor_class

with self._sql_lock:
if not self._sql_conn:
self._sql_connect()
with self._sql_conn.cursor(klass, show_table=show_table) as cur:
with self._sql_conn.cursor(klass) as cur:
cur.execute(query, params, plain_query)
if buffsize:
while True:
@@ -798,8 +867,8 @@ class Site:
time from the timestamp of the latest recent changes event.

This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
:py:exc:`oursql.InterfaceError`, ...) if there were problems.
pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
:py:exc:`pymysql.InterfaceError`, ...) if there were problems.
"""
query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM
recentchanges ORDER BY rc_timestamp DESC LIMIT 1"""
@@ -886,8 +955,7 @@ class Site:
prefix = title.split(":", 1)[0]
if prefix != title: # Avoid a page that is simply "Category"
if prefix in prefixes:
return Category(self, title, follow_redirects, pageid,
self._logger)
return Category(self, title, follow_redirects, pageid, self._logger)
return Page(self, title, follow_redirects, pageid, self._logger)

def get_category(self, catname, follow_redirects=False, pageid=None):
@@ -899,7 +967,7 @@ class Site:
"""
catname = self._unicodeify(catname)
prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
pagename = ':'.join((prefix, catname))
pagename = ":".join((prefix, catname))
return Category(self, pagename, follow_redirects, pageid, self._logger)

def get_user(self, username=None):


+ 62
- 24
earwigbot/wiki/sitesdb.py Просмотреть файл

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -35,6 +35,7 @@ from earwigbot.wiki.site import Site

__all__ = ["SitesDB"]


class SitesDB:
"""
**EarwigBot: Wiki Toolset: Sites Database Manager**
@@ -106,7 +107,7 @@ class SitesDB:
# Create the file and restrict reading/writing only to the
# owner, so others can't peek at our cookies:
open(self._cookie_file, "w").close()
chmod(self._cookie_file, stat.S_IRUSR|stat.S_IWUSR)
chmod(self._cookie_file, stat.S_IRUSR | stat.S_IWUSR)
else:
raise

@@ -172,8 +173,16 @@ class SitesDB:
except KeyError:
namespaces[ns_id] = [ns_name]

return (name, project, lang, base_url, article_path, script_path, sql,
namespaces)
return (
name,
project,
lang,
base_url,
article_path,
script_path,
sql,
namespaces,
)

def _make_site_object(self, name):
"""Return a Site object associated with the site *name* in our sitesdb.
@@ -182,8 +191,9 @@ class SitesDB:
raised if the site is not in our sitesdb.
"""
cookiejar = self._get_cookiejar()
(name, project, lang, base_url, article_path, script_path, sql,
namespaces) = self._load_site_from_sitesdb(name)
(name, project, lang, base_url, article_path, script_path, sql, namespaces) = (
self._load_site_from_sitesdb(name)
)

config = self.config
login = (config.wiki.get("username"), config.wiki.get("password"))
@@ -211,13 +221,26 @@ class SitesDB:
if isinstance(value, str) and "$1" in value:
sql[key] = value.replace("$1", name)

return Site(name=name, project=project, lang=lang, base_url=base_url,
article_path=article_path, script_path=script_path,
sql=sql, namespaces=namespaces, login=login, oauth=oauth,
cookiejar=cookiejar, user_agent=user_agent,
use_https=use_https, assert_edit=assert_edit,
maxlag=maxlag, wait_between_queries=wait_between_queries,
logger=logger, search_config=search_config)
return Site(
name=name,
project=project,
lang=lang,
base_url=base_url,
article_path=article_path,
script_path=script_path,
sql=sql,
namespaces=namespaces,
login=login,
oauth=oauth,
cookiejar=cookiejar,
user_agent=user_agent,
use_https=use_https,
assert_edit=assert_edit,
maxlag=maxlag,
wait_between_queries=wait_between_queries,
logger=logger,
search_config=search_config,
)

def _get_site_name_from_sitesdb(self, project, lang):
"""Return the name of the first site with the given project and lang.
@@ -255,8 +278,14 @@ class SitesDB:
database. If the sitesdb doesn't exist, we'll create it first.
"""
name = site.name
sites_data = (name, site.project, site.lang, site._base_url,
site._article_path, site._script_path)
sites_data = (
name,
site.project,
site.lang,
site._base_url,
site._article_path,
site._script_path,
)
sql_data = [(name, key, val) for key, val in site._sql_data.items()]
ns_data = []
for ns_id, ns_names in site._namespaces.items():
@@ -353,8 +382,9 @@ class SitesDB:
e = "Site '{0}:{1}' not found in the sitesdb.".format(project, lang)
raise SiteNotFoundError(e)

def add_site(self, project=None, lang=None, base_url=None,
script_path="/w", sql=None):
def add_site(
self, project=None, lang=None, base_url=None, script_path="/w", sql=None
):
"""Add a site to the sitesdb so it can be retrieved with get_site().

If only a project and a lang are given, we'll guess the *base_url* as
@@ -368,8 +398,8 @@ class SitesDB:
your wiki is different, provide the script_path as an argument. SQL
connection settings are guessed automatically using config's template
value. If this is wrong or not specified, provide a dict of kwargs as
*sql* and Site will pass it to :py:func:`oursql.connect(**sql)
<oursql.connect>`, allowing you to make queries with
*sql* and Site will pass it to :py:func:`pymysql.connect(**sql)
<pymysql.connect>`, allowing you to make queries with
:py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.

Returns ``True`` if the site was added successfully or ``False`` if the
@@ -399,11 +429,19 @@ class SitesDB:
user_agent = user_agent.replace("$2", python_version())

# Create a Site object to log in and load the other attributes:
site = Site(base_url=base_url, script_path=script_path, sql=sql,
login=login, oauth=oauth, cookiejar=cookiejar,
user_agent=user_agent, use_https=use_https,
assert_edit=assert_edit, maxlag=maxlag,
wait_between_queries=wait_between_queries)
site = Site(
base_url=base_url,
script_path=script_path,
sql=sql,
login=login,
oauth=oauth,
cookiejar=cookiejar,
user_agent=user_agent,
use_https=use_https,
assert_edit=assert_edit,
maxlag=maxlag,
wait_between_queries=wait_between_queries,
)

self._logger.info("Added site '{0}'".format(site.name))
self._add_site_to_sitesdb(site)


+ 19
- 19
setup.py Просмотреть файл

@@ -1,7 +1,7 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -37,11 +37,11 @@ extra_deps = {
"cryptography >= 3.4.7", # Storing bot passwords + keys in the config file
],
"sql": [
"oursql3 >= 0.9.4", # Interfacing with MediaWiki databases
"pymysql >= 1.1.0", # Interfacing with MediaWiki databases
],
"copyvios": [
"beautifulsoup4 >= 4.9.3", # Parsing/scraping HTML
"cchardet >= 2.1.7", # Encoding detection for BeautifulSoup
"charset_normalizer >= 3.3.2", # Encoding detection for BeautifulSoup
"lxml >= 4.6.3", # Faster parser for BeautifulSoup
"nltk >= 3.6.1", # Parsing sentences to split article content
"pdfminer >= 20191125", # Extracting text from PDF files
@@ -58,21 +58,21 @@ with open("README.rst") as fp:
long_docs = fp.read()

setup(
name = "earwigbot",
packages = find_packages(exclude=("tests",)),
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
install_requires = dependencies,
test_suite = "tests",
version = __version__,
author = "Ben Kurtovic",
author_email = "ben.kurtovic@gmail.com",
url = "https://github.com/earwig/earwigbot",
description = "EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.",
long_description = long_docs,
download_url = "https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__),
keywords = "earwig earwigbot irc wikipedia wiki mediawiki",
license = "MIT License",
classifiers = [
name="earwigbot",
packages=find_packages(exclude=("tests",)),
entry_points={"console_scripts": ["earwigbot = earwigbot.util:main"]},
install_requires=dependencies,
test_suite="tests",
version=__version__,
author="Ben Kurtovic",
author_email="ben.kurtovic@gmail.com",
url="https://github.com/earwig/earwigbot",
description="EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.",
long_description=long_docs,
download_url="https://github.com/earwig/earwigbot/tarball/v{0}".format(__version__),
keywords="earwig earwigbot irc wikipedia wiki mediawiki",
license="MIT License",
classifiers=[
"Development Status :: 3 - Alpha",
"Environment :: Console",
"Intended Audience :: Developers",
@@ -81,6 +81,6 @@ setup(
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Topic :: Communications :: Chat :: Internet Relay Chat",
"Topic :: Internet :: WWW/HTTP"
"Topic :: Internet :: WWW/HTTP",
],
)

Загрузка…
Отмена
Сохранить