Browse Source

Make cat.get_members() an iterator; make page.exists output nicer; cleanup

tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
dcf912b65b
7 changed files with 132 additions and 86 deletions
  1. +9
    -6
      docs/toolset.rst
  2. +1
    -1
      earwigbot/commands/afc_pending.py
  3. +1
    -1
      earwigbot/commands/afc_report.py
  4. +53
    -29
      earwigbot/wiki/category.py
  5. +46
    -43
      earwigbot/wiki/page.py
  6. +16
    -6
      earwigbot/wiki/site.py
  7. +6
    -0
      earwigbot/wiki/user.py

+ 9
- 6
docs/toolset.rst View File

@@ -97,11 +97,11 @@ and the following methods:
- :py:meth:`namespace_name_to_id(name)
<earwigbot.wiki.site.Site.namespace_name_to_id>`: given a namespace name,
returns the associated namespace ID
- :py:meth:`get_page(title, follow_redirects=False)
- :py:meth:`get_page(title, follow_redirects=False, ...)
<earwigbot.wiki.site.Site.get_page>`: returns a ``Page`` object for the given
title (or a :py:class:`~earwigbot.wiki.category.Category` object if the
page's namespace is "``Category:``")
- :py:meth:`get_category(catname, follow_redirects=False)
- :py:meth:`get_category(catname, follow_redirects=False, ...)
<earwigbot.wiki.site.Site.get_category>`: returns a ``Category`` object for
the given title (sans namespace)
- :py:meth:`get_user(username) <earwigbot.wiki.site.Site.get_user>`: returns a
@@ -120,7 +120,7 @@ provide the following attributes:
- :py:attr:`~earwigbot.wiki.page.Page.site`: the page's corresponding
:py:class:`~earwigbot.wiki.site.Site` object
- :py:attr:`~earwigbot.wiki.page.Page.title`: the page's title, or pagename
- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether the page exists
- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether or not the page exists
- :py:attr:`~earwigbot.wiki.page.Page.pageid`: an integer ID representing the
page
- :py:attr:`~earwigbot.wiki.page.Page.url`: the page's URL
@@ -166,9 +166,10 @@ or :py:meth:`site.get_page(title) <earwigbot.wiki.site.Site.get_page>` where
``title`` is in the ``Category:`` namespace) provide the following additional
method:

- :py:meth:`get_members(use_sql=False, limit=None)
<earwigbot.wiki.category.Category.get_members>`: returns a list of page
titles in the category (limit is ``50`` by default if using the API)
- :py:meth:`get_members(use_sql=False, limit=None, ...)
<earwigbot.wiki.category.Category.get_members>`: iterates over
:py:class:`~earwigbot.wiki.page.Page`\ s in the category, until either the
category is exhausted or (if given) ``limit`` is reached

Users
~~~~~
@@ -178,6 +179,8 @@ Create :py:class:`earwigbot.wiki.User <earwigbot.wiki.user.User>` objects with
:py:meth:`page.get_creator() <earwigbot.wiki.page.Page.get_creator>`. They
provide the following attributes:

- :py:attr:`~earwigbot.wiki.user.User.site`: the user's corresponding
:py:class:`~earwigbot.wiki.site.Site` object
- :py:attr:`~earwigbot.wiki.user.User.name`: the user's username
- :py:attr:`~earwigbot.wiki.user.User.exists`: ``True`` if the user exists, or
``False`` if they do not


+ 1
- 1
earwigbot/commands/afc_pending.py View File

@@ -23,7 +23,7 @@
from earwigbot.commands import BaseCommand

class Command(BaseCommand):
"""Links the user to the pending AFC submissions page and category."""
"""Link the user to the pending AFC submissions page and category."""
name = "pending"

def check(self, data):


+ 1
- 1
earwigbot/commands/afc_report.py View File

@@ -70,7 +70,7 @@ class Command(BaseCommand):

def get_page(self, title):
page = self.site.get_page(title, follow_redirects=False)
if page.exists[0]:
if page.exists == page.PAGE_EXISTS:
return page

def report(self, page):


+ 53
- 29
earwigbot/wiki/category.py View File

@@ -39,7 +39,7 @@ class Category(Page):

*Public methods:*

- :py:meth:`get_members`: returns a list of page titles in the category
- :py:meth:`get_members`: iterates over Pages in the category
"""

def __repr__(self):
@@ -51,8 +51,8 @@ class Category(Page):
"""Return a nice string representation of the Category."""
return '<Category "{0}" of {1}>'.format(self.title, str(self._site))

def _get_members_via_sql(self, limit):
"""Return a list of tuples of (title, pageid) in the category."""
def _get_members_via_sql(self, limit, follow):
"""Iterate over Pages in the category using SQL."""
query = """SELECT page_title, page_namespace, page_id FROM page
JOIN categorylinks ON page_id = cl_from
WHERE cl_to = ?"""
@@ -64,42 +64,66 @@ class Category(Page):
else:
result = self._site.sql_query(query, (title,))

members = []
for row in result:
members = list(result)
for row in members:
base = row[0].replace("_", " ").decode("utf8")
namespace = self._site.namespace_id_to_name(row[1])
if namespace:
title = u":".join((namespace, base))
else: # Avoid doing a silly (albeit valid) ":Pagename" thing
title = base
members.append((title, row[2]))
return members
yield self._site.get_page(title, follow_redirects=follow,
pageid=row[2])

def _get_members_via_api(self, limit):
"""Return a list of page titles in the category using the API."""
def _get_members_via_api(self, limit, follow):
"""Iterate over Pages in the category using the API."""
params = {"action": "query", "list": "categorymembers",
"cmlimit": limit, "cmtitle": self._title}
if not limit:
params["cmlimit"] = 50 # Default value

result = self._site.api_query(**params)
members = result['query']['categorymembers']
return [member["title"] for member in members]

def get_members(self, use_sql=False, limit=None):
"""Return a list of page titles in the category.
"cmtitle": self._title}

while 1:
params["cmlimit"] = limit if limit else "max"
result = self._site.api_query(**params)
for member in result["query"]["categorymembers"]:
title = member["title"]
yield self._site.get_page(title, follow_redirects=follow)

if "query-continue" in result:
qcontinue = result["query-continue"]["categorymembers"]
params["cmcontinue"] = qcontinue["cmcontinue"]
if limit:
limit -= len(result["query"]["categorymembers"])
else:
break

def get_members(self, use_sql=False, limit=None, follow_redirects=None):
"""Iterate over Pages in the category.

If *use_sql* is ``True``, we will use a SQL query instead of the API.
Pages will be returned as tuples of ``(title, pageid)`` instead of just
titles.

If *limit* is provided, we will provide this many titles, or less if
the category is smaller. It defaults to 50 for API queries; normal
users can go up to 500, and bots can go up to 5,000 on a single API
query. If we're using SQL, the limit is ``None`` by default (returning
all pages in the category), but an arbitrary limit can still be chosen.
Note that pages are retrieved from the API in chunks (by default, in
500-page chunks for normal users and 5000-page chunks for bots and
admins), so queries may be made as we go along. If *limit* is given, we
will provide this many pages, or less if the category is smaller. By
default, *limit* is ``None``, meaning we will keep iterating over
members until the category is exhausted. *follow_redirects* is passed
directly to :py:meth:`site.get_page()
<earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
will use the value passed to our :py:meth:`__init__`.

.. note::
Be careful when iterating over very large categories with no limit.
If using the API, at best, you will make one query per 5000 pages,
which can add up significantly for categories with hundreds of
thousands of members. As for SQL, note that *all page titles are
stored internally* as soon as the query is made, so the site-wide
SQL lock can be freed and unrelated queries can be made without
requiring a separate connection to be opened. This is generally not
an issue unless your category's size approaches several hundred
thousand, in which case the sheer number of titles in memory becomes
problematic.
"""
if follow_redirects is None:
follow_redirects = self._follow_redirects
if use_sql:
return self._get_members_via_sql(limit)
return self._get_members_via_sql(limit, follow_redirects)
else:
return self._get_members_via_api(limit)
return self._get_members_via_api(limit, follow_redirects)

+ 46
- 43
earwigbot/wiki/page.py View File

@@ -43,7 +43,7 @@ class Page(CopyrightMixin):

- :py:attr:`site`: the page's corresponding Site object
- :py:attr:`title`: the page's title, or pagename
- :py:attr:`exists`: whether the page exists
- :py:attr:`exists`: whether or not the page exists
- :py:attr:`pageid`: an integer ID representing the page
- :py:attr:`url`: the page's URL
- :py:attr:`namespace`: the page's namespace as an integer
@@ -70,17 +70,20 @@ class Page(CopyrightMixin):
URL
"""

re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
PAGE_UNKNOWN = 0
PAGE_INVALID = 1
PAGE_MISSING = 2
PAGE_EXISTS = 3

def __init__(self, site, title, follow_redirects=False):
def __init__(self, site, title, follow_redirects=False, pageid=None):
"""Constructor for new Page instances.

Takes three arguments: a Site object, the Page's title (or pagename),
and whether or not to follow redirects (optional, defaults to False).
Takes four arguments: a Site object, the Page's title (or pagename),
whether or not to follow redirects (optional, defaults to False), and
a page ID to supplement the title (optional, defaults to None - i.e.,
we will have to query the API to get it).

As with User, site.get_page() is preferred. Site's method has support
for a default *follow_redirects* value in our config, while __init__()
always defaults to False.
As with User, site.get_page() is preferred.

__init__() will not do any API queries, but it will use basic namespace
logic to determine our namespace ID and if we are a talkpage.
@@ -89,9 +92,9 @@ class Page(CopyrightMixin):
self._site = site
self._title = title.strip()
self._follow_redirects = self._keep_following = follow_redirects
self._pageid = pageid

self._exists = 0
self._pageid = None
self._exists = self.PAGE_UNKNOWN
self._is_redirect = None
self._lastrevid = None
self._protection = None
@@ -140,7 +143,7 @@ class Page(CopyrightMixin):
Note that validity != existence. If a page's title is invalid (e.g, it
contains "[") it will always be invalid, and cannot be edited.
"""
if self._exists == 1:
if self._exists == self.PAGE_INVALID:
e = "Page '{0}' is invalid.".format(self._title)
raise exceptions.InvalidPageError(e)

@@ -152,7 +155,7 @@ class Page(CopyrightMixin):
It will also call _assert_validity() beforehand.
"""
self._assert_validity()
if self._exists == 2:
if self._exists == self.PAGE_MISSING:
e = "Page '{0}' does not exist.".format(self._title)
raise exceptions.PageNotFoundError(e)

@@ -213,14 +216,14 @@ class Page(CopyrightMixin):
if "missing" in res:
# If it has a negative ID and it's missing; we can still get
# data like the namespace, protection, and URL:
self._exists = 2
self._exists = self.PAGE_MISSING
else:
# If it has a negative ID and it's invalid, then break here,
# because there's no other data for us to get:
self._exists = 1
self._exists = self.PAGE_INVALID
return
else:
self._exists = 3
self._exists = self.PAGE_EXISTS

self._fullurl = res["fullurl"]
self._protection = res["protection"]
@@ -312,7 +315,7 @@ class Page(CopyrightMixin):
if result["edit"]["result"] == "Success":
self._content = None
self._basetimestamp = None
self._exists = 0
self._exists = self.PAGE_UNKNOWN
return

# If we're here, then the edit failed. If it's because of AssertEdit,
@@ -346,7 +349,7 @@ class Page(CopyrightMixin):
params["starttimestamp"] = self._starttimestamp
if self._basetimestamp:
params["basetimestamp"] = self._basetimestamp
if self._exists == 2:
if self._exists == self.PAGE_MISSING:
# Page does not exist; don't edit if it already exists:
params["createonly"] = "true"
else:
@@ -384,7 +387,7 @@ class Page(CopyrightMixin):
# These attributes are now invalidated:
self._content = None
self._basetimestamp = None
self._exists = 0
self._exists = self.PAGE_UNKNOWN
raise exceptions.EditConflictError(error.info)

elif error.code in ["emptypage", "emptynewsection"]:
@@ -432,12 +435,12 @@ class Page(CopyrightMixin):

@property
def site(self):
"""The Page's corresponding Site object."""
"""The page's corresponding Site object."""
return self._site

@property
def title(self):
"""The Page's title, or "pagename".
"""The page's title, or "pagename".

This won't do any API queries on its own. Any other attributes or
methods that do API queries will reload the title, however, like
@@ -448,37 +451,36 @@ class Page(CopyrightMixin):

@property
def exists(self):
"""Information about whether the Page exists or not.
"""Whether or not the page exists.

The "information" is a tuple with two items. The first is a bool,
either ``True`` if the page exists or ``False`` if it does not. The
second is a string giving more information, either ``"invalid"``,
(title is invalid, e.g. it contains ``"["``), ``"missing"``, or
``"exists"``.
This will be a number; its value does not matter, but it will equal
one of :py:attr:`self.PAGE_INVALID <PAGE_INVALID>`,
:py:attr:`self.PAGE_MISSING <PAGE_MISSING>`, or
:py:attr:`self.PAGE_EXISTS <PAGE_EXISTS>`.

Makes an API query only if we haven't already made one.
"""
cases = {
0: (None, "unknown"),
1: (False, "invalid"),
2: (False, "missing"),
3: (True, "exists"),
}
if self._exists == 0:
if self._exists == self.PAGE_UNKNOWN:
self._load()
return cases[self._exists]
return self._exists

@property
def pageid(self):
"""An integer ID representing the Page.
"""An integer ID representing the page.

Makes an API query only if we haven't already made one.
Makes an API query only if we haven't already made one and the *pageid*
parameter to :py:meth:`__init__` was left as ``None``, which should be
true for all cases except when pages are returned by an SQL generator
(like :py:meth:`category.get_members(use_sql=True)
<earwigbot.wiki.category.Category.get_members>`).

Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
:py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
invalid or the page does not exist, respectively.
"""
if self._exists == 0:
if self._pageid:
return self._pageid
if self._exists == self.PAGE_UNKNOWN:
self._load()
self._assert_existence() # Missing pages do not have IDs
return self._pageid
@@ -518,7 +520,7 @@ class Page(CopyrightMixin):
name is invalid. Won't raise an error if the page is missing because
those can still be create-protected.
"""
if self._exists == 0:
if self._exists == self.PAGE_UNKNOWN:
self._load()
self._assert_validity() # Invalid pages cannot be protected
return self._protection
@@ -541,7 +543,7 @@ class Page(CopyrightMixin):

We will return ``False`` even if the page does not exist or is invalid.
"""
if self._exists == 0:
if self._exists == self.PAGE_UNKNOWN:
self._load()
return self._is_redirect

@@ -606,7 +608,7 @@ class Page(CopyrightMixin):
Raises InvalidPageError or PageNotFoundError if the page name is
invalid or the page does not exist, respectively.
"""
if self._exists == 0:
if self._exists == self.PAGE_UNKNOWN:
# Kill two birds with one stone by doing an API query for both our
# attributes and our page content:
query = self._site.api_query
@@ -621,7 +623,7 @@ class Page(CopyrightMixin):
if self._keep_following and self._is_redirect:
self._title = self.get_redirect_target()
self._keep_following = False # Don't follow double redirects
self._exists = 0 # Force another API query
self._exists = self.PAGE_UNKNOWN # Force another API query
self.get()

return self._content
@@ -645,9 +647,10 @@ class Page(CopyrightMixin):
:py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
redirect.
"""
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
content = self.get()
try:
return re.findall(self.re_redirect, content, flags=re.I)[0]
return re.findall(re_redirect, content, flags=re.I)[0]
except IndexError:
e = "The page does not appear to have a redirect target."
raise exceptions.RedirectError(e)
@@ -666,7 +669,7 @@ class Page(CopyrightMixin):
:py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
invalid or the page does not exist, respectively.
"""
if self._exists == 0:
if self._exists == self.PAGE_UNKNOWN:
self._load()
self._assert_existence()
if not self._creator:


+ 16
- 6
earwigbot/wiki/site.py View File

@@ -184,6 +184,12 @@ class Site(object):
res = "<Site {0} ({1}:{2}) at {3}>"
return res.format(self.name, self.project, self.lang, self.domain)

def _unicodeify(self, value, encoding="utf8"):
    """Return *value* as unicode, decoding via *encoding* when needed."""
    return value if isinstance(value, unicode) else unicode(value, encoding)

def _urlencode_utf8(self, params):
"""Implement urllib.urlencode() with support for unicode input."""
enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s)
@@ -682,7 +688,7 @@ class Site(object):
e = "There is no namespace with name '{0}'.".format(name)
raise exceptions.NamespaceNotFoundError(e)

def get_page(self, title, follow_redirects=False):
def get_page(self, title, follow_redirects=False, pageid=None):
"""Return a :py:class:`Page` object for the given title.

*follow_redirects* is passed directly to
@@ -696,23 +702,26 @@ class Site(object):
redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
provide that.
"""
title = self._unicodeify(title)
prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
prefix = title.split(":", 1)[0]
if prefix != title: # Avoid a page that is simply "Category"
if prefix in prefixes:
return Category(self, title, follow_redirects)
return Page(self, title, follow_redirects)
return Category(self, title, follow_redirects, pageid)
return Page(self, title, follow_redirects, pageid)

def get_category(self, catname, follow_redirects=False):
def get_category(self, catname, follow_redirects=False, pageid=None):
"""Return a :py:class:`Category` object for the given category name.

*catname* should be given *without* a namespace prefix. This method is
really just shorthand for :py:meth:`get_page("Category:" + catname)
<get_page>`.
"""
# Decode byte-string names up front so the u':'.join() below never has to
# implicitly decode a non-ASCII str.
catname = self._unicodeify(catname)
prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
pagename = ':'.join((prefix, catname))
return Category(self, pagename, follow_redirects)
pagename = u':'.join((prefix, catname))
return Category(self, pagename, follow_redirects, pageid)

def get_user(self, username=None):
"""Return a :py:class:`User` object for the given username.
@@ -721,6 +730,7 @@ class Site(object):
:py:class:`~earwigbot.wiki.user.User` object representing the currently
logged-in (or anonymous!) user is returned.
"""
username = self._unicodeify(username)
if not username:
username = self._get_username()
return User(self, username)

+ 6
- 0
earwigbot/wiki/user.py View File

@@ -39,6 +39,7 @@ class User(object):

*Attributes:*

- :py:attr:`site`: the user's corresponding Site object
- :py:attr:`name`: the user's username
- :py:attr:`exists`: ``True`` if the user exists, else ``False``
- :py:attr:`userid`: an integer ID representing the user
@@ -155,6 +156,11 @@ class User(object):
self._gender = res["gender"]

@property
def site(self):
"""The user's corresponding Site object."""
return self._site

@property
def name(self):
"""The user's username.



Loading…
Cancel
Save