@@ -97,11 +97,11 @@ and the following methods: | |||
- :py:meth:`namespace_name_to_id(name) | |||
<earwigbot.wiki.site.Site.namespace_name_to_id>`: given a namespace name, | |||
returns the associated namespace ID | |||
- :py:meth:`get_page(title, follow_redirects=False) | |||
- :py:meth:`get_page(title, follow_redirects=False, ...) | |||
<earwigbot.wiki.site.Site.get_page>`: returns a ``Page`` object for the given | |||
title (or a :py:class:`~earwigbot.wiki.category.Category` object if the | |||
page's namespace is "``Category:``") | |||
- :py:meth:`get_category(catname, follow_redirects=False) | |||
- :py:meth:`get_category(catname, follow_redirects=False, ...) | |||
<earwigbot.wiki.site.Site.get_category>`: returns a ``Category`` object for | |||
the given title (sans namespace) | |||
- :py:meth:`get_user(username) <earwigbot.wiki.site.Site.get_user>`: returns a | |||
@@ -120,7 +120,7 @@ provide the following attributes: | |||
- :py:attr:`~earwigbot.wiki.page.Page.site`: the page's corresponding | |||
:py:class:`~earwigbot.wiki.site.Site` object | |||
- :py:attr:`~earwigbot.wiki.page.Page.title`: the page's title, or pagename | |||
- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether the page exists | |||
- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether or not the page exists | |||
- :py:attr:`~earwigbot.wiki.page.Page.pageid`: an integer ID representing the | |||
page | |||
- :py:attr:`~earwigbot.wiki.page.Page.url`: the page's URL | |||
@@ -166,9 +166,10 @@ or :py:meth:`site.get_page(title) <earwigbot.wiki.site.Site.get_page>` where | |||
``title`` is in the ``Category:`` namespace) provide the following additional | |||
method: | |||
- :py:meth:`get_members(use_sql=False, limit=None) | |||
<earwigbot.wiki.category.Category.get_members>`: returns a list of page | |||
titles in the category (limit is ``50`` by default if using the API) | |||
- :py:meth:`get_members(use_sql=False, limit=None, ...) | |||
<earwigbot.wiki.category.Category.get_members>`: iterates over | |||
:py:class:`~earwigbot.wiki.page.Page`\ s in the category, until either the | |||
category is exhausted or (if given) ``limit`` is reached | |||
Users | |||
~~~~~ | |||
@@ -178,6 +179,8 @@ Create :py:class:`earwigbot.wiki.User <earwigbot.wiki.user.User>` objects with | |||
:py:meth:`page.get_creator() <earwigbot.wiki.page.Page.get_creator>`. They | |||
provide the following attributes: | |||
- :py:attr:`~earwigbot.wiki.user.User.site`: the user's corresponding | |||
:py:class:`~earwigbot.wiki.site.Site` object | |||
- :py:attr:`~earwigbot.wiki.user.User.name`: the user's username | |||
- :py:attr:`~earwigbot.wiki.user.User.exists`: ``True`` if the user exists, or | |||
``False`` if they do not | |||
@@ -23,7 +23,7 @@ | |||
from earwigbot.commands import BaseCommand | |||
class Command(BaseCommand): | |||
"""Links the user to the pending AFC submissions page and category.""" | |||
"""Link the user to the pending AFC submissions page and category.""" | |||
name = "pending" | |||
def check(self, data): | |||
@@ -70,7 +70,7 @@ class Command(BaseCommand): | |||
def get_page(self, title): | |||
page = self.site.get_page(title, follow_redirects=False) | |||
if page.exists[0]: | |||
if page.exists == page.PAGE_EXISTS: | |||
return page | |||
def report(self, page): | |||
@@ -39,7 +39,7 @@ class Category(Page): | |||
*Public methods:* | |||
- :py:meth:`get_members`: returns a list of page titles in the category | |||
- :py:meth:`get_members`: iterates over Pages in the category | |||
""" | |||
def __repr__(self): | |||
@@ -51,8 +51,8 @@ class Category(Page): | |||
"""Return a nice string representation of the Category.""" | |||
return '<Category "{0}" of {1}>'.format(self.title, str(self._site)) | |||
def _get_members_via_sql(self, limit): | |||
"""Return a list of tuples of (title, pageid) in the category.""" | |||
def _get_members_via_sql(self, limit, follow): | |||
"""Iterate over Pages in the category using SQL.""" | |||
query = """SELECT page_title, page_namespace, page_id FROM page | |||
JOIN categorylinks ON page_id = cl_from | |||
WHERE cl_to = ?""" | |||
@@ -64,42 +64,66 @@ class Category(Page): | |||
else: | |||
result = self._site.sql_query(query, (title,)) | |||
members = [] | |||
for row in result: | |||
members = list(result) | |||
for row in members: | |||
base = row[0].replace("_", " ").decode("utf8") | |||
namespace = self._site.namespace_id_to_name(row[1]) | |||
if namespace: | |||
title = u":".join((namespace, base)) | |||
else: # Avoid doing a silly (albeit valid) ":Pagename" thing | |||
title = base | |||
members.append((title, row[2])) | |||
return members | |||
yield self._site.get_page(title, follow_redirects=follow, | |||
pageid=row[2]) | |||
def _get_members_via_api(self, limit): | |||
"""Return a list of page titles in the category using the API.""" | |||
def _get_members_via_api(self, limit, follow): | |||
"""Iterate over Pages in the category using the API.""" | |||
params = {"action": "query", "list": "categorymembers", | |||
"cmlimit": limit, "cmtitle": self._title} | |||
if not limit: | |||
params["cmlimit"] = 50 # Default value | |||
result = self._site.api_query(**params) | |||
members = result['query']['categorymembers'] | |||
return [member["title"] for member in members] | |||
def get_members(self, use_sql=False, limit=None): | |||
"""Return a list of page titles in the category. | |||
"cmtitle": self._title} | |||
while 1: | |||
params["cmlimit"] = limit if limit else "max" | |||
result = self._site.api_query(**params) | |||
for member in result["query"]["categorymembers"]: | |||
title = member["title"] | |||
yield self._site.get_page(title, follow_redirects=follow) | |||
if "query-continue" in result: | |||
qcontinue = result["query-continue"]["categorymembers"] | |||
params["cmcontinue"] = qcontinue["cmcontinue"] | |||
if limit: | |||
limit -= len(result["query"]["categorymembers"]) | |||
else: | |||
break | |||
def get_members(self, use_sql=False, limit=None, follow_redirects=None): | |||
"""Iterate over Pages in the category. | |||
If *use_sql* is ``True``, we will use a SQL query instead of the API. | |||
Pages will be returned as tuples of ``(title, pageid)`` instead of just | |||
titles. | |||
If *limit* is provided, we will provide this many titles, or less if | |||
the category is smaller. It defaults to 50 for API queries; normal | |||
users can go up to 500, and bots can go up to 5,000 on a single API | |||
query. If we're using SQL, the limit is ``None`` by default (returning | |||
all pages in the category), but an arbitrary limit can still be chosen. | |||
Note that pages are retrieved from the API in chunks (by default, in | |||
500-page chunks for normal users and 5000-page chunks for bots and | |||
admins), so queries may be made as we go along. If *limit* is given, we | |||
will provide this many pages, or less if the category is smaller. By | |||
default, *limit* is ``None``, meaning we will keep iterating over | |||
members until the category is exhausted. *follow_redirects* is passed | |||
directly to :py:meth:`site.get_page() | |||
<earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which | |||
will use the value passed to our :py:meth:`__init__`. | |||
.. note:: | |||
Be careful when iterating over very large categories with no limit. | |||
If using the API, at best, you will make one query per 5000 pages, | |||
which can add up significantly for categories with hundreds of | |||
thousands of members. As for SQL, note that *all page titles are | |||
stored internally* as soon as the query is made, so the site-wide | |||
SQL lock can be freed and unrelated queries can be made without | |||
requiring a separate connection to be opened. This is generally not | |||
an issue unless your category's size approaches several hundred | |||
thousand, in which case the sheer number of titles in memory becomes | |||
problematic. | |||
""" | |||
if follow_redirects is None: | |||
follow_redirects = self._follow_redirects | |||
if use_sql: | |||
return self._get_members_via_sql(limit) | |||
return self._get_members_via_sql(limit, follow_redirects) | |||
else: | |||
return self._get_members_via_api(limit) | |||
return self._get_members_via_api(limit, follow_redirects) |
@@ -43,7 +43,7 @@ class Page(CopyrightMixin): | |||
- :py:attr:`site`: the page's corresponding Site object | |||
- :py:attr:`title`: the page's title, or pagename | |||
- :py:attr:`exists`: whether the page exists | |||
- :py:attr:`exists`: whether or not the page exists | |||
- :py:attr:`pageid`: an integer ID representing the page | |||
- :py:attr:`url`: the page's URL | |||
- :py:attr:`namespace`: the page's namespace as an integer | |||
@@ -70,17 +70,20 @@ class Page(CopyrightMixin): | |||
URL | |||
""" | |||
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | |||
PAGE_UNKNOWN = 0 | |||
PAGE_INVALID = 1 | |||
PAGE_MISSING = 2 | |||
PAGE_EXISTS = 3 | |||
def __init__(self, site, title, follow_redirects=False): | |||
def __init__(self, site, title, follow_redirects=False, pageid=None): | |||
"""Constructor for new Page instances. | |||
Takes three arguments: a Site object, the Page's title (or pagename), | |||
and whether or not to follow redirects (optional, defaults to False). | |||
Takes four arguments: a Site object, the Page's title (or pagename), | |||
whether or not to follow redirects (optional, defaults to False), and | |||
a page ID to supplement the title (optional, defaults to None - i.e., | |||
we will have to query the API to get it). | |||
As with User, site.get_page() is preferred. Site's method has support | |||
for a default *follow_redirects* value in our config, while __init__() | |||
always defaults to False. | |||
As with User, site.get_page() is preferred. | |||
__init__() will not do any API queries, but it will use basic namespace | |||
logic to determine our namespace ID and if we are a talkpage. | |||
@@ -89,9 +92,9 @@ class Page(CopyrightMixin): | |||
self._site = site | |||
self._title = title.strip() | |||
self._follow_redirects = self._keep_following = follow_redirects | |||
self._pageid = pageid | |||
self._exists = 0 | |||
self._pageid = None | |||
self._exists = self.PAGE_UNKNOWN | |||
self._is_redirect = None | |||
self._lastrevid = None | |||
self._protection = None | |||
@@ -140,7 +143,7 @@ class Page(CopyrightMixin): | |||
Note that validity != existence. If a page's title is invalid (e.g, it | |||
contains "[") it will always be invalid, and cannot be edited. | |||
""" | |||
if self._exists == 1: | |||
if self._exists == self.PAGE_INVALID: | |||
e = "Page '{0}' is invalid.".format(self._title) | |||
raise exceptions.InvalidPageError(e) | |||
@@ -152,7 +155,7 @@ class Page(CopyrightMixin): | |||
It will also call _assert_validity() beforehand. | |||
""" | |||
self._assert_validity() | |||
if self._exists == 2: | |||
if self._exists == self.PAGE_MISSING: | |||
e = "Page '{0}' does not exist.".format(self._title) | |||
raise exceptions.PageNotFoundError(e) | |||
@@ -213,14 +216,14 @@ class Page(CopyrightMixin): | |||
if "missing" in res: | |||
# If it has a negative ID and it's missing; we can still get | |||
# data like the namespace, protection, and URL: | |||
self._exists = 2 | |||
self._exists = self.PAGE_MISSING | |||
else: | |||
# If it has a negative ID and it's invalid, then break here, | |||
# because there's no other data for us to get: | |||
self._exists = 1 | |||
self._exists = self.PAGE_INVALID | |||
return | |||
else: | |||
self._exists = 3 | |||
self._exists = self.PAGE_EXISTS | |||
self._fullurl = res["fullurl"] | |||
self._protection = res["protection"] | |||
@@ -312,7 +315,7 @@ class Page(CopyrightMixin): | |||
if result["edit"]["result"] == "Success": | |||
self._content = None | |||
self._basetimestamp = None | |||
self._exists = 0 | |||
self._exists = self.PAGE_UNKNOWN | |||
return | |||
# If we're here, then the edit failed. If it's because of AssertEdit, | |||
@@ -346,7 +349,7 @@ class Page(CopyrightMixin): | |||
params["starttimestamp"] = self._starttimestamp | |||
if self._basetimestamp: | |||
params["basetimestamp"] = self._basetimestamp | |||
if self._exists == 2: | |||
if self._exists == self.PAGE_MISSING: | |||
# Page does not exist; don't edit if it already exists: | |||
params["createonly"] = "true" | |||
else: | |||
@@ -384,7 +387,7 @@ class Page(CopyrightMixin): | |||
# These attributes are now invalidated: | |||
self._content = None | |||
self._basetimestamp = None | |||
self._exists = 0 | |||
self._exists = self.PAGE_UNKNOWN | |||
raise exceptions.EditConflictError(error.info) | |||
elif error.code in ["emptypage", "emptynewsection"]: | |||
@@ -432,12 +435,12 @@ class Page(CopyrightMixin): | |||
@property | |||
def site(self): | |||
"""The Page's corresponding Site object.""" | |||
"""The page's corresponding Site object.""" | |||
return self._site | |||
@property | |||
def title(self): | |||
"""The Page's title, or "pagename". | |||
"""The page's title, or "pagename". | |||
This won't do any API queries on its own. Any other attributes or | |||
methods that do API queries will reload the title, however, like | |||
@@ -448,37 +451,36 @@ class Page(CopyrightMixin): | |||
@property | |||
def exists(self): | |||
"""Information about whether the Page exists or not. | |||
"""Whether or not the page exists. | |||
The "information" is a tuple with two items. The first is a bool, | |||
either ``True`` if the page exists or ``False`` if it does not. The | |||
second is a string giving more information, either ``"invalid"``, | |||
(title is invalid, e.g. it contains ``"["``), ``"missing"``, or | |||
``"exists"``. | |||
This will be a number; its value does not matter, but it will equal | |||
one of :py:attr:`self.PAGE_INVALID <PAGE_INVALID>`, | |||
:py:attr:`self.PAGE_MISSING <PAGE_MISSING>`, or | |||
:py:attr:`self.PAGE_EXISTS <PAGE_EXISTS>`. | |||
Makes an API query only if we haven't already made one. | |||
""" | |||
cases = { | |||
0: (None, "unknown"), | |||
1: (False, "invalid"), | |||
2: (False, "missing"), | |||
3: (True, "exists"), | |||
} | |||
if self._exists == 0: | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
return cases[self._exists] | |||
return self._exists | |||
@property | |||
def pageid(self): | |||
"""An integer ID representing the Page. | |||
"""An integer ID representing the page. | |||
Makes an API query only if we haven't already made one. | |||
Makes an API query only if we haven't already made one and the *pageid* | |||
parameter to :py:meth:`__init__` was left as ``None``, which should be | |||
true for all cases except when pages are returned by an SQL generator | |||
(like :py:meth:`category.get_members(use_sql=True) | |||
<earwigbot.wiki.category.Category.get_members>`). | |||
Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or | |||
:py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is | |||
invalid or the page does not exist, respectively. | |||
""" | |||
if self._exists == 0: | |||
if self._pageid: | |||
return self._pageid | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
self._assert_existence() # Missing pages do not have IDs | |||
return self._pageid | |||
@@ -518,7 +520,7 @@ class Page(CopyrightMixin): | |||
name is invalid. Won't raise an error if the page is missing because | |||
those can still be create-protected. | |||
""" | |||
if self._exists == 0: | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
self._assert_validity() # Invalid pages cannot be protected | |||
return self._protection | |||
@@ -541,7 +543,7 @@ class Page(CopyrightMixin): | |||
We will return ``False`` even if the page does not exist or is invalid. | |||
""" | |||
if self._exists == 0: | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
return self._is_redirect | |||
@@ -606,7 +608,7 @@ class Page(CopyrightMixin): | |||
Raises InvalidPageError or PageNotFoundError if the page name is | |||
invalid or the page does not exist, respectively. | |||
""" | |||
if self._exists == 0: | |||
if self._exists == self.PAGE_UNKNOWN: | |||
# Kill two birds with one stone by doing an API query for both our | |||
# attributes and our page content: | |||
query = self._site.api_query | |||
@@ -621,7 +623,7 @@ class Page(CopyrightMixin): | |||
if self._keep_following and self._is_redirect: | |||
self._title = self.get_redirect_target() | |||
self._keep_following = False # Don't follow double redirects | |||
self._exists = 0 # Force another API query | |||
self._exists = self.PAGE_UNKNOWN # Force another API query | |||
self.get() | |||
return self._content | |||
@@ -645,9 +647,10 @@ class Page(CopyrightMixin): | |||
:py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a | |||
redirect. | |||
""" | |||
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | |||
content = self.get() | |||
try: | |||
return re.findall(self.re_redirect, content, flags=re.I)[0] | |||
return re.findall(re_redirect, content, flags=re.I)[0] | |||
except IndexError: | |||
e = "The page does not appear to have a redirect target." | |||
raise exceptions.RedirectError(e) | |||
@@ -666,7 +669,7 @@ class Page(CopyrightMixin): | |||
:py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is | |||
invalid or the page does not exist, respectively. | |||
""" | |||
if self._exists == 0: | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
self._assert_existence() | |||
if not self._creator: | |||
@@ -184,6 +184,12 @@ class Site(object): | |||
res = "<Site {0} ({1}:{2}) at {3}>" | |||
return res.format(self.name, self.project, self.lang, self.domain) | |||
def _unicodeify(self, value, encoding="utf8"): | |||
"""Return input as unicode if it's not unicode to begin with.""" | |||
if isinstance(value, unicode): | |||
return value | |||
return unicode(value, encoding) | |||
def _urlencode_utf8(self, params): | |||
"""Implement urllib.urlencode() with support for unicode input.""" | |||
enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s) | |||
@@ -682,7 +688,7 @@ class Site(object): | |||
e = "There is no namespace with name '{0}'.".format(name) | |||
raise exceptions.NamespaceNotFoundError(e) | |||
def get_page(self, title, follow_redirects=False): | |||
def get_page(self, title, follow_redirects=False, pageid=None): | |||
"""Return a :py:class:`Page` object for the given title. | |||
*follow_redirects* is passed directly to | |||
@@ -696,23 +702,26 @@ class Site(object): | |||
redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods | |||
provide that. | |||
""" | |||
title = self._unicodeify(title) | |||
prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True) | |||
prefix = title.split(":", 1)[0] | |||
if prefix != title: # Avoid a page that is simply "Category" | |||
if prefix in prefixes: | |||
return Category(self, title, follow_redirects) | |||
return Page(self, title, follow_redirects) | |||
return Category(self, title, follow_redirects, pageid) | |||
return Page(self, title, follow_redirects, pageid) | |||
def get_category(self, catname, follow_redirects=False): | |||
def get_category(self, catname, follow_redirects=False, pageid=None): | |||
"""Return a :py:class:`Category` object for the given category name. | |||
*catname* should be given *without* a namespace prefix. This method is | |||
really just shorthand for :py:meth:`get_page("Category:" + catname) | |||
<get_page>`. | |||
""" | |||
catname = self._unicodeify(catname) | |||
name = name if isinstance(name, unicode) else name.decode("utf8") | |||
prefix = self.namespace_id_to_name(constants.NS_CATEGORY) | |||
pagename = ':'.join((prefix, catname)) | |||
return Category(self, pagename, follow_redirects) | |||
pagename = u':'.join((prefix, catname)) | |||
return Category(self, pagename, follow_redirects, pageid) | |||
def get_user(self, username=None): | |||
"""Return a :py:class:`User` object for the given username. | |||
@@ -721,6 +730,7 @@ class Site(object): | |||
:py:class:`~earwigbot.wiki.user.User` object representing the currently | |||
logged-in (or anonymous!) user is returned. | |||
""" | |||
username = self._unicodeify(username) | |||
if not username: | |||
username = self._get_username() | |||
return User(self, username) |
@@ -39,6 +39,7 @@ class User(object): | |||
*Attributes:* | |||
- :py:attr:`site`: the user's corresponding Site object | |||
- :py:attr:`name`: the user's username | |||
- :py:attr:`exists`: ``True`` if the user exists, else ``False`` | |||
- :py:attr:`userid`: an integer ID representing the user | |||
@@ -155,6 +156,11 @@ class User(object): | |||
self._gender = res["gender"] | |||
@property | |||
def site(self): | |||
"""The user's corresponding Site object.""" | |||
return self._site | |||
@property | |||
def name(self): | |||
"""The user's username. | |||