From dcf912b65b57f0fa68ca2ac419bdec0b09cce0ab Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 14 May 2012 23:25:21 -0400 Subject: [PATCH] Make cat.get_members() an iterator; make page.exists output nicer; cleanup --- docs/toolset.rst | 15 ++++--- earwigbot/commands/afc_pending.py | 2 +- earwigbot/commands/afc_report.py | 2 +- earwigbot/wiki/category.py | 82 +++++++++++++++++++++++------------- earwigbot/wiki/page.py | 89 ++++++++++++++++++++------------------- earwigbot/wiki/site.py | 22 +++++++--- earwigbot/wiki/user.py | 6 +++ 7 files changed, 132 insertions(+), 86 deletions(-) diff --git a/docs/toolset.rst b/docs/toolset.rst index b8a4124..812b0c8 100644 --- a/docs/toolset.rst +++ b/docs/toolset.rst @@ -97,11 +97,11 @@ and the following methods: - :py:meth:`namespace_name_to_id(name) `: given a namespace name, returns the associated namespace ID -- :py:meth:`get_page(title, follow_redirects=False) +- :py:meth:`get_page(title, follow_redirects=False, ...) `: returns a ``Page`` object for the given title (or a :py:class:`~earwigbot.wiki.category.Category` object if the page's namespace is "``Category:``") -- :py:meth:`get_category(catname, follow_redirects=False) +- :py:meth:`get_category(catname, follow_redirects=False, ...) `: returns a ``Category`` object for the given title (sans namespace) - :py:meth:`get_user(username) `: returns a @@ -120,7 +120,7 @@ provide the following attributes: - :py:attr:`~earwigbot.wiki.page.Page.site`: the page's corresponding :py:class:`~earwigbot.wiki.site.Site` object - :py:attr:`~earwigbot.wiki.page.Page.title`: the page's title, or pagename -- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether the page exists +- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether or not the page exists - :py:attr:`~earwigbot.wiki.page.Page.pageid`: an integer ID representing the page - :py:attr:`~earwigbot.wiki.page.Page.url`: the page's URL @@ -166,9 +166,10 @@ or :py:meth:`site.get_page(title) ` where ``title`` is in the ``Category:`` namespace) provide the following additional method: -- :py:meth:`get_members(use_sql=False, limit=None) - `: returns a list of page - titles in the category (limit is ``50`` by default if using the API) +- :py:meth:`get_members(use_sql=False, limit=None, ...) + `: iterates over + :py:class:`~earwigbot.wiki.page.Page`\ s in the category, until either the + category is exhausted or (if given) ``limit`` is reached Users ~~~~~ @@ -178,6 +179,8 @@ Create :py:class:`earwigbot.wiki.User ` objects with :py:meth:`page.get_creator() `. They provide the following attributes: +- :py:attr:`~earwigbot.wiki.user.User.site`: the user's corresponding + :py:class:`~earwigbot.wiki.site.Site` object - :py:attr:`~earwigbot.wiki.user.User.name`: the user's username - :py:attr:`~earwigbot.wiki.user.User.exists`: ``True`` if the user exists, or ``False`` if they do not diff --git a/earwigbot/commands/afc_pending.py b/earwigbot/commands/afc_pending.py index d90b25f..1a43786 100644 --- a/earwigbot/commands/afc_pending.py +++ b/earwigbot/commands/afc_pending.py @@ -23,7 +23,7 @@ from earwigbot.commands import BaseCommand class Command(BaseCommand): - """Links the user to the pending AFC submissions page and category.""" + """Link the user to the pending AFC submissions page and category.""" name = "pending" def check(self, data): diff --git a/earwigbot/commands/afc_report.py b/earwigbot/commands/afc_report.py index 6129e11..c41e3bc 100644 --- a/earwigbot/commands/afc_report.py +++ b/earwigbot/commands/afc_report.py @@ -70,7 +70,7 @@ class Command(BaseCommand): def get_page(self, title): page = self.site.get_page(title, follow_redirects=False) - if page.exists[0]: + if page.exists == page.PAGE_EXISTS: return page def report(self, page): diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py index e953e70..2df9a0e 100644 --- a/earwigbot/wiki/category.py +++ b/earwigbot/wiki/category.py @@ -39,7 +39,7 @@ class Category(Page): *Public methods:* - - :py:meth:`get_members`: returns a list of page titles in the category + - :py:meth:`get_members`: iterates over Pages in the category """ def __repr__(self): @@ -51,8 +51,8 @@ class Category(Page): """Return a nice string representation of the Category.""" return ''.format(self.title, str(self._site)) - def _get_members_via_sql(self, limit): - """Return a list of tuples of (title, pageid) in the category.""" + def _get_members_via_sql(self, limit, follow): + """Iterate over Pages in the category using SQL.""" query = """SELECT page_title, page_namespace, page_id FROM page JOIN categorylinks ON page_id = cl_from WHERE cl_to = ?""" @@ -64,42 +64,66 @@ class Category(Page): else: result = self._site.sql_query(query, (title,)) - members = [] - for row in result: + members = list(result) + for row in members: base = row[0].replace("_", " ").decode("utf8") namespace = self._site.namespace_id_to_name(row[1]) if namespace: title = u":".join((namespace, base)) else: # Avoid doing a silly (albeit valid) ":Pagename" thing title = base - members.append((title, row[2])) - return members + yield self._site.get_page(title, follow_redirects=follow, + pageid=row[2]) - def _get_members_via_api(self, limit): - """Return a list of page titles in the category using the API.""" + def _get_members_via_api(self, limit, follow): + """Iterate over Pages in the category using the API.""" params = {"action": "query", "list": "categorymembers", - "cmlimit": limit, "cmtitle": self._title} - if not limit: - params["cmlimit"] = 50 # Default value - - result = self._site.api_query(**params) - members = result['query']['categorymembers'] - return [member["title"] for member in members] - - def get_members(self, use_sql=False, limit=None): - """Return a list of page titles in the category. + "cmtitle": self._title} + + while 1: + params["cmlimit"] = limit if limit else "max" + result = self._site.api_query(**params) + for member in result["query"]["categorymembers"]: + title = member["title"] + yield self._site.get_page(title, follow_redirects=follow) + + if "query-continue" in result: + qcontinue = result["query-continue"]["categorymembers"] + params["cmcontinue"] = qcontinue["cmcontinue"] + if limit: + limit -= len(result["query"]["categorymembers"]) + else: + break + + def get_members(self, use_sql=False, limit=None, follow_redirects=None): + """Iterate over Pages in the category. If *use_sql* is ``True``, we will use a SQL query instead of the API. - Pages will be returned as tuples of ``(title, pageid)`` instead of just - titles. - - If *limit* is provided, we will provide this many titles, or less if - the category is smaller. It defaults to 50 for API queries; normal - users can go up to 500, and bots can go up to 5,000 on a single API - query. If we're using SQL, the limit is ``None`` by default (returning - all pages in the category), but an arbitrary limit can still be chosen. + Note that pages are retrieved from the API in chunks (by default, in + 500-page chunks for normal users and 5000-page chunks for bots and + admins), so queries may be made as we go along. If *limit* is given, we + will provide this many pages, or less if the category is smaller. By + default, *limit* is ``None``, meaning we will keep iterating over + members until the category is exhausted. *follow_redirects* is passed + directly to :py:meth:`site.get_page() + `; it defaults to ``None``, which + will use the value passed to our :py:meth:`__init__`. + + .. note:: + Be careful when iterating over very large categories with no limit. + If using the API, at best, you will make one query per 5000 pages, + which can add up significantly for categories with hundreds of + thousands of members. As for SQL, note that *all page titles are + stored internally* as soon as the query is made, so the site-wide + SQL lock can be freed and unrelated queries can be made without + requiring a separate connection to be opened. This is generally not + an issue unless your category's size approaches several hundred + thousand, in which case the sheer number of titles in memory becomes + problematic. """ + if follow_redirects is None: + follow_redirects = self._follow_redirects if use_sql: - return self._get_members_via_sql(limit) + return self._get_members_via_sql(limit, follow_redirects) else: - return self._get_members_via_api(limit) + return self._get_members_via_api(limit, follow_redirects) diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 21acd6d..98f11dd 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -43,7 +43,7 @@ class Page(CopyrightMixin): - :py:attr:`site`: the page's corresponding Site object - :py:attr:`title`: the page's title, or pagename - - :py:attr:`exists`: whether the page exists + - :py:attr:`exists`: whether or not the page exists - :py:attr:`pageid`: an integer ID representing the page - :py:attr:`url`: the page's URL - :py:attr:`namespace`: the page's namespace as an integer @@ -70,17 +70,20 @@ class Page(CopyrightMixin): URL """ - re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" + PAGE_UNKNOWN = 0 + PAGE_INVALID = 1 + PAGE_MISSING = 2 + PAGE_EXISTS = 3 - def __init__(self, site, title, follow_redirects=False): + def __init__(self, site, title, follow_redirects=False, pageid=None): """Constructor for new Page instances. - Takes three arguments: a Site object, the Page's title (or pagename), - and whether or not to follow redirects (optional, defaults to False). + Takes four arguments: a Site object, the Page's title (or pagename), + whether or not to follow redirects (optional, defaults to False), and + a page ID to supplement the title (optional, defaults to None - i.e., + we will have to query the API to get it). - As with User, site.get_page() is preferred. Site's method has support - for a default *follow_redirects* value in our config, while __init__() - always defaults to False. + As with User, site.get_page() is preferred. __init__() will not do any API queries, but it will use basic namespace logic to determine our namespace ID and if we are a talkpage. @@ -89,9 +92,9 @@ class Page(CopyrightMixin): self._site = site self._title = title.strip() self._follow_redirects = self._keep_following = follow_redirects + self._pageid = pageid - self._exists = 0 - self._pageid = None + self._exists = self.PAGE_UNKNOWN self._is_redirect = None self._lastrevid = None self._protection = None @@ -140,7 +143,7 @@ class Page(CopyrightMixin): Note that validity != existence. If a page's title is invalid (e.g, it contains "[") it will always be invalid, and cannot be edited. """ - if self._exists == 1: + if self._exists == self.PAGE_INVALID: e = "Page '{0}' is invalid.".format(self._title) raise exceptions.InvalidPageError(e) @@ -152,7 +155,7 @@ class Page(CopyrightMixin): It will also call _assert_validity() beforehand. """ self._assert_validity() - if self._exists == 2: + if self._exists == self.PAGE_MISSING: e = "Page '{0}' does not exist.".format(self._title) raise exceptions.PageNotFoundError(e) @@ -213,14 +216,14 @@ class Page(CopyrightMixin): if "missing" in res: # If it has a negative ID and it's missing; we can still get # data like the namespace, protection, and URL: - self._exists = 2 + self._exists = self.PAGE_MISSING else: # If it has a negative ID and it's invalid, then break here, # because there's no other data for us to get: - self._exists = 1 + self._exists = self.PAGE_INVALID return else: - self._exists = 3 + self._exists = self.PAGE_EXISTS self._fullurl = res["fullurl"] self._protection = res["protection"] @@ -312,7 +315,7 @@ class Page(CopyrightMixin): if result["edit"]["result"] == "Success": self._content = None self._basetimestamp = None - self._exists = 0 + self._exists = self.PAGE_UNKNOWN return # If we're here, then the edit failed. If it's because of AssertEdit, @@ -346,7 +349,7 @@ class Page(CopyrightMixin): params["starttimestamp"] = self._starttimestamp if self._basetimestamp: params["basetimestamp"] = self._basetimestamp - if self._exists == 2: + if self._exists == self.PAGE_MISSING: # Page does not exist; don't edit if it already exists: params["createonly"] = "true" else: @@ -384,7 +387,7 @@ class Page(CopyrightMixin): # These attributes are now invalidated: self._content = None self._basetimestamp = None - self._exists = 0 + self._exists = self.PAGE_UNKNOWN raise exceptions.EditConflictError(error.info) elif error.code in ["emptypage", "emptynewsection"]: @@ -432,12 +435,12 @@ class Page(CopyrightMixin): @property def site(self): - """The Page's corresponding Site object.""" + """The page's corresponding Site object.""" return self._site @property def title(self): - """The Page's title, or "pagename". + """The page's title, or "pagename". This won't do any API queries on its own. Any other attributes or methods that do API queries will reload the title, however, like @@ -448,37 +451,36 @@ class Page(CopyrightMixin): @property def exists(self): - """Information about whether the Page exists or not. + """Whether or not the page exists. - The "information" is a tuple with two items. The first is a bool, - either ``True`` if the page exists or ``False`` if it does not. The - second is a string giving more information, either ``"invalid"``, - (title is invalid, e.g. it contains ``"["``), ``"missing"``, or - ``"exists"``. + This will be a number; its value does not matter, but it will equal + one of :py:attr:`self.PAGE_INVALID `, + :py:attr:`self.PAGE_MISSING `, or + :py:attr:`self.PAGE_EXISTS `. Makes an API query only if we haven't already made one. """ - cases = { - 0: (None, "unknown"), - 1: (False, "invalid"), - 2: (False, "missing"), - 3: (True, "exists"), - } - if self._exists == 0: + if self._exists == self.PAGE_UNKNOWN: self._load() - return cases[self._exists] + return self._exists @property def pageid(self): - """An integer ID representing the Page. + """An integer ID representing the page. - Makes an API query only if we haven't already made one. + Makes an API query only if we haven't already made one and the *pageid* + parameter to :py:meth:`__init__` was left as ``None``, which should be + true for all cases except when pages are returned by an SQL generator + (like :py:meth:`category.get_members(use_sql=True) + `). Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is invalid or the page does not exist, respectively. """ - if self._exists == 0: + if self._pageid: + return self._pageid + if self._exists == self.PAGE_UNKNOWN: self._load() self._assert_existence() # Missing pages do not have IDs return self._pageid @@ -518,7 +520,7 @@ class Page(CopyrightMixin): name is invalid. Won't raise an error if the page is missing because those can still be create-protected. """ - if self._exists == 0: + if self._exists == self.PAGE_UNKNOWN: self._load() self._assert_validity() # Invalid pages cannot be protected return self._protection @@ -541,7 +543,7 @@ class Page(CopyrightMixin): We will return ``False`` even if the page does not exist or is invalid. """ - if self._exists == 0: + if self._exists == self.PAGE_UNKNOWN: self._load() return self._is_redirect @@ -606,7 +608,7 @@ class Page(CopyrightMixin): Raises InvalidPageError or PageNotFoundError if the page name is invalid or the page does not exist, respectively. """ - if self._exists == 0: + if self._exists == self.PAGE_UNKNOWN: # Kill two birds with one stone by doing an API query for both our # attributes and our page content: query = self._site.api_query @@ -621,7 +623,7 @@ class Page(CopyrightMixin): if self._keep_following and self._is_redirect: self._title = self.get_redirect_target() self._keep_following = False # Don't follow double redirects - self._exists = 0 # Force another API query + self._exists = self.PAGE_UNKNOWN # Force another API query self.get() return self._content @@ -645,9 +647,10 @@ class Page(CopyrightMixin): :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a redirect. """ + re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" content = self.get() try: - return re.findall(self.re_redirect, content, flags=re.I)[0] + return re.findall(re_redirect, content, flags=re.I)[0] except IndexError: e = "The page does not appear to have a redirect target." raise exceptions.RedirectError(e) @@ -666,7 +669,7 @@ class Page(CopyrightMixin): :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is invalid or the page does not exist, respectively. """ - if self._exists == 0: + if self._exists == self.PAGE_UNKNOWN: self._load() self._assert_existence() if not self._creator: diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 3c4babc..eb818c4 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -184,6 +184,12 @@ class Site(object): res = "" return res.format(self.name, self.project, self.lang, self.domain) + def _unicodeify(self, value, encoding="utf8"): + """Return input as unicode if it's not unicode to begin with.""" + if isinstance(value, unicode): + return value + return unicode(value, encoding) + def _urlencode_utf8(self, params): """Implement urllib.urlencode() with support for unicode input.""" enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s) @@ -682,7 +688,7 @@ class Site(object): e = "There is no namespace with name '{0}'.".format(name) raise exceptions.NamespaceNotFoundError(e) - def get_page(self, title, follow_redirects=False): + def get_page(self, title, follow_redirects=False, pageid=None): """Return a :py:class:`Page` object for the given title. *follow_redirects* is passed directly to @@ -696,23 +702,26 @@ class Site(object): redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods provide that. """ + title = self._unicodeify(title) prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True) prefix = title.split(":", 1)[0] if prefix != title: # Avoid a page that is simply "Category" if prefix in prefixes: - return Category(self, title, follow_redirects) - return Page(self, title, follow_redirects) + return Category(self, title, follow_redirects, pageid) + return Page(self, title, follow_redirects, pageid) - def get_category(self, catname, follow_redirects=False): + def get_category(self, catname, follow_redirects=False, pageid=None): """Return a :py:class:`Category` object for the given category name. *catname* should be given *without* a namespace prefix. This method is really just shorthand for :py:meth:`get_page("Category:" + catname) `. """ + catname = self._unicodeify(catname) + name = name if isinstance(name, unicode) else name.decode("utf8") prefix = self.namespace_id_to_name(constants.NS_CATEGORY) - pagename = ':'.join((prefix, catname)) - return Category(self, pagename, follow_redirects) + pagename = u':'.join((prefix, catname)) + return Category(self, pagename, follow_redirects, pageid) def get_user(self, username=None): """Return a :py:class:`User` object for the given username. @@ -721,6 +730,7 @@ class Site(object): :py:class:`~earwigbot.wiki.user.User` object representing the currently logged-in (or anonymous!) user is returned. """ + username = self._unicodeify(username) if not username: username = self._get_username() return User(self, username) diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py index 619e6ad..9256a52 100644 --- a/earwigbot/wiki/user.py +++ b/earwigbot/wiki/user.py @@ -39,6 +39,7 @@ class User(object): *Attributes:* + - :py:attr:`site`: the user's corresponding Site object - :py:attr:`name`: the user's username - :py:attr:`exists`: ``True`` if the user exists, else ``False`` - :py:attr:`userid`: an integer ID representing the user @@ -155,6 +156,11 @@ class User(object): self._gender = res["gender"] @property + def site(self): + """The user's corresponding Site object.""" + return self._site + + @property def name(self): """The user's username.