From dcf912b65b57f0fa68ca2ac419bdec0b09cce0ab Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Mon, 14 May 2012 23:25:21 -0400
Subject: [PATCH] Make cat.get_members() an iterator; make page.exists output
 nicer; cleanup

---
 docs/toolset.rst                  | 15 ++++---
 earwigbot/commands/afc_pending.py |  2 +-
 earwigbot/commands/afc_report.py  |  2 +-
 earwigbot/wiki/category.py        | 82 +++++++++++++++++++++++-------------
 earwigbot/wiki/page.py            | 89 ++++++++++++++++++++-------------------
 earwigbot/wiki/site.py            | 22 +++++++---
 earwigbot/wiki/user.py            |  6 +++
 7 files changed, 132 insertions(+), 86 deletions(-)

diff --git a/docs/toolset.rst b/docs/toolset.rst
index b8a4124..812b0c8 100644
--- a/docs/toolset.rst
+++ b/docs/toolset.rst
@@ -97,11 +97,11 @@ and the following methods:
 - :py:meth:`namespace_name_to_id(name)
   <earwigbot.wiki.site.Site.namespace_name_to_id>`: given a namespace name,
   returns the associated namespace ID
-- :py:meth:`get_page(title, follow_redirects=False)
+- :py:meth:`get_page(title, follow_redirects=False, ...)
   <earwigbot.wiki.site.Site.get_page>`: returns a ``Page`` object for the given
   title (or a :py:class:`~earwigbot.wiki.category.Category` object if the
   page's namespace is "``Category:``")
-- :py:meth:`get_category(catname, follow_redirects=False)
+- :py:meth:`get_category(catname, follow_redirects=False, ...)
   <earwigbot.wiki.site.Site.get_category>`: returns a ``Category`` object for
   the given title (sans namespace)
 - :py:meth:`get_user(username) <earwigbot.wiki.site.Site.get_user>`: returns a
@@ -120,7 +120,7 @@ provide the following attributes:
 - :py:attr:`~earwigbot.wiki.page.Page.site`: the page's corresponding
   :py:class:`~earwigbot.wiki.site.Site` object
 - :py:attr:`~earwigbot.wiki.page.Page.title`: the page's title, or pagename
-- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether the page exists
+- :py:attr:`~earwigbot.wiki.page.Page.exists`: whether or not the page exists
 - :py:attr:`~earwigbot.wiki.page.Page.pageid`: an integer ID representing the
   page
 - :py:attr:`~earwigbot.wiki.page.Page.url`: the page's URL
@@ -166,9 +166,10 @@ or :py:meth:`site.get_page(title) <earwigbot.wiki.site.Site.get_page>` where
 ``title`` is in the ``Category:`` namespace) provide the following additional
 method:
 
-- :py:meth:`get_members(use_sql=False, limit=None)
-  <earwigbot.wiki.category.Category.get_members>`: returns a list of page
-  titles in the category (limit is ``50`` by default if using the API)
+- :py:meth:`get_members(use_sql=False, limit=None, ...)
+  <earwigbot.wiki.category.Category.get_members>`: iterates over
+  :py:class:`~earwigbot.wiki.page.Page`\ s in the category, until either the
+  category is exhausted or (if given) ``limit`` is reached
 
 Users
 ~~~~~
@@ -178,6 +179,8 @@ Create :py:class:`earwigbot.wiki.User <earwigbot.wiki.user.User>` objects with
 :py:meth:`page.get_creator() <earwigbot.wiki.page.Page.get_creator>`. They
 provide the following attributes:
 
+- :py:attr:`~earwigbot.wiki.user.User.site`: the user's corresponding
+  :py:class:`~earwigbot.wiki.site.Site` object
 - :py:attr:`~earwigbot.wiki.user.User.name`: the user's username
 - :py:attr:`~earwigbot.wiki.user.User.exists`: ``True`` if the user exists, or
   ``False`` if they do not
diff --git a/earwigbot/commands/afc_pending.py b/earwigbot/commands/afc_pending.py
index d90b25f..1a43786 100644
--- a/earwigbot/commands/afc_pending.py
+++ b/earwigbot/commands/afc_pending.py
@@ -23,7 +23,7 @@
 from earwigbot.commands import BaseCommand
 
 class Command(BaseCommand):
-    """Links the user to the pending AFC submissions page and category."""
+    """Link the user to the pending AFC submissions page and category."""
     name = "pending"
 
     def check(self, data):
diff --git a/earwigbot/commands/afc_report.py b/earwigbot/commands/afc_report.py
index 6129e11..c41e3bc 100644
--- a/earwigbot/commands/afc_report.py
+++ b/earwigbot/commands/afc_report.py
@@ -70,7 +70,7 @@ class Command(BaseCommand):
 
     def get_page(self, title):
         page = self.site.get_page(title, follow_redirects=False)
-        if page.exists[0]:
+        if page.exists == page.PAGE_EXISTS:
             return page
 
     def report(self, page):
diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index e953e70..2df9a0e 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -39,7 +39,7 @@ class Category(Page):
 
     *Public methods:*
 
-    - :py:meth:`get_members`: returns a list of page titles in the category
+    - :py:meth:`get_members`: iterates over Pages in the category
     """
 
     def __repr__(self):
@@ -51,8 +51,8 @@ class Category(Page):
         """Return a nice string representation of the Category."""
         return '<Category "{0}" of {1}>'.format(self.title, str(self._site))
 
-    def _get_members_via_sql(self, limit):
-        """Return a list of tuples of (title, pageid) in the category."""
+    def _get_members_via_sql(self, limit, follow):
+        """Iterate over Pages in the category using SQL."""
         query = """SELECT page_title, page_namespace, page_id FROM page
                    JOIN categorylinks ON page_id = cl_from
                    WHERE cl_to = ?"""
@@ -64,42 +64,66 @@ class Category(Page):
         else:
             result = self._site.sql_query(query, (title,))
 
-        members = []
-        for row in result:
+        members = list(result)
+        for row in members:
             base = row[0].replace("_", " ").decode("utf8")
             namespace = self._site.namespace_id_to_name(row[1])
             if namespace:
                 title = u":".join((namespace, base))
             else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
                 title = base
-            members.append((title, row[2]))
-        return members
+            yield self._site.get_page(title, follow_redirects=follow,
+                                      pageid=row[2])
 
-    def _get_members_via_api(self, limit):
-        """Return a list of page titles in the category using the API."""
+    def _get_members_via_api(self, limit, follow):
+        """Iterate over Pages in the category using the API."""
         params = {"action": "query", "list": "categorymembers",
-                  "cmlimit": limit, "cmtitle": self._title}
-        if not limit:
-            params["cmlimit"] = 50  # Default value
-
-        result = self._site.api_query(**params)
-        members = result['query']['categorymembers']
-        return [member["title"] for member in members]
-
-    def get_members(self, use_sql=False, limit=None):
-        """Return a list of page titles in the category.
+                  "cmtitle": self._title}
+
+        while 1:
+            params["cmlimit"] = limit if limit else "max"
+            result = self._site.api_query(**params)
+            for member in result["query"]["categorymembers"]:
+                title = member["title"]
+                yield self._site.get_page(title, follow_redirects=follow)
+
+            if "query-continue" in result:
+                qcontinue = result["query-continue"]["categorymembers"]
+                params["cmcontinue"] = qcontinue["cmcontinue"]
+                if limit:
+                    limit -= len(result["query"]["categorymembers"])
+            else:
+                break
+
+    def get_members(self, use_sql=False, limit=None, follow_redirects=None):
+        """Iterate over Pages in the category.
 
         If *use_sql* is ``True``, we will use a SQL query instead of the API.
-        Pages will be returned as tuples of ``(title, pageid)`` instead of just
-        titles.
-
-        If *limit* is provided, we will provide this many titles, or less if
-        the category is smaller. It defaults to 50 for API queries; normal
-        users can go up to 500, and bots can go up to 5,000 on a single API
-        query. If we're using SQL, the limit is ``None`` by default (returning
-        all pages in the category), but an arbitrary limit can still be chosen.
+        Note that pages are retrieved from the API in chunks (by default, in
+        500-page chunks for normal users and 5000-page chunks for bots and
+        admins), so queries may be made as we go along. If *limit* is given, we
+        will provide this many pages, or less if the category is smaller. By
+        default, *limit* is ``None``, meaning we will keep iterating over
+        members until the category is exhausted. *follow_redirects* is passed
+        directly to :py:meth:`site.get_page()
+        <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
+        will use the value passed to our :py:meth:`__init__`.
+
+        .. note::
+           Be careful when iterating over very large categories with no limit.
+           If using the API, at best, you will make one query per 5000 pages,
+           which can add up significantly for categories with hundreds of
+           thousands of members. As for SQL, note that *all page titles are
+           stored internally* as soon as the query is made, so the site-wide
+           SQL lock can be freed and unrelated queries can be made without
+           requiring a separate connection to be opened. This is generally not
+           an issue unless your category's size approaches several hundred
+           thousand, in which case the sheer number of titles in memory becomes
+           problematic.
         """
+        if follow_redirects is None:
+            follow_redirects = self._follow_redirects
         if use_sql:
-            return self._get_members_via_sql(limit)
+            return self._get_members_via_sql(limit, follow_redirects)
         else:
-            return self._get_members_via_api(limit)
+            return self._get_members_via_api(limit, follow_redirects)
diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index 21acd6d..98f11dd 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -43,7 +43,7 @@ class Page(CopyrightMixin):
 
     - :py:attr:`site`:        the page's corresponding Site object
     - :py:attr:`title`:       the page's title, or pagename
-    - :py:attr:`exists`:      whether the page exists
+    - :py:attr:`exists`:      whether or not the page exists
     - :py:attr:`pageid`:      an integer ID representing the page
     - :py:attr:`url`:         the page's URL
     - :py:attr:`namespace`:   the page's namespace as an integer
@@ -70,17 +70,20 @@ class Page(CopyrightMixin):
       URL
     """
 
-    re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
+    PAGE_UNKNOWN = 0
+    PAGE_INVALID = 1
+    PAGE_MISSING = 2
+    PAGE_EXISTS = 3
 
-    def __init__(self, site, title, follow_redirects=False):
+    def __init__(self, site, title, follow_redirects=False, pageid=None):
         """Constructor for new Page instances.
 
-        Takes three arguments: a Site object, the Page's title (or pagename),
-        and whether or not to follow redirects (optional, defaults to False).
+        Takes four arguments: a Site object, the Page's title (or pagename),
+        whether or not to follow redirects (optional, defaults to False), and
+        a page ID to supplement the title (optional, defaults to None - i.e.,
+        we will have to query the API to get it).
 
-        As with User, site.get_page() is preferred. Site's method has support
-        for a default *follow_redirects* value in our config, while __init__()
-        always defaults to False.
+        As with User, site.get_page() is preferred.
 
         __init__() will not do any API queries, but it will use basic namespace
         logic to determine our namespace ID and if we are a talkpage.
@@ -89,9 +92,9 @@ class Page(CopyrightMixin):
         self._site = site
         self._title = title.strip()
         self._follow_redirects = self._keep_following = follow_redirects
+        self._pageid = pageid
 
-        self._exists = 0
-        self._pageid = None
+        self._exists = self.PAGE_UNKNOWN
         self._is_redirect = None
         self._lastrevid = None
         self._protection = None
@@ -140,7 +143,7 @@ class Page(CopyrightMixin):
         Note that validity != existence. If a page's title is invalid (e.g, it
         contains "[") it will always be invalid, and cannot be edited.
         """
-        if self._exists == 1:
+        if self._exists == self.PAGE_INVALID:
             e = "Page '{0}' is invalid.".format(self._title)
             raise exceptions.InvalidPageError(e)
 
@@ -152,7 +155,7 @@ class Page(CopyrightMixin):
         It will also call _assert_validity() beforehand.
         """
         self._assert_validity()
-        if self._exists == 2:
+        if self._exists == self.PAGE_MISSING:
             e = "Page '{0}' does not exist.".format(self._title)
             raise exceptions.PageNotFoundError(e)
 
@@ -213,14 +216,14 @@ class Page(CopyrightMixin):
             if "missing" in res:
                 # If it has a negative ID and it's missing; we can still get
                 # data like the namespace, protection, and URL:
-                self._exists = 2
+                self._exists = self.PAGE_MISSING
             else:
                 # If it has a negative ID and it's invalid, then break here,
                 # because there's no other data for us to get:
-                self._exists = 1
+                self._exists = self.PAGE_INVALID
                 return
         else:
-            self._exists = 3
+            self._exists = self.PAGE_EXISTS
 
         self._fullurl = res["fullurl"]
         self._protection = res["protection"]
@@ -312,7 +315,7 @@ class Page(CopyrightMixin):
         if result["edit"]["result"] == "Success":
             self._content = None
             self._basetimestamp = None
-            self._exists = 0
+            self._exists = self.PAGE_UNKNOWN
             return
 
         # If we're here, then the edit failed. If it's because of AssertEdit,
@@ -346,7 +349,7 @@ class Page(CopyrightMixin):
             params["starttimestamp"] = self._starttimestamp
             if self._basetimestamp:
                 params["basetimestamp"] = self._basetimestamp
-            if self._exists == 2:
+            if self._exists == self.PAGE_MISSING:
                 # Page does not exist; don't edit if it already exists:
                 params["createonly"] = "true"
         else:
@@ -384,7 +387,7 @@ class Page(CopyrightMixin):
             # These attributes are now invalidated:
             self._content = None
             self._basetimestamp = None
-            self._exists = 0
+            self._exists = self.PAGE_UNKNOWN
             raise exceptions.EditConflictError(error.info)
 
         elif error.code in ["emptypage", "emptynewsection"]:
@@ -432,12 +435,12 @@ class Page(CopyrightMixin):
 
     @property
     def site(self):
-        """The Page's corresponding Site object."""
+        """The page's corresponding Site object."""
         return self._site
 
     @property
     def title(self):
-        """The Page's title, or "pagename".
+        """The page's title, or "pagename".
 
         This won't do any API queries on its own. Any other attributes or
         methods that do API queries will reload the title, however, like
@@ -448,37 +451,36 @@ class Page(CopyrightMixin):
 
     @property
     def exists(self):
-        """Information about whether the Page exists or not.
+        """Whether or not the page exists.
 
-        The "information" is a tuple with two items. The first is a bool,
-        either ``True`` if the page exists or ``False`` if it does not. The
-        second is a string giving more information, either ``"invalid"``,
-        (title is invalid, e.g. it contains ``"["``), ``"missing"``, or
-        ``"exists"``.
+        This will be a number; its value does not matter, but it will equal
+        one of :py:attr:`self.PAGE_INVALID <PAGE_INVALID>`,
+        :py:attr:`self.PAGE_MISSING <PAGE_MISSING>`, or
+        :py:attr:`self.PAGE_EXISTS <PAGE_EXISTS>`.
 
         Makes an API query only if we haven't already made one.
         """
-        cases = {
-            0: (None, "unknown"),
-            1: (False, "invalid"),
-            2: (False, "missing"),
-            3: (True, "exists"),
-        }
-        if self._exists == 0:
+        if self._exists == self.PAGE_UNKNOWN:
             self._load()
-        return cases[self._exists]
+        return self._exists
 
     @property
     def pageid(self):
-        """An integer ID representing the Page.
+        """An integer ID representing the page.
 
-        Makes an API query only if we haven't already made one.
+        Makes an API query only if we haven't already made one and the *pageid*
+        parameter to :py:meth:`__init__` was left as ``None``, which should be
+        true for all cases except when pages are returned by an SQL generator
+        (like :py:meth:`category.get_members(use_sql=True)
+        <earwigbot.wiki.category.Category.get_members>`).
 
         Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
         :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
         invalid or the page does not exist, respectively.
         """
-        if self._exists == 0:
+        if self._pageid:
+            return self._pageid
+        if self._exists == self.PAGE_UNKNOWN:
             self._load()
         self._assert_existence()  # Missing pages do not have IDs
         return self._pageid
@@ -518,7 +520,7 @@ class Page(CopyrightMixin):
         name is invalid. Won't raise an error if the page is missing because
         those can still be create-protected.
         """
-        if self._exists == 0:
+        if self._exists == self.PAGE_UNKNOWN:
             self._load()
         self._assert_validity()  # Invalid pages cannot be protected
         return self._protection
@@ -541,7 +543,7 @@ class Page(CopyrightMixin):
 
         We will return ``False`` even if the page does not exist or is invalid.
         """
-        if self._exists == 0:
+        if self._exists == self.PAGE_UNKNOWN:
             self._load()
         return self._is_redirect
 
@@ -606,7 +608,7 @@ class Page(CopyrightMixin):
         Raises InvalidPageError or PageNotFoundError if the page name is
         invalid or the page does not exist, respectively.
         """
-        if self._exists == 0:
+        if self._exists == self.PAGE_UNKNOWN:
             # Kill two birds with one stone by doing an API query for both our
             # attributes and our page content:
             query = self._site.api_query
@@ -621,7 +623,7 @@ class Page(CopyrightMixin):
             if self._keep_following and self._is_redirect:
                 self._title = self.get_redirect_target()
                 self._keep_following = False  # Don't follow double redirects
-                self._exists = 0  # Force another API query
+                self._exists = self.PAGE_UNKNOWN  # Force another API query
                 self.get()
 
             return self._content
@@ -645,9 +647,10 @@ class Page(CopyrightMixin):
         :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
         redirect.
         """
+        re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
         content = self.get()
         try:
-            return re.findall(self.re_redirect, content, flags=re.I)[0]
+            return re.findall(re_redirect, content, flags=re.I)[0]
         except IndexError:
             e = "The page does not appear to have a redirect target."
             raise exceptions.RedirectError(e)
@@ -666,7 +669,7 @@ class Page(CopyrightMixin):
         :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
         invalid or the page does not exist, respectively.
         """
-        if self._exists == 0:
+        if self._exists == self.PAGE_UNKNOWN:
             self._load()
         self._assert_existence()
         if not self._creator:
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 3c4babc..eb818c4 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -184,6 +184,12 @@ class Site(object):
         res = "<Site {0} ({1}:{2}) at {3}>"
         return res.format(self.name, self.project, self.lang, self.domain)
 
+    def _unicodeify(self, value, encoding="utf8"):
+        """Return input as unicode if it's not unicode to begin with."""
+        if isinstance(value, unicode):
+            return value
+        return unicode(value, encoding)
+
     def _urlencode_utf8(self, params):
         """Implement urllib.urlencode() with support for unicode input."""
         enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s)
@@ -682,7 +688,7 @@ class Site(object):
         e = "There is no namespace with name '{0}'.".format(name)
         raise exceptions.NamespaceNotFoundError(e)
 
-    def get_page(self, title, follow_redirects=False):
+    def get_page(self, title, follow_redirects=False, pageid=None):
         """Return a :py:class:`Page` object for the given title.
 
         *follow_redirects* is passed directly to
@@ -696,23 +702,26 @@ class Site(object):
         redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
         provide that.
         """
+        title = self._unicodeify(title)
         prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
         prefix = title.split(":", 1)[0]
         if prefix != title:  # Avoid a page that is simply "Category"
             if prefix in prefixes:
-                return Category(self, title, follow_redirects)
-        return Page(self, title, follow_redirects)
+                return Category(self, title, follow_redirects, pageid)
+        return Page(self, title, follow_redirects, pageid)
 
-    def get_category(self, catname, follow_redirects=False):
+    def get_category(self, catname, follow_redirects=False, pageid=None):
         """Return a :py:class:`Category` object for the given category name.
 
         *catname* should be given *without* a namespace prefix. This method is
         really just shorthand for :py:meth:`get_page("Category:" + catname)
         <get_page>`.
         """
+        catname = self._unicodeify(catname)
+        name = name if isinstance(name, unicode) else name.decode("utf8")
         prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
-        pagename = ':'.join((prefix, catname))
-        return Category(self, pagename, follow_redirects)
+        pagename = u':'.join((prefix, catname))
+        return Category(self, pagename, follow_redirects, pageid)
 
     def get_user(self, username=None):
         """Return a :py:class:`User` object for the given username.
@@ -721,6 +730,7 @@ class Site(object):
         :py:class:`~earwigbot.wiki.user.User` object representing the currently
         logged-in (or anonymous!) user is returned.
         """
+        username = self._unicodeify(username)
         if not username:
             username = self._get_username()
         return User(self, username)
diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py
index 619e6ad..9256a52 100644
--- a/earwigbot/wiki/user.py
+++ b/earwigbot/wiki/user.py
@@ -39,6 +39,7 @@ class User(object):
 
     *Attributes:*
 
+    - :py:attr:`site`:         the user's corresponding Site object
     - :py:attr:`name`:         the user's username
     - :py:attr:`exists`:       ``True`` if the user exists, else ``False``
     - :py:attr:`userid`:       an integer ID representing the user
@@ -155,6 +156,11 @@ class User(object):
         self._gender = res["gender"]
 
     @property
+    def site(self):
+        """The user's corresponding Site object."""
+        return self._site
+
+    @property
     def name(self):
         """The user's username.