Make cat.get_members() an iterator; make page.exists output nicer; cleanup

12 years ago · dcf912b65b
--- a/docs/toolset.rst
+++ b/docs/toolset.rst
@@ -97,11 +97,11 @@ and the following methods:
 - :py:meth:`namespace_name_to_id(name)
  <earwigbot.wiki.site.Site.namespace_name_to_id>`: given a namespace name,
  returns the associated namespace ID
 - :py:meth:`get_page(title, follow_redirects=False)
 - :py:meth:`get_page(title, follow_redirects=False, ...)
  <earwigbot.wiki.site.Site.get_page>`: returns a ``Page`` object for the given
  title (or a :py:class:`~earwigbot.wiki.category.Category` object if the
  page's namespace is "``Category:``")
 - :py:meth:`get_category(catname, follow_redirects=False)
 - :py:meth:`get_category(catname, follow_redirects=False, ...)
  <earwigbot.wiki.site.Site.get_category>`: returns a ``Category`` object for
  the given title (sans namespace)
 - :py:meth:`get_user(username) <earwigbot.wiki.site.Site.get_user>`: returns a
@@ -120,7 +120,7 @@ provide the following attributes:
 - :py:attr:`~earwigbot.wiki.page.Page.site`: the page's corresponding
  :py:class:`~earwigbot.wiki.site.Site` object
 - :py:attr:`~earwigbot.wiki.page.Page.title`: the page's title, or pagename
 - :py:attr:`~earwigbot.wiki.page.Page.exists`: whether the page exists
 - :py:attr:`~earwigbot.wiki.page.Page.exists`: whether or not the page exists
 - :py:attr:`~earwigbot.wiki.page.Page.pageid`: an integer ID representing the
  page
 - :py:attr:`~earwigbot.wiki.page.Page.url`: the page's URL
@@ -166,9 +166,10 @@ or :py:meth:`site.get_page(title) <earwigbot.wiki.site.Site.get_page>` where
 ``title`` is in the ``Category:`` namespace) provide the following additional
 method:

 - :py:meth:`get_members(use_sql=False, limit=None)
  <earwigbot.wiki.category.Category.get_members>`: returns a list of page
  titles in the category (limit is ``50`` by default if using the API)
 - :py:meth:`get_members(use_sql=False, limit=None, ...)
  <earwigbot.wiki.category.Category.get_members>`: iterates over
  :py:class:`~earwigbot.wiki.page.Page`\ s in the category, until either the
  category is exhausted or (if given) ``limit`` is reached

 Users
 ~~~~~
@@ -178,6 +179,8 @@ Create :py:class:`earwigbot.wiki.User <earwigbot.wiki.user.User>` objects with
 :py:meth:`page.get_creator() <earwigbot.wiki.page.Page.get_creator>`. They
 provide the following attributes:

 - :py:attr:`~earwigbot.wiki.user.User.site`: the user's corresponding
  :py:class:`~earwigbot.wiki.site.Site` object
 - :py:attr:`~earwigbot.wiki.user.User.name`: the user's username
 - :py:attr:`~earwigbot.wiki.user.User.exists`: ``True`` if the user exists, or
  ``False`` if they do not
--- a/earwigbot/commands/afc_pending.py
+++ b/earwigbot/commands/afc_pending.py
@@ -23,7 +23,7 @@
 from earwigbot.commands import BaseCommand

 class Command(BaseCommand):
    """Links the user to the pending AFC submissions page and category."""
    """Link the user to the pending AFC submissions page and category."""
    name = "pending"

    def check(self, data):
--- a/earwigbot/commands/afc_report.py
+++ b/earwigbot/commands/afc_report.py
@@ -70,7 +70,7 @@ class Command(BaseCommand):

    def get_page(self, title):
        page = self.site.get_page(title, follow_redirects=False)
        if page.exists[0]:
        if page.exists == page.PAGE_EXISTS:
            return page

    def report(self, page):
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -39,7 +39,7 @@ class Category(Page):

    *Public methods:*

    - :py:meth:`get_members`: returns a list of page titles in the category
    - :py:meth:`get_members`: iterates over Pages in the category
    """

    def __repr__(self):
@@ -51,8 +51,8 @@ class Category(Page):
        """Return a nice string representation of the Category."""
        return '<Category "{0}" of {1}>'.format(self.title, str(self._site))

    def _get_members_via_sql(self, limit):
        """Return a list of tuples of (title, pageid) in the category."""
    def _get_members_via_sql(self, limit, follow):
        """Iterate over Pages in the category using SQL."""
        query = """SELECT page_title, page_namespace, page_id FROM page
                   JOIN categorylinks ON page_id = cl_from
                   WHERE cl_to = ?"""
@@ -64,42 +64,66 @@ class Category(Page):
        else:
            result = self._site.sql_query(query, (title,))

        members = []
        for row in result:
        members = list(result)
        for row in members:
            base = row[0].replace("_", " ").decode("utf8")
            namespace = self._site.namespace_id_to_name(row[1])
            if namespace:
                title = u":".join((namespace, base))
            else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
                title = base
            members.append((title, row[2]))
        return members
            yield self._site.get_page(title, follow_redirects=follow,
                                      pageid=row[2])

    def _get_members_via_api(self, limit):
        """Return a list of page titles in the category using the API."""
    def _get_members_via_api(self, limit, follow):
        """Iterate over Pages in the category using the API."""
        params = {"action": "query", "list": "categorymembers",
                  "cmlimit": limit, "cmtitle": self._title}
        if not limit:
            params["cmlimit"] = 50  # Default value

        result = self._site.api_query(**params)
        members = result['query']['categorymembers']
        return [member["title"] for member in members]

    def get_members(self, use_sql=False, limit=None):
        """Return a list of page titles in the category.
                  "cmtitle": self._title}

        while 1:
            params["cmlimit"] = limit if limit else "max"
            result = self._site.api_query(**params)
            for member in result["query"]["categorymembers"]:
                title = member["title"]
                yield self._site.get_page(title, follow_redirects=follow)

            if "query-continue" in result:
                qcontinue = result["query-continue"]["categorymembers"]
                params["cmcontinue"] = qcontinue["cmcontinue"]
                if limit:
                    limit -= len(result["query"]["categorymembers"])
            else:
                break

    def get_members(self, use_sql=False, limit=None, follow_redirects=None):
        """Iterate over Pages in the category.

        If *use_sql* is ``True``, we will use a SQL query instead of the API.
        Pages will be returned as tuples of ``(title, pageid)`` instead of just
        titles.

        If *limit* is provided, we will provide this many titles, or less if
        the category is smaller. It defaults to 50 for API queries; normal
        users can go up to 500, and bots can go up to 5,000 on a single API
        query. If we're using SQL, the limit is ``None`` by default (returning
        all pages in the category), but an arbitrary limit can still be chosen.
        Note that pages are retrieved from the API in chunks (by default, in
        500-page chunks for normal users and 5000-page chunks for bots and
        admins), so queries may be made as we go along. If *limit* is given, we
        will provide this many pages, or less if the category is smaller. By
        default, *limit* is ``None``, meaning we will keep iterating over
        members until the category is exhausted. *follow_redirects* is passed
        directly to :py:meth:`site.get_page()
        <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
        will use the value passed to our :py:meth:`__init__`.

        .. note::
           Be careful when iterating over very large categories with no limit.
           If using the API, at best, you will make one query per 5000 pages,
           which can add up significantly for categories with hundreds of
           thousands of members. As for SQL, note that *all page titles are
           stored internally* as soon as the query is made, so the site-wide
           SQL lock can be freed and unrelated queries can be made without
           requiring a separate connection to be opened. This is generally not
           an issue unless your category's size approaches several hundred
           thousand, in which case the sheer number of titles in memory becomes
           problematic.
        """
        if follow_redirects is None:
            follow_redirects = self._follow_redirects
        if use_sql:
            return self._get_members_via_sql(limit)
            return self._get_members_via_sql(limit, follow_redirects)
        else:
            return self._get_members_via_api(limit)
            return self._get_members_via_api(limit, follow_redirects)
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -43,7 +43,7 @@ class Page(CopyrightMixin):

    - :py:attr:`site`:        the page's corresponding Site object
    - :py:attr:`title`:       the page's title, or pagename
    - :py:attr:`exists`:      whether the page exists
    - :py:attr:`exists`:      whether or not the page exists
    - :py:attr:`pageid`:      an integer ID representing the page
    - :py:attr:`url`:         the page's URL
    - :py:attr:`namespace`:   the page's namespace as an integer
@@ -70,17 +70,20 @@ class Page(CopyrightMixin):
      URL
    """

    re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
    PAGE_UNKNOWN = 0
    PAGE_INVALID = 1
    PAGE_MISSING = 2
    PAGE_EXISTS = 3

    def __init__(self, site, title, follow_redirects=False):
    def __init__(self, site, title, follow_redirects=False, pageid=None):
        """Constructor for new Page instances.

        Takes three arguments: a Site object, the Page's title (or pagename),
        and whether or not to follow redirects (optional, defaults to False).
        Takes four arguments: a Site object, the Page's title (or pagename),
        whether or not to follow redirects (optional, defaults to False), and
        a page ID to supplement the title (optional, defaults to None - i.e.,
        we will have to query the API to get it).

        As with User, site.get_page() is preferred. Site's method has support
        for a default *follow_redirects* value in our config, while __init__()
        always defaults to False.
        As with User, site.get_page() is preferred.

        __init__() will not do any API queries, but it will use basic namespace
        logic to determine our namespace ID and if we are a talkpage.
@@ -89,9 +92,9 @@ class Page(CopyrightMixin):
        self._site = site
        self._title = title.strip()
        self._follow_redirects = self._keep_following = follow_redirects
        self._pageid = pageid

        self._exists = 0
        self._pageid = None
        self._exists = self.PAGE_UNKNOWN
        self._is_redirect = None
        self._lastrevid = None
        self._protection = None
@@ -140,7 +143,7 @@ class Page(CopyrightMixin):
        Note that validity != existence. If a page's title is invalid (e.g, it
        contains "[") it will always be invalid, and cannot be edited.
        """
        if self._exists == 1:
        if self._exists == self.PAGE_INVALID:
            e = "Page '{0}' is invalid.".format(self._title)
            raise exceptions.InvalidPageError(e)

@@ -152,7 +155,7 @@ class Page(CopyrightMixin):
        It will also call _assert_validity() beforehand.
        """
        self._assert_validity()
        if self._exists == 2:
        if self._exists == self.PAGE_MISSING:
            e = "Page '{0}' does not exist.".format(self._title)
            raise exceptions.PageNotFoundError(e)

@@ -213,14 +216,14 @@ class Page(CopyrightMixin):
            if "missing" in res:
                # If it has a negative ID and it's missing; we can still get
                # data like the namespace, protection, and URL:
                self._exists = 2
                self._exists = self.PAGE_MISSING
            else:
                # If it has a negative ID and it's invalid, then break here,
                # because there's no other data for us to get:
                self._exists = 1
                self._exists = self.PAGE_INVALID
                return
        else:
            self._exists = 3
            self._exists = self.PAGE_EXISTS

        self._fullurl = res["fullurl"]
        self._protection = res["protection"]
@@ -312,7 +315,7 @@ class Page(CopyrightMixin):
        if result["edit"]["result"] == "Success":
            self._content = None
            self._basetimestamp = None
            self._exists = 0
            self._exists = self.PAGE_UNKNOWN
            return

        # If we're here, then the edit failed. If it's because of AssertEdit,
@@ -346,7 +349,7 @@ class Page(CopyrightMixin):
            params["starttimestamp"] = self._starttimestamp
            if self._basetimestamp:
                params["basetimestamp"] = self._basetimestamp
            if self._exists == 2:
            if self._exists == self.PAGE_MISSING:
                # Page does not exist; don't edit if it already exists:
                params["createonly"] = "true"
        else:
@@ -384,7 +387,7 @@ class Page(CopyrightMixin):
            # These attributes are now invalidated:
            self._content = None
            self._basetimestamp = None
            self._exists = 0
            self._exists = self.PAGE_UNKNOWN
            raise exceptions.EditConflictError(error.info)

        elif error.code in ["emptypage", "emptynewsection"]:
@@ -432,12 +435,12 @@ class Page(CopyrightMixin):

    @property
    def site(self):
        """The Page's corresponding Site object."""
        """The page's corresponding Site object."""
        return self._site

    @property
    def title(self):
        """The Page's title, or "pagename".
        """The page's title, or "pagename".

        This won't do any API queries on its own. Any other attributes or
        methods that do API queries will reload the title, however, like
@@ -448,37 +451,36 @@ class Page(CopyrightMixin):

    @property
    def exists(self):
        """Information about whether the Page exists or not.
        """Whether or not the page exists.

        The "information" is a tuple with two items. The first is a bool,
        either ``True`` if the page exists or ``False`` if it does not. The
        second is a string giving more information, either ``"invalid"``,
        (title is invalid, e.g. it contains ``"["``), ``"missing"``, or
        ``"exists"``.
        This will be a number; its value does not matter, but it will equal
        one of :py:attr:`self.PAGE_INVALID <PAGE_INVALID>`,
        :py:attr:`self.PAGE_MISSING <PAGE_MISSING>`, or
        :py:attr:`self.PAGE_EXISTS <PAGE_EXISTS>`.

        Makes an API query only if we haven't already made one.
        """
        cases = {
            0: (None, "unknown"),
            1: (False, "invalid"),
            2: (False, "missing"),
            3: (True, "exists"),
        }
        if self._exists == 0:
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        return cases[self._exists]
        return self._exists

    @property
    def pageid(self):
        """An integer ID representing the Page.
        """An integer ID representing the page.

        Makes an API query only if we haven't already made one.
        Makes an API query only if we haven't already made one and the *pageid*
        parameter to :py:meth:`__init__` was left as ``None``, which should be
        true for all cases except when pages are returned by an SQL generator
        (like :py:meth:`category.get_members(use_sql=True)
        <earwigbot.wiki.category.Category.get_members>`).

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == 0:
        if self._pageid:
            return self._pageid
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_existence()  # Missing pages do not have IDs
        return self._pageid
@@ -518,7 +520,7 @@ class Page(CopyrightMixin):
        name is invalid. Won't raise an error if the page is missing because
        those can still be create-protected.
        """
        if self._exists == 0:
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_validity()  # Invalid pages cannot be protected
        return self._protection
@@ -541,7 +543,7 @@ class Page(CopyrightMixin):

        We will return ``False`` even if the page does not exist or is invalid.
        """
        if self._exists == 0:
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        return self._is_redirect

@@ -606,7 +608,7 @@ class Page(CopyrightMixin):
        Raises InvalidPageError or PageNotFoundError if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == 0:
        if self._exists == self.PAGE_UNKNOWN:
            # Kill two birds with one stone by doing an API query for both our
            # attributes and our page content:
            query = self._site.api_query
@@ -621,7 +623,7 @@ class Page(CopyrightMixin):
            if self._keep_following and self._is_redirect:
                self._title = self.get_redirect_target()
                self._keep_following = False  # Don't follow double redirects
                self._exists = 0  # Force another API query
                self._exists = self.PAGE_UNKNOWN  # Force another API query
                self.get()

            return self._content
@@ -645,9 +647,10 @@ class Page(CopyrightMixin):
        :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
        redirect.
        """
        re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
        content = self.get()
        try:
            return re.findall(self.re_redirect, content, flags=re.I)[0]
            return re.findall(re_redirect, content, flags=re.I)[0]
        except IndexError:
            e = "The page does not appear to have a redirect target."
            raise exceptions.RedirectError(e)
@@ -666,7 +669,7 @@ class Page(CopyrightMixin):
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == 0:
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_existence()
        if not self._creator:
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -184,6 +184,12 @@ class Site(object):
        res = "<Site {0} ({1}:{2}) at {3}>"
        return res.format(self.name, self.project, self.lang, self.domain)

    def _unicodeify(self, value, encoding="utf8"):
        """Return input as unicode if it's not unicode to begin with."""
        if isinstance(value, unicode):
            return value
        return unicode(value, encoding)

    def _urlencode_utf8(self, params):
        """Implement urllib.urlencode() with support for unicode input."""
        enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s)
@@ -682,7 +688,7 @@ class Site(object):
        e = "There is no namespace with name '{0}'.".format(name)
        raise exceptions.NamespaceNotFoundError(e)

    def get_page(self, title, follow_redirects=False):
    def get_page(self, title, follow_redirects=False, pageid=None):
        """Return a :py:class:`Page` object for the given title.

        *follow_redirects* is passed directly to
@@ -696,23 +702,26 @@ class Site(object):
        redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
        provide that.
        """
        title = self._unicodeify(title)
        prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
        prefix = title.split(":", 1)[0]
        if prefix != title:  # Avoid a page that is simply "Category"
            if prefix in prefixes:
                return Category(self, title, follow_redirects)
        return Page(self, title, follow_redirects)
                return Category(self, title, follow_redirects, pageid)
        return Page(self, title, follow_redirects, pageid)

    def get_category(self, catname, follow_redirects=False):
    def get_category(self, catname, follow_redirects=False, pageid=None):
        """Return a :py:class:`Category` object for the given category name.

        *catname* should be given *without* a namespace prefix. This method is
        really just shorthand for :py:meth:`get_page("Category:" + catname)
        <get_page>`.
        """
        catname = self._unicodeify(catname)
        name = name if isinstance(name, unicode) else name.decode("utf8")
        prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
        pagename = ':'.join((prefix, catname))
        return Category(self, pagename, follow_redirects)
        pagename = u':'.join((prefix, catname))
        return Category(self, pagename, follow_redirects, pageid)

    def get_user(self, username=None):
        """Return a :py:class:`User` object for the given username.
@@ -721,6 +730,7 @@ class Site(object):
        :py:class:`~earwigbot.wiki.user.User` object representing the currently
        logged-in (or anonymous!) user is returned.
        """
        username = self._unicodeify(username)
        if not username:
            username = self._get_username()
        return User(self, username)
--- a/earwigbot/wiki/user.py
+++ b/earwigbot/wiki/user.py
@@ -39,6 +39,7 @@ class User(object):

    *Attributes:*

    - :py:attr:`site`:         the user's corresponding Site object
    - :py:attr:`name`:         the user's username
    - :py:attr:`exists`:       ``True`` if the user exists, else ``False``
    - :py:attr:`userid`:       an integer ID representing the user
@@ -155,6 +156,11 @@ class User(object):
        self._gender = res["gender"]

    @property
    def site(self):
        """The user's corresponding Site object."""
        return self._site

    @property
    def name(self):
        """The user's username.