From 6433957ae9917f3dc06e5022d8148d6bea6047d1 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 17:46:21 -0400
Subject: [PATCH 1/8] First stages of service delegation.

---
 earwigbot/commands/afc_status.py  |  7 ++---
 earwigbot/exceptions.py           | 25 +++++++++++-------
 earwigbot/tasks/afc_statistics.py |  2 +-
 earwigbot/wiki/category.py        | 55 +++++++++++++++++++++++++++++++++------
 earwigbot/wiki/page.py            | 34 ++++++++++++------------
 earwigbot/wiki/site.py            | 33 ++++++++++++++++++-----
 earwigbot/wiki/user.py            | 14 +++++-----
 7 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/earwigbot/commands/afc_status.py b/earwigbot/commands/afc_status.py
index 4f517d5..c03da85 100644
--- a/earwigbot/commands/afc_status.py
+++ b/earwigbot/commands/afc_status.py
@@ -109,12 +109,9 @@ class AFCStatus(Command):

     def count_submissions(self):
         """Returns the number of open AFC submissions (count of CAT:PEND)."""
-        cat = self.site.get_category("Pending AfC submissions")
-        subs = len(cat.get_members(use_sql=True))
-
-        # Remove [[Wikipedia:Articles for creation/Redirects]] and
+        # Subtract two for [[Wikipedia:Articles for creation/Redirects]] and
         # [[Wikipedia:Files for upload]], which aren't real submissions:
-        return subs - 2
+        return self.site.get_category("Pending AfC submissions").pages - 2

     def count_redirects(self):
         """Returns the number of open redirect submissions. Calculated as the

diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py
index b06354a..193c9ed 100644
--- a/earwigbot/exceptions.py
+++ b/earwigbot/exceptions.py
@@ -31,7 +31,9 @@ This module contains all exceptions used by EarwigBot::
 |    +-- BrokenSocketError
 +-- WikiToolsetError
      +-- SiteNotFoundError
-     +-- SiteAPIError
+     +-- NoServiceError
+     +-- APIError
+     +-- SQLError
      +-- LoginError
      +-- NamespaceNotFoundError
      +-- PageNotFoundError
@@ -45,7 +47,6 @@ This module contains all exceptions used by EarwigBot::
 |         +-- ContentTooBigError
 |         +-- SpamDetectedError
 |         +-- FilteredError
-     +-- SQLError
      +-- CopyvioCheckError
           +-- UnknownSearchEngineError
           +-- UnsupportedSearchEngineError
@@ -81,7 +82,13 @@ class SiteNotFoundError(WikiToolsetError):
     Raised by :py:class:`~earwigbot.wiki.sitesdb.SitesDB`.
     """

-class SiteAPIError(WikiToolsetError):
+class NoServiceError(WikiToolsetError):
+    """No service is functioning to handle a specific task.
+
+    Raised by :py:meth:`Site.delegate <earwigbot.wiki.site.Site.delegate>`.
+    """
+
+class APIError(WikiToolsetError):
     """Couldn't connect to a site's API.

     Perhaps the server doesn't exist, our URL is wrong or incomplete, or
@@ -90,6 +97,12 @@ class SiteAPIError(WikiToolsetError):
     Raised by :py:meth:`Site.api_query <earwigbot.wiki.site.Site.api_query>`.
     """

+class SQLError(WikiToolsetError):
+    """Some error involving SQL querying occurred.
+
+    Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
+    """
+
 class LoginError(WikiToolsetError):
     """An error occurred while trying to log in.

@@ -188,12 +201,6 @@ class FilteredError(EditError):
     :py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`.
     """

-class SQLError(WikiToolsetError):
-    """Some error involving SQL querying occurred.
-
-    Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
-    """
-
 class CopyvioCheckError(WikiToolsetError):
     """An error occurred when checking a page for copyright violations.
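For callers, the exception rework above mostly means catching
exceptions.APIError where exceptions.SiteAPIError was caught before, with
SQLError now a sibling of it under WikiToolsetError. A minimal sketch of the
pattern — safe_get() and its treatment of failures are made up for
illustration; only the exception names come from the diff::

    from earwigbot import exceptions

    def safe_get(site, title):
        # APIError is the old SiteAPIError under a new name. NoServiceError
        # (also new) is raised by Site.delegate() when no service works.
        try:
            return site.get_page(title).get()
        except exceptions.APIError:
            return None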
diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py
index 5b200db..69dd311 100644
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -663,7 +663,7 @@ class AFCStatistics(Task):
             return None, None, None
         try:
             content = self.get_revision_content(revid)
-        except exceptions.SiteAPIError:
+        except exceptions.APIError:
             msg = "API error interrupted SQL query in get_special() for page (id: {0}, chart: {1})"
             self.logger.exception(msg.format(pageid, chart))
             return None, None, None

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index 2df9a0e..c88a163 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -49,7 +49,7 @@ class Category(Page):

     def __str__(self):
         """Return a nice string representation of the Category."""
-        return '<Category "{0}" of {1}>'.format(self.title, str(self._site))
+        return '<Category "{0}" of {1}>'.format(self.title, str(self.site))

     def _get_members_via_sql(self, limit, follow):
         """Iterate over Pages in the category using SQL."""
@@ -60,32 +60,32 @@ class Category(Page):
         if limit:
             query += " LIMIT ?"
-            result = self._site.sql_query(query, (title, limit))
+            result = self.site.sql_query(query, (title, limit))
         else:
-            result = self._site.sql_query(query, (title,))
+            result = self.site.sql_query(query, (title,))

         members = list(result)
         for row in members:
             base = row[0].replace("_", " ").decode("utf8")
-            namespace = self._site.namespace_id_to_name(row[1])
+            namespace = self.site.namespace_id_to_name(row[1])
             if namespace:
                 title = u":".join((namespace, base))
             else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
                 title = base
-            yield self._site.get_page(title, follow_redirects=follow,
+            yield self.site.get_page(title, follow_redirects=follow,
                                       pageid=row[2])

     def _get_members_via_api(self, limit, follow):
         """Iterate over Pages in the category using the API."""
         params = {"action": "query", "list": "categorymembers",
-                  "cmtitle": self._title}
+                  "cmtitle": self.title}

         while 1:
             params["cmlimit"] = limit if limit else "max"
-            result = self._site.api_query(**params)
+            result = self.site.api_query(**params)
             for member in result["query"]["categorymembers"]:
                 title = member["title"]
-                yield self._site.get_page(title, follow_redirects=follow)
+                yield self.site.get_page(title, follow_redirects=follow)

             if "query-continue" in result:
                 qcontinue = result["query-continue"]["categorymembers"]
@@ -95,6 +95,45 @@ class Category(Page):
             else:
                 break

+    def _get_size_via_sql(self, member_type):
+        query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
+        title = self.title.replace(" ", "_").split(":", 1)[1]
+        if member_type == "size":
+            result = self.site.sql_query(query, (title,))
+        else:
+            query += " AND cl_type = ?"
+            result = self.site.sql_query(query, (title, member_type[:-1]))
+        return list(result)[0][0]
+
+    def _get_size_via_api(self, member_type):
+        result = self.site.api_query(action="query", prop="categoryinfo",
+                                     titles=self.title)
+        info = result["query"]["pages"].values()[0]["categoryinfo"]
+        return info[member_type]
+
+    def _get_size(self, member_type):
+        services = {
+            self.site.SERVICE_API: self._get_size_via_api,
+            self.site.SERVICE_SQL: self._get_size_via_sql
+        }
+        return self.site.delegate(services, (member_type,))
+
+    @property
+    def size(self):
+        return self._get_size("size")
+
+    @property
+    def pages(self):
+        return self._get_size("pages")
+
+    @property
+    def files(self):
+        return self._get_size("files")
+
+    @property
+    def subcats(self):
+        return self._get_size("subcats")
+
     def get_members(self, use_sql=False, limit=None, follow_redirects=None):
         """Iterate over Pages in the category.

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index d3b839d..bebd355 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -117,7 +117,7 @@ class Page(CopyrightMixIn):
         prefix = self._title.split(":", 1)[0]
         if prefix != title:  # ignore a page that's titled "Category" or "User"
             try:
-                self._namespace = self._site.namespace_name_to_id(prefix)
+                self._namespace = self.site.namespace_name_to_id(prefix)
             except exceptions.NamespaceNotFoundError:
                 self._namespace = 0
         else:
@@ -137,7 +137,7 @@ class Page(CopyrightMixIn):

     def __str__(self):
         """Return a nice string representation of the Page."""
-        return '<Page "{0}" of {1}>'.format(self.title, str(self._site))
+        return '<Page "{0}" of {1}>'.format(self.title, str(self.site))

     def _assert_validity(self):
         """Used to ensure that our page's title is valid.
@@ -199,7 +199,7 @@ class Page(CopyrightMixIn):
         Assuming the API is sound, this should not raise any exceptions.
         """
         if not result:
-            query = self._site.api_query
+            query = self.site.api_query
             result = query(action="query", rvprop="user", intoken="edit",
                            prop="info|revisions", rvlimit=1, rvdir="newer",
                            titles=self._title, inprop="protection|url")
@@ -263,7 +263,7 @@ class Page(CopyrightMixIn):
         want to force content reloading.
         """
         if not result:
-            query = self._site.api_query
+            query = self.site.api_query
             result = query(action="query", prop="revisions", rvlimit=1,
                            rvprop="content|timestamp", titles=self._title)
@@ -310,8 +310,8 @@ class Page(CopyrightMixIn):

         # Try the API query, catching most errors with our handler:
         try:
-            result = self._site.api_query(**params)
-        except exceptions.SiteAPIError as error:
+            result = self.site.api_query(**params)
+        except exceptions.APIError as error:
             if not hasattr(error, "code"):
                 raise  # We can only handle errors with a code attribute
             result = self._handle_edit_errors(error, params, tries)
@@ -375,12 +375,12 @@ class Page(CopyrightMixIn):

         elif error.code in ["noedit-anon", "cantcreate-anon",
                             "noimageredirect-anon"]:
-            if not all(self._site._login_info):
+            if not all(self.site._login_info):
                 # Insufficient login info:
                 raise exceptions.PermissionsError(error.info)
             if tries == 0:
                 # We have login info; try to login:
-                self._site._login(self._site._login_info)
+                self.site._login(self.site._login_info)
                 self._token = None  # Need a new token; old one is invalid now
                 return self._edit(params=params, tries=1)
             else:
@@ -416,13 +416,13 @@ class Page(CopyrightMixIn):
         log in. Otherwise, raise PermissionsError with details.
""" if assertion == "user": - if not all(self._site._login_info): + if not all(self.site._login_info): # Insufficient login info: e = "AssertEdit: user assertion failed, and no login info was provided." raise exceptions.PermissionsError(e) if tries == 0: # We have login info; try to login: - self._site._login(self._site._login_info) + self.site._login(self.site._login_info) self._token = None # Need a new token; old one is invalid now return self._edit(params=params, tries=1) else: @@ -502,8 +502,8 @@ class Page(CopyrightMixIn): return self._fullurl else: slug = quote(self._title.replace(" ", "_"), safe="/:") - path = self._site._article_path.replace("$1", slug) - return ''.join((self._site.url, path)) + path = self.site._article_path.replace("$1", slug) + return ''.join((self.site.url, path)) @property def namespace(self): @@ -580,7 +580,7 @@ class Page(CopyrightMixIn): otherwise missing or invalid. """ if self._namespace < 0: - ns = self._site.namespace_id_to_name(self._namespace) + ns = self.site.namespace_id_to_name(self._namespace) e = u"Pages in the {0} namespace can't have talk pages.".format(ns) raise exceptions.InvalidPageError(e) @@ -594,7 +594,7 @@ class Page(CopyrightMixIn): except IndexError: body = self._title - new_prefix = self._site.namespace_id_to_name(new_ns) + new_prefix = self.site.namespace_id_to_name(new_ns) # If the new page is in namespace 0, don't do ":Title" (it's correct, # but unnecessary), just do "Title": @@ -605,7 +605,7 @@ class Page(CopyrightMixIn): if follow_redirects is None: follow_redirects = self._follow_redirects - return Page(self._site, new_title, follow_redirects) + return Page(self.site, new_title, follow_redirects) def get(self): """Return page content, which is cached if you try to call get again. @@ -616,7 +616,7 @@ class Page(CopyrightMixIn): if self._exists == self.PAGE_UNKNOWN: # Kill two birds with one stone by doing an API query for both our # attributes and our page content: - query = self._site.api_query + query = self.site.api_query result = query(action="query", rvlimit=1, titles=self._title, prop="info|revisions", inprop="protection|url", intoken="edit", rvprop="content|timestamp") @@ -680,7 +680,7 @@ class Page(CopyrightMixIn): if not self._creator: self._load() self._assert_existence() - return self._site.get_user(self._creator) + return self.site.get_user(self._creator) def parse(self): """Parse the page content for templates, links, etc. diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 2f9bc2b..0a4f2e0 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -82,6 +82,8 @@ class Site(object): - :py:meth:`get_category`: returns a Category for the given title - :py:meth:`get_user`: returns a User object for the given name """ + SERVICE_API = 1 + SERVICE_SQL = 2 def __init__(self, name=None, project=None, lang=None, base_url=None, article_path=None, script_path=None, sql=None, @@ -228,7 +230,7 @@ class Site(object): e = e.format(error.code) else: e = "API query failed." - raise exceptions.SiteAPIError(e) + raise exceptions.APIError(e) result = response.read() if response.headers.get("Content-Encoding") == "gzip": @@ -242,7 +244,7 @@ class Site(object): """Given API query params, return the URL to query and POST data.""" if not self._base_url or self._script_path is None: e = "Tried to do an API query, but no API URL is known." 
-            raise exceptions.APIError(e)

         url = ''.join((self.url, self._script_path, "/api.php"))
         params["format"] = "json"  # This is the only format we understand
@@ -260,7 +262,7 @@ class Site(object):
             res = loads(result)  # Try to parse as a JSON object
         except ValueError:
             e = "API query failed: JSON could not be decoded."
-            raise exceptions.SiteAPIError(e)
+            raise exceptions.APIError(e)

         try:
             code = res["error"]["code"]
@@ -271,7 +273,7 @@ class Site(object):
         if code == "maxlag":  # We've been throttled by the server
             if tries >= self._max_retries:
                 e = "Maximum number of retries reached ({0})."
-                raise exceptions.SiteAPIError(e.format(self._max_retries))
+                raise exceptions.APIError(e.format(self._max_retries))
             tries += 1
             msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})'
             self._logger.info(msg.format(info, wait, tries, self._max_retries))
@@ -279,7 +281,7 @@ class Site(object):
             return self._api_query(params, tries=tries, wait=wait*2)
         else:  # Some unknown error occurred
             e = 'API query failed: got error "{0}"; server says: "{1}".'
-            error = exceptions.SiteAPIError(e.format(code, info))
+            error = exceptions.APIError(e.format(code, info))
             error.code, error.info = code, info
             raise error

@@ -522,6 +524,10 @@ class Site(object):

         self._sql_conn = oursql.connect(**args)

+    def _get_service_order(self):
+        """DOCSTRING """
+        return [self.SERVICE_SQL, self.SERVICE_API]
+
     @property
     def name(self):
         """The Site's name (or "wikiid" in the API), like ``"enwiki"``."""
@@ -559,7 +565,7 @@ class Site(object):
         This will first attempt to construct an API URL from
         :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need
         both of these, or else we'll raise
-        :py:exc:`~earwigbot.exceptions.SiteAPIError`. If
+        :py:exc:`~earwigbot.exceptions.APIError`. If
         :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki
         1.18), we'll choose HTTPS only if :py:attr:`self._user_https` is
         ``True``, otherwise HTTP.
@@ -578,7 +584,7 @@ class Site(object):
         load it as a JSON object, and return it.

         If our request failed for some reason, we'll raise
-        :py:exc:`~earwigbot.exceptions.SiteAPIError` with details. If that
+        :py:exc:`~earwigbot.exceptions.APIError` with details. If that
         reason was due to maxlag, we'll sleep for a bit and then repeat the
         query until we exceed :py:attr:`self._max_retries`.

@@ -739,3 +745,16 @@ class Site(object):
         else:
             username = self._get_username()
         return User(self, username)
+
+    def delegate(self, services, args=None, kwargs=None):
+        """ DOCSTRING"""
+        if not args:
+            args = ()
+        if not kwargs:
+            kwargs = {}
+
+        order = self._get_service_order()
+        for srv in order:
+            if srv in services:
+                return services[srv](*args, **kwargs)
+        raise exceptions.NoServiceError(services)

diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py
index 9762824..b71b502 100644
--- a/earwigbot/wiki/user.py
+++ b/earwigbot/wiki/user.py
@@ -82,7 +82,7 @@ class User(object):

     def __str__(self):
         """Return a nice string representation of the User."""
-        return '<User "{0}" of {1}>'.format(self._name, str(self._site))
+        return '<User "{0}" of {1}>'.format(self.name, str(self.site))

     def _get_attribute(self, attr):
         """Internally used to get an attribute by name.
@@ -107,8 +107,8 @@ class User(object):
         is not defined. This defines it.
""" props = "blockinfo|groups|rights|editcount|registration|emailable|gender" - result = self._site.api_query(action="query", list="users", - ususers=self._name, usprop=props) + result = self.site.api_query(action="query", list="users", + ususers=self._name, usprop=props) res = result["query"]["users"][0] # normalize our username in case it was entered oddly @@ -275,9 +275,9 @@ class User(object): No checks are made to see if it exists or not. Proper site namespace conventions are followed. """ - prefix = self._site.namespace_id_to_name(constants.NS_USER) + prefix = self.site.namespace_id_to_name(constants.NS_USER) pagename = ':'.join((prefix, self._name)) - return Page(self._site, pagename) + return Page(self.site, pagename) def get_talkpage(self): """Return a Page object representing the user's talkpage. @@ -285,6 +285,6 @@ class User(object): No checks are made to see if it exists or not. Proper site namespace conventions are followed. """ - prefix = self._site.namespace_id_to_name(constants.NS_USER_TALK) + prefix = self.site.namespace_id_to_name(constants.NS_USER_TALK) pagename = ':'.join((prefix, self._name)) - return Page(self._site, pagename) + return Page(self.site, pagename) From de34d8759b9c59d6b486eda7219ad1e52eafc6fb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:19:06 -0400 Subject: [PATCH 2/8] Site._get_service_order() --- earwigbot/wiki/site.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 0a4f2e0..86a990c 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -81,6 +81,7 @@ class Site(object): - :py:meth:`get_page`: returns a Page for the given title - :py:meth:`get_category`: returns a Category for the given title - :py:meth:`get_user`: returns a User object for the given name + - :py:meth:`delegate`: controls when the API or SQL is used """ SERVICE_API = 1 SERVICE_SQL = 2 @@ -131,6 +132,7 @@ class Site(object): self._sql_data = sql self._sql_conn = None self._sql_lock = Lock() + self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None} # Attribute used in copyright violation checks (see CopyrightMixIn): self._search_config = search_config @@ -525,7 +527,32 @@ class Site(object): self._sql_conn = oursql.connect(**args) def _get_service_order(self): - """DOCSTRING """ + """Return a preferred order for using services (e.g. the API and SQL). + + A list is returned, starting with the preferred service first and + ending with the least preferred one. Currently, there are only two + services. SERVICE_API will always be included since the API is expected + to be always usable. In normal circumstances, self.SERVICE_SQL will be + first (with the API second), since using SQL directly is easier on the + servers than making web queries with the API. self.SERVICE_SQL will be + second if replag is greater than ten minutes (a cached value updated + every five minutes at most). self.SERVICE_SQL will not be included in + the list if we cannot form a proper SQL connection. 
+ """ + if time() - self._sql_info_cache["lastcheck"] > 300: + self._sql_info_cache["lastcheck"] = time() + try: + self._sql_info_cache["replag"] = self.get_replag() + except exceptions.SQLError, oursql.Error: + self._sql_info_cache["usable"] = False + return [self.SERVICE_API] + self._sql_info_cache["usable"] = True + else: + if not self._sql_info_cache["usable"]: + return [self.SERVICE_API] + + if self._sql_info_cache["replag"] > 600: + return [self.SERVICE_API, self.SERVICE_SQL] return [self.SERVICE_SQL, self.SERVICE_API] @property @@ -747,7 +774,10 @@ class Site(object): return User(self, username) def delegate(self, services, args=None, kwargs=None): - """ DOCSTRING""" + """Delegate a task to either the API or SQL depending on conditions. + + *services* should be a dictionary of @@TODO + """ if not args: args = () if not kwargs: From 85ae7c5ce18a9780469467e104d00bdc452e9fe4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:38:14 -0400 Subject: [PATCH 3/8] Site.get_maxlag(); docstring for Site.delegate() --- earwigbot/wiki/site.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 86a990c..ae933a8 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -75,7 +75,8 @@ class Site(object): - :py:meth:`api_query`: does an API query with kwargs as params - :py:meth:`sql_query`: does an SQL query and yields its results - - :py:meth:`get_replag`: estimates the database replication lag + - :py:meth:`get_maxlag`: returns the internal database lag + - :py:meth:`get_replag`: estimates the external database lag - :py:meth:`namespace_id_to_name`: returns names associated with an NS id - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name - :py:meth:`get_page`: returns a Page for the given title @@ -668,8 +669,27 @@ class Site(object): for result in cur: yield result + def get_maxlag(self, showall=False): + """Return the internal database replication lag in seconds. + + In a typical setup, this function returns the replication lag *within* + the WMF's cluster, *not* external replication lag affecting the + Toolserver (see :py:meth:`get_replag` for that). This is useful when + combined with the ``maxlag`` API query param (added by config), in + which queries will be halted and retried if the lag is too high, + usually above five seconds. + """ + if showall: + result = self.api_query(action="query", meta="siteinfo", + siprop="dbrepllag", sishowalldb=1) + return [server["lag"] for server in result["query"]["dbrepllag"]] + else: + result = self.api_query(action="query", meta="siteinfo", + siprop="dbrepllag") + return result["query"]["dbrepllag"][0]["lag"] + def get_replag(self): - """Return the estimated database replication lag in seconds. + """Return the estimated external database replication lag in seconds. Requires SQL access. This function only makes sense on a replicated database (e.g. the Wikimedia Toolserver) and on a wiki that receives a @@ -776,7 +796,17 @@ class Site(object): def delegate(self, services, args=None, kwargs=None): """Delegate a task to either the API or SQL depending on conditions. - *services* should be a dictionary of @@TODO + *services* should be a dictionary in which the key is the service name + (:py:attr:`self.SERVICE_API ` or + :py:attr:`self.SERVICE_SQL `), and the value is the + function to call for this service. 
All functions will be passed the + same arguments the tuple *args* and the dict **kwargs**, which are both + empty by default. The service order is determined by + :py:meth:`_get_service_order`. + + Not every service needs an entry in the dictionary. Will raise + :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate + service cannot be found. """ if not args: args = () From 97a6bb2059f531126deb2b4f071b651f1bd32f92 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:45:20 -0400 Subject: [PATCH 4/8] 'ignore_maxlag' in Site.get_maxlag(). --- earwigbot/wiki/site.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index ae933a8..3312fd9 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -206,7 +206,7 @@ class Site(object): args.append(key + "=" + val) return "&".join(args) - def _api_query(self, params, tries=0, wait=5): + def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False): """Do an API query with *params* as a dict of parameters. See the documentation for :py:meth:`api_query` for full implementation @@ -220,7 +220,7 @@ class Site(object): sleep(wait_time) self._last_query_time = time() - url, data = self._build_api_query(params) + url, data = self._build_api_query(params, ignore_maxlag) self._logger.debug("{0} -> {1}".format(url, data)) try: @@ -243,7 +243,7 @@ class Site(object): return self._handle_api_query_result(result, params, tries, wait) - def _build_api_query(self, params): + def _build_api_query(self, params, ignore_maxlag): """Given API query params, return the URL to query and POST data.""" if not self._base_url or self._script_path is None: e = "Tried to do an API query, but no API URL is known." @@ -253,7 +253,8 @@ class Site(object): params["format"] = "json" # This is the only format we understand if self._assert_edit: # If requested, ensure that we're logged in params["assert"] = self._assert_edit - if self._maxlag: # If requested, don't overload the servers + if self._maxlag and not ignore_maxlag: + # If requested, don't overload the servers: params["maxlag"] = self._maxlag data = self._urlencode_utf8(params) @@ -678,15 +679,18 @@ class Site(object): combined with the ``maxlag`` API query param (added by config), in which queries will be halted and retried if the lag is too high, usually above five seconds. + + With *showall*, will return a list of the lag for all servers in the + cluster, not just the one with the highest lag. """ + params = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"} + if showall: + params["sishowalldb"] = 1 + with self._api_lock: + result = self._api_query(params, ignore_maxlag=True) if showall: - result = self.api_query(action="query", meta="siteinfo", - siprop="dbrepllag", sishowalldb=1) return [server["lag"] for server in result["query"]["dbrepllag"]] - else: - result = self.api_query(action="query", meta="siteinfo", - siprop="dbrepllag") - return result["query"]["dbrepllag"][0]["lag"] + return result["query"]["dbrepllag"][0]["lag"] def get_replag(self): """Return the estimated external database replication lag in seconds. From 7b001ed50dc622d03f1c3c67a70aad410c794dce Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 19:12:11 -0400 Subject: [PATCH 5/8] Improved service ordering that takes API lag into account. 
---
 earwigbot/wiki/site.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 3312fd9..28508dd 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -128,6 +128,7 @@ class Site(object):
         self._max_retries = 6
         self._last_query_time = 0
         self._api_lock = Lock()
+        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

         # Attributes used for SQL queries:
         self._sql_data = sql
@@ -531,21 +532,23 @@ class Site(object):
     def _get_service_order(self):
         """Return a preferred order for using services (e.g. the API and SQL).

-        A list is returned, starting with the preferred service first and
+        A list is returned, starting with the most preferred service first and
         ending with the least preferred one. Currently, there are only two
         services. SERVICE_API will always be included since the API is expected
         to be always usable. In normal circumstances, self.SERVICE_SQL will be
         first (with the API second), since using SQL directly is easier on the
         servers than making web queries with the API. self.SERVICE_SQL will be
-        second if replag is greater than ten minutes (a cached value updated
-        every five minutes at most). self.SERVICE_SQL will not be included in
-        the list if we cannot form a proper SQL connection.
+        second if replag is greater than three minutes (a cached value updated
+        every two minutes at most), *unless* API lag is also very high.
+        self.SERVICE_SQL will not be included in the list if we cannot form a
+        proper SQL connection.
         """
-        if time() - self._sql_info_cache["lastcheck"] > 300:
-            self._sql_info_cache["lastcheck"] = time()
+        now = time()
+        if now - self._sql_info_cache["lastcheck"] > 120:
+            self._sql_info_cache["lastcheck"] = now
             try:
-                self._sql_info_cache["replag"] = self.get_replag()
-            except exceptions.SQLError, oursql.Error:
+                self._sql_info_cache["replag"] = sqllag = self.get_replag()
+            except (exceptions.SQLError, oursql.Error):
                 self._sql_info_cache["usable"] = False
                 return [self.SERVICE_API]
             self._sql_info_cache["usable"] = True
@@ -553,8 +556,19 @@ class Site(object):
             if not self._sql_info_cache["usable"]:
                 return [self.SERVICE_API]
+            sqllag = self._sql_info_cache["replag"]

-        if self._sql_info_cache["replag"] > 600:
+        if sqllag > 180:
+            if not self._maxlag:
+                return [self.SERVICE_API, self.SERVICE_SQL]
+            if now - self._api_info_cache["lastcheck"] > 120:
+                self._api_info_cache["lastcheck"] = now
+                try:
+                    self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
+                except exceptions.APIError:
+                    self._api_info_cache["maxlag"] = apilag = 0
+            else:
+                apilag = self._api_info_cache["maxlag"]
+            if sqllag / (180.0 / self._maxlag) < apilag:
+                return [self.SERVICE_SQL, self.SERVICE_API]
             return [self.SERVICE_API, self.SERVICE_SQL]
+
         return [self.SERVICE_SQL, self.SERVICE_API]

From e01ca0fd3149b82ffed1245f46b7e3d9704850de Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:31:49 -0400
Subject: [PATCH 6/8] Use service delegation for Category.get_members().
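The conversion below follows one pattern throughout; here is a hedged sketch
of that shape with made-up worker functions — only Site.delegate(),
SERVICE_API, and SERVICE_SQL come from the earlier patches::

    def _fetch_via_api(limit):
        return "via api, limit=%s" % limit  # real code would use site.api_query()

    def _fetch_via_sql(limit):
        return "via sql, limit=%s" % limit  # real code would use site.sql_query()

    def fetch(site, limit=None):
        services = {
            site.SERVICE_API: _fetch_via_api,
            site.SERVICE_SQL: _fetch_via_sql,
        }
        # delegate() calls the first implementation whose service appears in
        # _get_service_order(), raising NoServiceError if none matches:
        return site.delegate(services, (limit,))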
---
 earwigbot/wiki/category.py | 69 +++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index c88a163..687a7e7 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -51,6 +51,26 @@ class Category(Page):
         """Return a nice string representation of the Category."""
         return '<Category "{0}" of {1}>'.format(self.title, str(self.site))

+    def _get_members_via_api(self, limit, follow):
+        """Iterate over Pages in the category using the API."""
+        params = {"action": "query", "list": "categorymembers",
+                  "cmtitle": self.title}
+
+        while 1:
+            params["cmlimit"] = limit if limit else "max"
+            result = self.site.api_query(**params)
+            for member in result["query"]["categorymembers"]:
+                title = member["title"]
+                yield self.site.get_page(title, follow_redirects=follow)
+
+            if "query-continue" in result:
+                qcontinue = result["query-continue"]["categorymembers"]
+                params["cmcontinue"] = qcontinue["cmcontinue"]
+                if limit:
+                    limit -= len(result["query"]["categorymembers"])
+            else:
+                break
+
     def _get_members_via_sql(self, limit, follow):
         """Iterate over Pages in the category using SQL."""
         query = """SELECT page_title, page_namespace, page_id FROM page
@@ -75,27 +95,7 @@ class Category(Page):
             yield self.site.get_page(title, follow_redirects=follow,
                                      pageid=row[2])

-    def _get_members_via_api(self, limit, follow):
-        """Iterate over Pages in the category using the API."""
-        params = {"action": "query", "list": "categorymembers",
-                  "cmtitle": self.title}
-
-        while 1:
-            params["cmlimit"] = limit if limit else "max"
-            result = self.site.api_query(**params)
-            for member in result["query"]["categorymembers"]:
-                title = member["title"]
-                yield self.site.get_page(title, follow_redirects=follow)
-
-            if "query-continue" in result:
-                qcontinue = result["query-continue"]["categorymembers"]
-                params["cmcontinue"] = qcontinue["cmcontinue"]
-                if limit:
-                    limit -= len(result["query"]["categorymembers"])
-            else:
-                break
-
     def _get_size_via_sql(self, member_type):
         query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
         title = self.title.replace(" ", "_").split(":", 1)[1]
         if member_type == "size":
@@ -134,20 +134,20 @@ class Category(Page):
     def subcats(self):
         return self._get_size("subcats")

-    def get_members(self, use_sql=False, limit=None, follow_redirects=None):
+    def get_members(self, limit=None, follow_redirects=None):
         """Iterate over Pages in the category.

-        If *use_sql* is ``True``, we will use a SQL query instead of the API.
-        Note that pages are retrieved from the API in chunks (by default, in
-        500-page chunks for normal users and 5000-page chunks for bots and
-        admins), so queries may be made as we go along. If *limit* is given, we
-        will provide this many pages, or less if the category is smaller. By
-        default, *limit* is ``None``, meaning we will keep iterating over
-        members until the category is exhausted. *follow_redirects* is passed
-        directly to :py:meth:`site.get_page()
+        If *limit* is given, we will provide this many pages, or less if the
+        category is smaller. By default, *limit* is ``None``, meaning we will
+        keep iterating over members until the category is exhausted.
+        *follow_redirects* is passed directly to :py:meth:`site.get_page()
+        <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
         will use the value passed to our :py:meth:`__init__`.

+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each.
This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+
         .. note::
            Be careful when iterating over very large categories with no
            limit. If using the API, at best, you will make one query per
            5000 pages,
@@ -160,9 +160,10 @@ class Category(Page):
            thousand, in which case the sheer number of titles in memory
            becomes problematic.
         """
+        services = {
+            self.site.SERVICE_API: self._get_members_via_api,
+            self.site.SERVICE_SQL: self._get_members_via_sql
+        }
         if follow_redirects is None:
             follow_redirects = self._follow_redirects
-        if use_sql:
-            return self._get_members_via_sql(limit, follow_redirects)
-        else:
-            return self._get_members_via_api(limit, follow_redirects)
+        return self.site.delegate(services, (limit, follow_redirects))

From f1ac019b3eca25d5edf5f4fab4c28f4be00e9b50 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:40:11 -0400
Subject: [PATCH 7/8] Remove all use_sql from commands+tasks.

---
 earwigbot/commands/afc_submissions.py | 3 +--
 earwigbot/tasks/afc_history.py        | 2 +-
 earwigbot/tasks/afc_statistics.py     | 4 +---
 earwigbot/wiki/page.py                | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/earwigbot/commands/afc_submissions.py b/earwigbot/commands/afc_submissions.py
index 163c055..d20722e 100644
--- a/earwigbot/commands/afc_submissions.py
+++ b/earwigbot/commands/afc_submissions.py
@@ -55,8 +55,7 @@ class AFCSubmissions(Command):

         site = self.bot.wiki.get_site()
         category = site.get_category("Pending AfC submissions")
-        limit = number + len(self.ignore_list)
-        members = category.get_members(use_sql=True, limit=limit)
+        members = category.get_members(limit=number + len(self.ignore_list))
         urls = [member.url for member in members if member.title not in self.ignore_list]
         pages = ", ".join(urls[:number])
         self.reply(data, "{0} pending AfC subs: {1}".format(number, pages))

diff --git a/earwigbot/tasks/afc_history.py b/earwigbot/tasks/afc_history.py
index 87c8636..7a0582a 100644
--- a/earwigbot/tasks/afc_history.py
+++ b/earwigbot/tasks/afc_history.py
@@ -130,7 +130,7 @@ class AFCHistory(Task):
         q_delete = "DELETE FROM page WHERE page_id = ?"
         q_update = "UPDATE page SET page_date = ?, page_status = ? WHERE page_id = ?"
         q_insert = "INSERT INTO page VALUES (?, ?, ?)"
-        members = category.get_members(use_sql=True)
+        members = category.get_members()

         with self.conn.cursor() as cursor:
             for title, pageid in members:

diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py
index 69dd311..be6d95a 100644
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -271,9 +271,7 @@ class AFCStatistics(Task):
             tracked = [i[0] for i in cursor.fetchall()]

         category = self.site.get_category(self.pending_cat)
-        pending = category.get_members(use_sql=True)
-
-        for title, pageid in pending:
+        for title, pageid in category.get_members():
             if title in self.ignore_list:
                 continue
             if pageid not in tracked:

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index bebd355..d5a4cca 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -476,7 +476,7 @@ class Page(CopyrightMixIn):
         Makes an API query only if we haven't already made one and the *pageid*
         parameter to :py:meth:`__init__` was left as ``None``, which should be
         true for all cases except when pages are returned by an SQL generator
-        (like :py:meth:`category.get_members(use_sql=True)
+        (like :py:meth:`category.get_members()
         <earwigbot.wiki.category.Category.get_members>`).
         Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or

From 7005a9b21b2791ca22a933d05ff1596a3d18fd9c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:59:01 -0400
Subject: [PATCH 8/8] Docstrings for Category

---
 earwigbot/wiki/category.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index 687a7e7..401047b 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -37,6 +37,13 @@ class Category(Page):
     the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category`
     is shorthand, accepting category names without the namespace prefix.

+    *Attributes:*
+
+    - :py:attr:`size`: the total number of members in the category
+    - :py:attr:`pages`: the number of pages in the category
+    - :py:attr:`files`: the number of files in the category
+    - :py:attr:`subcats`: the number of subcategories in the category
+
     *Public methods:*

     - :py:meth:`get_members`: iterates over Pages in the category
@@ -96,6 +103,7 @@ class Category(Page):
                                      pageid=row[2])

     def _get_size_via_sql(self, member_type):
+        """Return the size of the category using SQL."""
         query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
         title = self.title.replace(" ", "_").split(":", 1)[1]
         if member_type == "size":
@@ -106,12 +114,14 @@ class Category(Page):

     def _get_size_via_api(self, member_type):
+        """Return the size of the category using the API."""
         result = self.site.api_query(action="query", prop="categoryinfo",
                                      titles=self.title)
         info = result["query"]["pages"].values()[0]["categoryinfo"]
         return info[member_type]

     def _get_size(self, member_type):
+        """Return the size of the category."""
         services = {
             self.site.SERVICE_API: self._get_size_via_api,
             self.site.SERVICE_SQL: self._get_size_via_sql
         }
         return self.site.delegate(services, (member_type,))

     @property
     def size(self):
+        """The total number of members in the category.
+
+        Includes pages, files, and subcats. Equal to :py:attr:`pages` +
+        :py:attr:`files` + :py:attr:`subcats`. This will use either the API or
+        SQL depending on which are enabled and the amount of lag on each. This
+        is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("size")

     @property
     def pages(self):
+        """The number of pages in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("pages")

     @property
     def files(self):
+        """The number of files in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("files")

     @property
     def subcats(self):
+        """The number of subcategories in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("subcats")

     def get_members(self, limit=None, follow_redirects=None):
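Taken together, the series leaves Category callers with code like the
following usage sketch. The Bot import follows EarwigBot's usual bootstrap,
but treat the config path as hypothetical; whether SQL or the API actually
serves each call is decided per-call by Site.delegate()::

    from earwigbot.bot import Bot

    bot = Bot("path/to/config")  # hypothetical config directory
    site = bot.wiki.get_site()
    cat = site.get_category("Pending AfC submissions")

    print(cat.pages)  # replaces len(cat.get_members(use_sql=True))
    print(cat.size)   # pages + files + subcats

    for page in cat.get_members(limit=50):  # no use_sql flag anymore
        print(page.title)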