From 6433957ae9917f3dc06e5022d8148d6bea6047d1 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 17:46:21 -0400
Subject: [PATCH 1/8] First stages of service delegation.

---
 earwigbot/commands/afc_status.py  |  7 ++---
 earwigbot/exceptions.py           | 25 +++++++++++-------
 earwigbot/tasks/afc_statistics.py |  2 +-
 earwigbot/wiki/category.py        | 55 +++++++++++++++++++++++++++++++++------
 earwigbot/wiki/page.py            | 34 ++++++++++++------------
 earwigbot/wiki/site.py            | 33 ++++++++++++++++++-----
 earwigbot/wiki/user.py            | 14 +++++-----
 7 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/earwigbot/commands/afc_status.py b/earwigbot/commands/afc_status.py
index 4f517d5..c03da85 100644
--- a/earwigbot/commands/afc_status.py
+++ b/earwigbot/commands/afc_status.py
@@ -109,12 +109,9 @@ class AFCStatus(Command):

     def count_submissions(self):
         """Returns the number of open AFC submissions (count of CAT:PEND)."""
-        cat = self.site.get_category("Pending AfC submissions")
-        subs = len(cat.get_members(use_sql=True))
-
-        # Remove [[Wikipedia:Articles for creation/Redirects]] and
+        # Subtract two for [[Wikipedia:Articles for creation/Redirects]] and
         # [[Wikipedia:Files for upload]], which aren't real submissions:
-        return subs - 2
+        return self.site.get_category("Pending AfC submissions").pages - 2

     def count_redirects(self):
         """Returns the number of open redirect submissions. Calculated as the

diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py
index b06354a..193c9ed 100644
--- a/earwigbot/exceptions.py
+++ b/earwigbot/exceptions.py
@@ -31,7 +31,9 @@ This module contains all exceptions used by EarwigBot::
 |    +-- BrokenSocketError
 +-- WikiToolsetError
      +-- SiteNotFoundError
-     +-- SiteAPIError
+     +-- NoServiceError
+     +-- APIError
+     +-- SQLError
      +-- LoginError
      +-- NamespaceNotFoundError
      +-- PageNotFoundError
@@ -45,7 +47,6 @@ This module contains all exceptions used by EarwigBot::
 |         +-- ContentTooBigError
 |         +-- SpamDetectedError
 |         +-- FilteredError
-     +-- SQLError
      +-- CopyvioCheckError
           +-- UnknownSearchEngineError
           +-- UnsupportedSearchEngineError
@@ -81,7 +82,13 @@ class SiteNotFoundError(WikiToolsetError):
     Raised by :py:class:`~earwigbot.wiki.sitesdb.SitesDB`.
     """

-class SiteAPIError(WikiToolsetError):
+class NoServiceError(WikiToolsetError):
+    """No service is functioning to handle a specific task.
+
+    Raised by :py:meth:`Site.delegate <earwigbot.wiki.site.Site.delegate>`.
+    """
+
+class APIError(WikiToolsetError):
     """Couldn't connect to a site's API.

     Perhaps the server doesn't exist, our URL is wrong or incomplete, or
@@ -90,6 +97,12 @@ class SiteAPIError(WikiToolsetError):
     Raised by :py:meth:`Site.api_query <earwigbot.wiki.site.Site.api_query>`.
     """

+class SQLError(WikiToolsetError):
+    """Some error involving SQL querying occurred.
+
+    Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
+    """
+
 class LoginError(WikiToolsetError):
     """An error occurred while trying to log in.

@@ -188,12 +201,6 @@ class FilteredError(EditError):
     :py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`.
     """

-class SQLError(WikiToolsetError):
-    """Some error involving SQL querying occurred.
-
-    Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
-    """
-
 class CopyvioCheckError(WikiToolsetError):
     """An error occurred when checking a page for copyright violations.
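For callers, the exception rework above mostly means catching
exceptions.APIError where exceptions.SiteAPIError was caught before, with
SQLError now a sibling of it under WikiToolsetError. A minimal sketch of the
pattern — safe_get() and its treatment of failures are made up for
illustration; only the exception names come from the diff::

    from earwigbot import exceptions

    def safe_get(site, title):
        # APIError is the old SiteAPIError under a new name. NoServiceError
        # (also new) is raised by Site.delegate() when no service works.
        try:
            return site.get_page(title).get()
        except exceptions.APIError:
            return None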
diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py
index 5b200db..69dd311 100644
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -663,7 +663,7 @@ class AFCStatistics(Task):
             return None, None, None
         try:
             content = self.get_revision_content(revid)
-        except exceptions.SiteAPIError:
+        except exceptions.APIError:
             msg = "API error interrupted SQL query in get_special() for page (id: {0}, chart: {1})"
             self.logger.exception(msg.format(pageid, chart))
             return None, None, None

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index 2df9a0e..c88a163 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -49,7 +49,7 @@ class Category(Page):

     def __str__(self):
         """Return a nice string representation of the Category."""
-        return '<Category "{0}" of {1}>'.format(self.title, str(self._site))
+        return '<Category "{0}" of {1}>'.format(self.title, str(self.site))

     def _get_members_via_sql(self, limit, follow):
         """Iterate over Pages in the category using SQL."""
@@ -60,32 +60,32 @@ class Category(Page):
         if limit:
             query += " LIMIT ?"
-            result = self._site.sql_query(query, (title, limit))
+            result = self.site.sql_query(query, (title, limit))
         else:
-            result = self._site.sql_query(query, (title,))
+            result = self.site.sql_query(query, (title,))

         members = list(result)
         for row in members:
             base = row[0].replace("_", " ").decode("utf8")
-            namespace = self._site.namespace_id_to_name(row[1])
+            namespace = self.site.namespace_id_to_name(row[1])
             if namespace:
                 title = u":".join((namespace, base))
             else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
                 title = base
-            yield self._site.get_page(title, follow_redirects=follow,
+            yield self.site.get_page(title, follow_redirects=follow,
                                       pageid=row[2])

     def _get_members_via_api(self, limit, follow):
         """Iterate over Pages in the category using the API."""
         params = {"action": "query", "list": "categorymembers",
-                  "cmtitle": self._title}
+                  "cmtitle": self.title}

         while 1:
             params["cmlimit"] = limit if limit else "max"
-            result = self._site.api_query(**params)
+            result = self.site.api_query(**params)
             for member in result["query"]["categorymembers"]:
                 title = member["title"]
-                yield self._site.get_page(title, follow_redirects=follow)
+                yield self.site.get_page(title, follow_redirects=follow)

             if "query-continue" in result:
                 qcontinue = result["query-continue"]["categorymembers"]
@@ -95,6 +95,45 @@ class Category(Page):
             else:
                 break

+    def _get_size_via_sql(self, member_type):
+        query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
+        title = self.title.replace(" ", "_").split(":", 1)[1]
+        if member_type == "size":
+            result = self.site.sql_query(query, (title,))
+        else:
+            query += " AND cl_type = ?"
+            result = self.site.sql_query(query, (title, member_type[:-1]))
+        return list(result)[0][0]
+
+    def _get_size_via_api(self, member_type):
+        result = self.site.api_query(action="query", prop="categoryinfo",
+                                     titles=self.title)
+        info = result["query"]["pages"].values()[0]["categoryinfo"]
+        return info[member_type]
+
+    def _get_size(self, member_type):
+        services = {
+            self.site.SERVICE_API: self._get_size_via_api,
+            self.site.SERVICE_SQL: self._get_size_via_sql
+        }
+        return self.site.delegate(services, (member_type,))
+
+    @property
+    def size(self):
+        return self._get_size("size")
+
+    @property
+    def pages(self):
+        return self._get_size("pages")
+
+    @property
+    def files(self):
+        return self._get_size("files")
+
+    @property
+    def subcats(self):
+        return self._get_size("subcats")
+
     def get_members(self, use_sql=False, limit=None, follow_redirects=None):
         """Iterate over Pages in the category.

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index d3b839d..bebd355 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -117,7 +117,7 @@ class Page(CopyrightMixIn):
         prefix = self._title.split(":", 1)[0]
         if prefix != title:  # ignore a page that's titled "Category" or "User"
             try:
-                self._namespace = self._site.namespace_name_to_id(prefix)
+                self._namespace = self.site.namespace_name_to_id(prefix)
             except exceptions.NamespaceNotFoundError:
                 self._namespace = 0
         else:
@@ -137,7 +137,7 @@ class Page(CopyrightMixIn):

     def __str__(self):
         """Return a nice string representation of the Page."""
-        return '<Page "{0}" of {1}>'.format(self.title, str(self._site))
+        return '<Page "{0}" of {1}>'.format(self.title, str(self.site))

     def _assert_validity(self):
         """Used to ensure that our page's title is valid.
@@ -199,7 +199,7 @@ class Page(CopyrightMixIn):
         Assuming the API is sound, this should not raise any exceptions.
         """
         if not result:
-            query = self._site.api_query
+            query = self.site.api_query
             result = query(action="query", rvprop="user", intoken="edit",
                            prop="info|revisions", rvlimit=1, rvdir="newer",
                            titles=self._title, inprop="protection|url")
@@ -263,7 +263,7 @@ class Page(CopyrightMixIn):
         want to force content reloading.
         """
         if not result:
-            query = self._site.api_query
+            query = self.site.api_query
             result = query(action="query", prop="revisions", rvlimit=1,
                            rvprop="content|timestamp", titles=self._title)
@@ -310,8 +310,8 @@ class Page(CopyrightMixIn):

         # Try the API query, catching most errors with our handler:
         try:
-            result = self._site.api_query(**params)
-        except exceptions.SiteAPIError as error:
+            result = self.site.api_query(**params)
+        except exceptions.APIError as error:
             if not hasattr(error, "code"):
                 raise  # We can only handle errors with a code attribute
             result = self._handle_edit_errors(error, params, tries)
@@ -375,12 +375,12 @@ class Page(CopyrightMixIn):

         elif error.code in ["noedit-anon", "cantcreate-anon",
                             "noimageredirect-anon"]:
-            if not all(self._site._login_info):
+            if not all(self.site._login_info):
                 # Insufficient login info:
                 raise exceptions.PermissionsError(error.info)
             if tries == 0:
                 # We have login info; try to login:
-                self._site._login(self._site._login_info)
+                self.site._login(self.site._login_info)
                 self._token = None  # Need a new token; old one is invalid now
                 return self._edit(params=params, tries=1)
             else:
@@ -416,13 +416,13 @@ class Page(CopyrightMixIn):
         log in. Otherwise, raise PermissionsError with details.
""" if assertion == "user": - if not all(self._site._login_info): + if not all(self.site._login_info): # Insufficient login info: e = "AssertEdit: user assertion failed, and no login info was provided." raise exceptions.PermissionsError(e) if tries == 0: # We have login info; try to login: - self._site._login(self._site._login_info) + self.site._login(self.site._login_info) self._token = None # Need a new token; old one is invalid now return self._edit(params=params, tries=1) else: @@ -502,8 +502,8 @@ class Page(CopyrightMixIn): return self._fullurl else: slug = quote(self._title.replace(" ", "_"), safe="/:") - path = self._site._article_path.replace("$1", slug) - return ''.join((self._site.url, path)) + path = self.site._article_path.replace("$1", slug) + return ''.join((self.site.url, path)) @property def namespace(self): @@ -580,7 +580,7 @@ class Page(CopyrightMixIn): otherwise missing or invalid. """ if self._namespace < 0: - ns = self._site.namespace_id_to_name(self._namespace) + ns = self.site.namespace_id_to_name(self._namespace) e = u"Pages in the {0} namespace can't have talk pages.".format(ns) raise exceptions.InvalidPageError(e) @@ -594,7 +594,7 @@ class Page(CopyrightMixIn): except IndexError: body = self._title - new_prefix = self._site.namespace_id_to_name(new_ns) + new_prefix = self.site.namespace_id_to_name(new_ns) # If the new page is in namespace 0, don't do ":Title" (it's correct, # but unnecessary), just do "Title": @@ -605,7 +605,7 @@ class Page(CopyrightMixIn): if follow_redirects is None: follow_redirects = self._follow_redirects - return Page(self._site, new_title, follow_redirects) + return Page(self.site, new_title, follow_redirects) def get(self): """Return page content, which is cached if you try to call get again. @@ -616,7 +616,7 @@ class Page(CopyrightMixIn): if self._exists == self.PAGE_UNKNOWN: # Kill two birds with one stone by doing an API query for both our # attributes and our page content: - query = self._site.api_query + query = self.site.api_query result = query(action="query", rvlimit=1, titles=self._title, prop="info|revisions", inprop="protection|url", intoken="edit", rvprop="content|timestamp") @@ -680,7 +680,7 @@ class Page(CopyrightMixIn): if not self._creator: self._load() self._assert_existence() - return self._site.get_user(self._creator) + return self.site.get_user(self._creator) def parse(self): """Parse the page content for templates, links, etc. diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 2f9bc2b..0a4f2e0 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -82,6 +82,8 @@ class Site(object): - :py:meth:`get_category`: returns a Category for the given title - :py:meth:`get_user`: returns a User object for the given name """ + SERVICE_API = 1 + SERVICE_SQL = 2 def __init__(self, name=None, project=None, lang=None, base_url=None, article_path=None, script_path=None, sql=None, @@ -228,7 +230,7 @@ class Site(object): e = e.format(error.code) else: e = "API query failed." - raise exceptions.SiteAPIError(e) + raise exceptions.APIError(e) result = response.read() if response.headers.get("Content-Encoding") == "gzip": @@ -242,7 +244,7 @@ class Site(object): """Given API query params, return the URL to query and POST data.""" if not self._base_url or self._script_path is None: e = "Tried to do an API query, but no API URL is known." 
-            raise exceptions.APIError(e)

         url = ''.join((self.url, self._script_path, "/api.php"))
         params["format"] = "json"  # This is the only format we understand
@@ -260,7 +262,7 @@ class Site(object):
             res = loads(result)  # Try to parse as a JSON object
         except ValueError:
             e = "API query failed: JSON could not be decoded."
-            raise exceptions.SiteAPIError(e)
+            raise exceptions.APIError(e)

         try:
             code = res["error"]["code"]
@@ -271,7 +273,7 @@ class Site(object):
         if code == "maxlag":  # We've been throttled by the server
             if tries >= self._max_retries:
                 e = "Maximum number of retries reached ({0})."
-                raise exceptions.SiteAPIError(e.format(self._max_retries))
+                raise exceptions.APIError(e.format(self._max_retries))
             tries += 1
             msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})'
             self._logger.info(msg.format(info, wait, tries, self._max_retries))
@@ -279,7 +281,7 @@ class Site(object):
             return self._api_query(params, tries=tries, wait=wait*2)
         else:  # Some unknown error occurred
             e = 'API query failed: got error "{0}"; server says: "{1}".'
-            error = exceptions.SiteAPIError(e.format(code, info))
+            error = exceptions.APIError(e.format(code, info))
             error.code, error.info = code, info
             raise error

@@ -522,6 +524,10 @@ class Site(object):

         self._sql_conn = oursql.connect(**args)

+    def _get_service_order(self):
+        """DOCSTRING """
+        return [self.SERVICE_SQL, self.SERVICE_API]
+
     @property
     def name(self):
         """The Site's name (or "wikiid" in the API), like ``"enwiki"``."""
@@ -559,7 +565,7 @@ class Site(object):
         This will first attempt to construct an API URL from
         :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need
         both of these, or else we'll raise
-        :py:exc:`~earwigbot.exceptions.SiteAPIError`. If
+        :py:exc:`~earwigbot.exceptions.APIError`. If
         :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki
         1.18), we'll choose HTTPS only if :py:attr:`self._user_https` is
         ``True``, otherwise HTTP.
@@ -578,7 +584,7 @@ class Site(object):
         load it as a JSON object, and return it.

         If our request failed for some reason, we'll raise
-        :py:exc:`~earwigbot.exceptions.SiteAPIError` with details. If that
+        :py:exc:`~earwigbot.exceptions.APIError` with details. If that
         reason was due to maxlag, we'll sleep for a bit and then repeat the
         query until we exceed :py:attr:`self._max_retries`.

@@ -739,3 +745,16 @@ class Site(object):
         else:
             username = self._get_username()
         return User(self, username)
+
+    def delegate(self, services, args=None, kwargs=None):
+        """ DOCSTRING"""
+        if not args:
+            args = ()
+        if not kwargs:
+            kwargs = {}
+
+        order = self._get_service_order()
+        for srv in order:
+            if srv in services:
+                return services[srv](*args, **kwargs)
+        raise exceptions.NoServiceError(services)

diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py
index 9762824..b71b502 100644
--- a/earwigbot/wiki/user.py
+++ b/earwigbot/wiki/user.py
@@ -82,7 +82,7 @@ class User(object):

     def __str__(self):
         """Return a nice string representation of the User."""
-        return '<User "{0}" of {1}>'.format(self._name, str(self._site))
+        return '<User "{0}" of {1}>'.format(self.name, str(self.site))

     def _get_attribute(self, attr):
         """Internally used to get an attribute by name.
@@ -107,8 +107,8 @@ class User(object):
         is not defined. This defines it.
""" props = "blockinfo|groups|rights|editcount|registration|emailable|gender" - result = self._site.api_query(action="query", list="users", - ususers=self._name, usprop=props) + result = self.site.api_query(action="query", list="users", + ususers=self._name, usprop=props) res = result["query"]["users"][0] # normalize our username in case it was entered oddly @@ -275,9 +275,9 @@ class User(object): No checks are made to see if it exists or not. Proper site namespace conventions are followed. """ - prefix = self._site.namespace_id_to_name(constants.NS_USER) + prefix = self.site.namespace_id_to_name(constants.NS_USER) pagename = ':'.join((prefix, self._name)) - return Page(self._site, pagename) + return Page(self.site, pagename) def get_talkpage(self): """Return a Page object representing the user's talkpage. @@ -285,6 +285,6 @@ class User(object): No checks are made to see if it exists or not. Proper site namespace conventions are followed. """ - prefix = self._site.namespace_id_to_name(constants.NS_USER_TALK) + prefix = self.site.namespace_id_to_name(constants.NS_USER_TALK) pagename = ':'.join((prefix, self._name)) - return Page(self._site, pagename) + return Page(self.site, pagename) From de34d8759b9c59d6b486eda7219ad1e52eafc6fb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:19:06 -0400 Subject: [PATCH 2/8] Site._get_service_order() --- earwigbot/wiki/site.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 0a4f2e0..86a990c 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -81,6 +81,7 @@ class Site(object): - :py:meth:`get_page`: returns a Page for the given title - :py:meth:`get_category`: returns a Category for the given title - :py:meth:`get_user`: returns a User object for the given name + - :py:meth:`delegate`: controls when the API or SQL is used """ SERVICE_API = 1 SERVICE_SQL = 2 @@ -131,6 +132,7 @@ class Site(object): self._sql_data = sql self._sql_conn = None self._sql_lock = Lock() + self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None} # Attribute used in copyright violation checks (see CopyrightMixIn): self._search_config = search_config @@ -525,7 +527,32 @@ class Site(object): self._sql_conn = oursql.connect(**args) def _get_service_order(self): - """DOCSTRING """ + """Return a preferred order for using services (e.g. the API and SQL). + + A list is returned, starting with the preferred service first and + ending with the least preferred one. Currently, there are only two + services. SERVICE_API will always be included since the API is expected + to be always usable. In normal circumstances, self.SERVICE_SQL will be + first (with the API second), since using SQL directly is easier on the + servers than making web queries with the API. self.SERVICE_SQL will be + second if replag is greater than ten minutes (a cached value updated + every five minutes at most). self.SERVICE_SQL will not be included in + the list if we cannot form a proper SQL connection. 
+ """ + if time() - self._sql_info_cache["lastcheck"] > 300: + self._sql_info_cache["lastcheck"] = time() + try: + self._sql_info_cache["replag"] = self.get_replag() + except exceptions.SQLError, oursql.Error: + self._sql_info_cache["usable"] = False + return [self.SERVICE_API] + self._sql_info_cache["usable"] = True + else: + if not self._sql_info_cache["usable"]: + return [self.SERVICE_API] + + if self._sql_info_cache["replag"] > 600: + return [self.SERVICE_API, self.SERVICE_SQL] return [self.SERVICE_SQL, self.SERVICE_API] @property @@ -747,7 +774,10 @@ class Site(object): return User(self, username) def delegate(self, services, args=None, kwargs=None): - """ DOCSTRING""" + """Delegate a task to either the API or SQL depending on conditions. + + *services* should be a dictionary of @@TODO + """ if not args: args = () if not kwargs: From 85ae7c5ce18a9780469467e104d00bdc452e9fe4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:38:14 -0400 Subject: [PATCH 3/8] Site.get_maxlag(); docstring for Site.delegate() --- earwigbot/wiki/site.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 86a990c..ae933a8 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -75,7 +75,8 @@ class Site(object): - :py:meth:`api_query`: does an API query with kwargs as params - :py:meth:`sql_query`: does an SQL query and yields its results - - :py:meth:`get_replag`: estimates the database replication lag + - :py:meth:`get_maxlag`: returns the internal database lag + - :py:meth:`get_replag`: estimates the external database lag - :py:meth:`namespace_id_to_name`: returns names associated with an NS id - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name - :py:meth:`get_page`: returns a Page for the given title @@ -668,8 +669,27 @@ class Site(object): for result in cur: yield result + def get_maxlag(self, showall=False): + """Return the internal database replication lag in seconds. + + In a typical setup, this function returns the replication lag *within* + the WMF's cluster, *not* external replication lag affecting the + Toolserver (see :py:meth:`get_replag` for that). This is useful when + combined with the ``maxlag`` API query param (added by config), in + which queries will be halted and retried if the lag is too high, + usually above five seconds. + """ + if showall: + result = self.api_query(action="query", meta="siteinfo", + siprop="dbrepllag", sishowalldb=1) + return [server["lag"] for server in result["query"]["dbrepllag"]] + else: + result = self.api_query(action="query", meta="siteinfo", + siprop="dbrepllag") + return result["query"]["dbrepllag"][0]["lag"] + def get_replag(self): - """Return the estimated database replication lag in seconds. + """Return the estimated external database replication lag in seconds. Requires SQL access. This function only makes sense on a replicated database (e.g. the Wikimedia Toolserver) and on a wiki that receives a @@ -776,7 +796,17 @@ class Site(object): def delegate(self, services, args=None, kwargs=None): """Delegate a task to either the API or SQL depending on conditions. - *services* should be a dictionary of @@TODO + *services* should be a dictionary in which the key is the service name + (:py:attr:`self.SERVICE_API ` or + :py:attr:`self.SERVICE_SQL `), and the value is the + function to call for this service. 
All functions will be passed the + same arguments the tuple *args* and the dict **kwargs**, which are both + empty by default. The service order is determined by + :py:meth:`_get_service_order`. + + Not every service needs an entry in the dictionary. Will raise + :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate + service cannot be found. """ if not args: args = () From 97a6bb2059f531126deb2b4f071b651f1bd32f92 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 18:45:20 -0400 Subject: [PATCH 4/8] 'ignore_maxlag' in Site.get_maxlag(). --- earwigbot/wiki/site.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index ae933a8..3312fd9 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -206,7 +206,7 @@ class Site(object): args.append(key + "=" + val) return "&".join(args) - def _api_query(self, params, tries=0, wait=5): + def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False): """Do an API query with *params* as a dict of parameters. See the documentation for :py:meth:`api_query` for full implementation @@ -220,7 +220,7 @@ class Site(object): sleep(wait_time) self._last_query_time = time() - url, data = self._build_api_query(params) + url, data = self._build_api_query(params, ignore_maxlag) self._logger.debug("{0} -> {1}".format(url, data)) try: @@ -243,7 +243,7 @@ class Site(object): return self._handle_api_query_result(result, params, tries, wait) - def _build_api_query(self, params): + def _build_api_query(self, params, ignore_maxlag): """Given API query params, return the URL to query and POST data.""" if not self._base_url or self._script_path is None: e = "Tried to do an API query, but no API URL is known." @@ -253,7 +253,8 @@ class Site(object): params["format"] = "json" # This is the only format we understand if self._assert_edit: # If requested, ensure that we're logged in params["assert"] = self._assert_edit - if self._maxlag: # If requested, don't overload the servers + if self._maxlag and not ignore_maxlag: + # If requested, don't overload the servers: params["maxlag"] = self._maxlag data = self._urlencode_utf8(params) @@ -678,15 +679,18 @@ class Site(object): combined with the ``maxlag`` API query param (added by config), in which queries will be halted and retried if the lag is too high, usually above five seconds. + + With *showall*, will return a list of the lag for all servers in the + cluster, not just the one with the highest lag. """ + params = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"} + if showall: + params["sishowalldb"] = 1 + with self._api_lock: + result = self._api_query(params, ignore_maxlag=True) if showall: - result = self.api_query(action="query", meta="siteinfo", - siprop="dbrepllag", sishowalldb=1) return [server["lag"] for server in result["query"]["dbrepllag"]] - else: - result = self.api_query(action="query", meta="siteinfo", - siprop="dbrepllag") - return result["query"]["dbrepllag"][0]["lag"] + return result["query"]["dbrepllag"][0]["lag"] def get_replag(self): """Return the estimated external database replication lag in seconds. From 7b001ed50dc622d03f1c3c67a70aad410c794dce Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 5 Jul 2012 19:12:11 -0400 Subject: [PATCH 5/8] Improved service ordering that takes API lag into account. 
---
 earwigbot/wiki/site.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 3312fd9..28508dd 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -128,6 +128,7 @@ class Site(object):
         self._max_retries = 6
         self._last_query_time = 0
         self._api_lock = Lock()
+        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

         # Attributes used for SQL queries:
         self._sql_data = sql
@@ -531,21 +532,23 @@ class Site(object):
     def _get_service_order(self):
         """Return a preferred order for using services (e.g. the API and SQL).

-        A list is returned, starting with the preferred service first and
+        A list is returned, starting with the most preferred service first and
         ending with the least preferred one. Currently, there are only two
         services. SERVICE_API will always be included since the API is expected
         to be always usable. In normal circumstances, self.SERVICE_SQL will be
         first (with the API second), since using SQL directly is easier on the
         servers than making web queries with the API. self.SERVICE_SQL will be
-        second if replag is greater than ten minutes (a cached value updated
-        every five minutes at most). self.SERVICE_SQL will not be included in
-        the list if we cannot form a proper SQL connection.
+        second if replag is greater than three minutes (a cached value updated
+        every two minutes at most), *unless* API lag is also very high.
+        self.SERVICE_SQL will not be included in the list if we cannot form a
+        proper SQL connection.
         """
-        if time() - self._sql_info_cache["lastcheck"] > 300:
-            self._sql_info_cache["lastcheck"] = time()
+        now = time()
+        if now - self._sql_info_cache["lastcheck"] > 120:
+            self._sql_info_cache["lastcheck"] = now
             try:
-                self._sql_info_cache["replag"] = self.get_replag()
-            except exceptions.SQLError, oursql.Error:
+                self._sql_info_cache["replag"] = sqllag = self.get_replag()
+            except (exceptions.SQLError, oursql.Error):
                 self._sql_info_cache["usable"] = False
                 return [self.SERVICE_API]
             self._sql_info_cache["usable"] = True
@@ -553,8 +556,19 @@ class Site(object):
             if not self._sql_info_cache["usable"]:
                 return [self.SERVICE_API]
+            sqllag = self._sql_info_cache["replag"]

-        if self._sql_info_cache["replag"] > 600:
+        if sqllag > 180:
+            if not self._maxlag:
+                return [self.SERVICE_API, self.SERVICE_SQL]
+            if now - self._api_info_cache["lastcheck"] > 120:
+                self._api_info_cache["lastcheck"] = now
+                try:
+                    self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
+                except exceptions.APIError:
+                    self._api_info_cache["maxlag"] = apilag = 0
+            else:
+                apilag = self._api_info_cache["maxlag"]
+            if sqllag / (180.0 / self._maxlag) < apilag:
+                return [self.SERVICE_SQL, self.SERVICE_API]
             return [self.SERVICE_API, self.SERVICE_SQL]
+
         return [self.SERVICE_SQL, self.SERVICE_API]

From e01ca0fd3149b82ffed1245f46b7e3d9704850de Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:31:49 -0400
Subject: [PATCH 6/8] Use service delegation for Category.get_members().
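The conversion below follows one pattern throughout; here is a hedged sketch
of that shape with made-up worker functions — only Site.delegate(),
SERVICE_API, and SERVICE_SQL come from the earlier patches::

    def _fetch_via_api(limit):
        return "via api, limit=%s" % limit  # real code would use site.api_query()

    def _fetch_via_sql(limit):
        return "via sql, limit=%s" % limit  # real code would use site.sql_query()

    def fetch(site, limit=None):
        services = {
            site.SERVICE_API: _fetch_via_api,
            site.SERVICE_SQL: _fetch_via_sql,
        }
        # delegate() calls the first implementation whose service appears in
        # _get_service_order(), raising NoServiceError if none matches:
        return site.delegate(services, (limit,))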
---
 earwigbot/wiki/category.py | 69 +++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index c88a163..687a7e7 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -51,6 +51,26 @@ class Category(Page):
         """Return a nice string representation of the Category."""
         return '<Category "{0}" of {1}>'.format(self.title, str(self.site))

+    def _get_members_via_api(self, limit, follow):
+        """Iterate over Pages in the category using the API."""
+        params = {"action": "query", "list": "categorymembers",
+                  "cmtitle": self.title}
+
+        while 1:
+            params["cmlimit"] = limit if limit else "max"
+            result = self.site.api_query(**params)
+            for member in result["query"]["categorymembers"]:
+                title = member["title"]
+                yield self.site.get_page(title, follow_redirects=follow)
+
+            if "query-continue" in result:
+                qcontinue = result["query-continue"]["categorymembers"]
+                params["cmcontinue"] = qcontinue["cmcontinue"]
+                if limit:
+                    limit -= len(result["query"]["categorymembers"])
+            else:
+                break
+
     def _get_members_via_sql(self, limit, follow):
         """Iterate over Pages in the category using SQL."""
         query = """SELECT page_title, page_namespace, page_id FROM page
@@ -75,27 +95,7 @@ class Category(Page):
             yield self.site.get_page(title, follow_redirects=follow,
                                      pageid=row[2])

-    def _get_members_via_api(self, limit, follow):
-        """Iterate over Pages in the category using the API."""
-        params = {"action": "query", "list": "categorymembers",
-                  "cmtitle": self.title}
-
-        while 1:
-            params["cmlimit"] = limit if limit else "max"
-            result = self.site.api_query(**params)
-            for member in result["query"]["categorymembers"]:
-                title = member["title"]
-                yield self.site.get_page(title, follow_redirects=follow)
-
-            if "query-continue" in result:
-                qcontinue = result["query-continue"]["categorymembers"]
-                params["cmcontinue"] = qcontinue["cmcontinue"]
-                if limit:
-                    limit -= len(result["query"]["categorymembers"])
-            else:
-                break
-
     def _get_size_via_sql(self, member_type):
         query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
         title = self.title.replace(" ", "_").split(":", 1)[1]
         if member_type == "size":
@@ -134,20 +134,20 @@ class Category(Page):
     def subcats(self):
         return self._get_size("subcats")

-    def get_members(self, use_sql=False, limit=None, follow_redirects=None):
+    def get_members(self, limit=None, follow_redirects=None):
         """Iterate over Pages in the category.

-        If *use_sql* is ``True``, we will use a SQL query instead of the API.
-        Note that pages are retrieved from the API in chunks (by default, in
-        500-page chunks for normal users and 5000-page chunks for bots and
-        admins), so queries may be made as we go along. If *limit* is given, we
-        will provide this many pages, or less if the category is smaller. By
-        default, *limit* is ``None``, meaning we will keep iterating over
-        members until the category is exhausted. *follow_redirects* is passed
-        directly to :py:meth:`site.get_page()
+        If *limit* is given, we will provide this many pages, or less if the
+        category is smaller. By default, *limit* is ``None``, meaning we will
+        keep iterating over members until the category is exhausted.
+        *follow_redirects* is passed directly to :py:meth:`site.get_page()
+        <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
         will use the value passed to our :py:meth:`__init__`.

+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each.
This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+
         .. note::
            Be careful when iterating over very large categories with no
            limit. If using the API, at best, you will make one query per
            5000 pages,
@@ -160,9 +160,10 @@ class Category(Page):
            thousand, in which case the sheer number of titles in memory
            becomes problematic.
         """
+        services = {
+            self.site.SERVICE_API: self._get_members_via_api,
+            self.site.SERVICE_SQL: self._get_members_via_sql
+        }
         if follow_redirects is None:
             follow_redirects = self._follow_redirects
-        if use_sql:
-            return self._get_members_via_sql(limit, follow_redirects)
-        else:
-            return self._get_members_via_api(limit, follow_redirects)
+        return self.site.delegate(services, (limit, follow_redirects))

From f1ac019b3eca25d5edf5f4fab4c28f4be00e9b50 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:40:11 -0400
Subject: [PATCH 7/8] Remove all use_sql from commands+tasks.

---
 earwigbot/commands/afc_submissions.py | 3 +--
 earwigbot/tasks/afc_history.py        | 2 +-
 earwigbot/tasks/afc_statistics.py     | 4 +---
 earwigbot/wiki/page.py                | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/earwigbot/commands/afc_submissions.py b/earwigbot/commands/afc_submissions.py
index 163c055..d20722e 100644
--- a/earwigbot/commands/afc_submissions.py
+++ b/earwigbot/commands/afc_submissions.py
@@ -55,8 +55,7 @@ class AFCSubmissions(Command):

         site = self.bot.wiki.get_site()
         category = site.get_category("Pending AfC submissions")
-        limit = number + len(self.ignore_list)
-        members = category.get_members(use_sql=True, limit=limit)
+        members = category.get_members(limit=number + len(self.ignore_list))
         urls = [member.url for member in members if member.title not in self.ignore_list]
         pages = ", ".join(urls[:number])
         self.reply(data, "{0} pending AfC subs: {1}".format(number, pages))

diff --git a/earwigbot/tasks/afc_history.py b/earwigbot/tasks/afc_history.py
index 87c8636..7a0582a 100644
--- a/earwigbot/tasks/afc_history.py
+++ b/earwigbot/tasks/afc_history.py
@@ -130,7 +130,7 @@ class AFCHistory(Task):
         q_delete = "DELETE FROM page WHERE page_id = ?"
         q_update = "UPDATE page SET page_date = ?, page_status = ? WHERE page_id = ?"
         q_insert = "INSERT INTO page VALUES (?, ?, ?)"
-        members = category.get_members(use_sql=True)
+        members = category.get_members()

         with self.conn.cursor() as cursor:
             for title, pageid in members:

diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py
index 69dd311..be6d95a 100644
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -271,9 +271,7 @@ class AFCStatistics(Task):
             tracked = [i[0] for i in cursor.fetchall()]

         category = self.site.get_category(self.pending_cat)
-        pending = category.get_members(use_sql=True)
-
-        for title, pageid in pending:
+        for title, pageid in category.get_members():
             if title in self.ignore_list:
                 continue
             if pageid not in tracked:

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index bebd355..d5a4cca 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -476,7 +476,7 @@ class Page(CopyrightMixIn):
         Makes an API query only if we haven't already made one and the *pageid*
         parameter to :py:meth:`__init__` was left as ``None``, which should be
         true for all cases except when pages are returned by an SQL generator
-        (like :py:meth:`category.get_members(use_sql=True)
+        (like :py:meth:`category.get_members()
         <earwigbot.wiki.category.Category.get_members>`).
         Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or

From 7005a9b21b2791ca22a933d05ff1596a3d18fd9c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 5 Jul 2012 19:59:01 -0400
Subject: [PATCH 8/8] Docstrings for Category

---
 earwigbot/wiki/category.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/earwigbot/wiki/category.py b/earwigbot/wiki/category.py
index 687a7e7..401047b 100644
--- a/earwigbot/wiki/category.py
+++ b/earwigbot/wiki/category.py
@@ -37,6 +37,13 @@ class Category(Page):
     the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category`
     is shorthand, accepting category names without the namespace prefix.

+    *Attributes:*
+
+    - :py:attr:`size`: the total number of members in the category
+    - :py:attr:`pages`: the number of pages in the category
+    - :py:attr:`files`: the number of files in the category
+    - :py:attr:`subcats`: the number of subcategories in the category
+
     *Public methods:*

     - :py:meth:`get_members`: iterates over Pages in the category
@@ -96,6 +103,7 @@ class Category(Page):
                                      pageid=row[2])

     def _get_size_via_sql(self, member_type):
+        """Return the size of the category using SQL."""
         query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
         title = self.title.replace(" ", "_").split(":", 1)[1]
         if member_type == "size":
@@ -106,12 +114,14 @@ class Category(Page):

     def _get_size_via_api(self, member_type):
+        """Return the size of the category using the API."""
         result = self.site.api_query(action="query", prop="categoryinfo",
                                      titles=self.title)
         info = result["query"]["pages"].values()[0]["categoryinfo"]
         return info[member_type]

     def _get_size(self, member_type):
+        """Return the size of the category."""
         services = {
             self.site.SERVICE_API: self._get_size_via_api,
             self.site.SERVICE_SQL: self._get_size_via_sql
         }
         return self.site.delegate(services, (member_type,))

     @property
     def size(self):
+        """The total number of members in the category.
+
+        Includes pages, files, and subcats. Equal to :py:attr:`pages` +
+        :py:attr:`files` + :py:attr:`subcats`. This will use either the API or
+        SQL depending on which are enabled and the amount of lag on each. This
+        is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("size")

     @property
     def pages(self):
+        """The number of pages in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("pages")

     @property
     def files(self):
+        """The number of files in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("files")

     @property
     def subcats(self):
+        """The number of subcategories in the category.
+
+        This will use either the API or SQL depending on which are enabled and
+        the amount of lag on each. This is handled by :py:meth:`site.delegate()
+        <earwigbot.wiki.site.Site.delegate>`.
+        """
         return self._get_size("subcats")

     def get_members(self, limit=None, follow_redirects=None):
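Taken together, the series leaves Category callers with code like the
following usage sketch. The Bot import follows EarwigBot's usual bootstrap,
but treat the config path as hypothetical; whether SQL or the API actually
serves each call is decided per-call by Site.delegate()::

    from earwigbot.bot import Bot

    bot = Bot("path/to/config")  # hypothetical config directory
    site = bot.wiki.get_site()
    cat = site.get_category("Pending AfC submissions")

    print(cat.pages)  # replaces len(cat.get_members(use_sql=True))
    print(cat.size)   # pages + files + subcats

    for page in cat.get_members(limit=50):  # no use_sql flag anymore
        print(page.title)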