@@ -109,12 +109,9 @@ class AFCStatus(Command): | |||||
def count_submissions(self): | def count_submissions(self): | ||||
"""Returns the number of open AFC submissions (count of CAT:PEND).""" | """Returns the number of open AFC submissions (count of CAT:PEND).""" | ||||
cat = self.site.get_category("Pending AfC submissions") | |||||
subs = len(cat.get_members(use_sql=True)) | |||||
# Remove [[Wikipedia:Articles for creation/Redirects]] and | |||||
# Subtract two for [[Wikipedia:Articles for creation/Redirects]] and | |||||
# [[Wikipedia:Files for upload]], which aren't real submissions: | # [[Wikipedia:Files for upload]], which aren't real submissions: | ||||
return subs - 2 | |||||
return self.site.get_category("Pending AfC submissions").pages - 2 | |||||
def count_redirects(self): | def count_redirects(self): | ||||
"""Returns the number of open redirect submissions. Calculated as the | """Returns the number of open redirect submissions. Calculated as the | ||||
@@ -55,8 +55,7 @@ class AFCSubmissions(Command): | |||||
site = self.bot.wiki.get_site() | site = self.bot.wiki.get_site() | ||||
category = site.get_category("Pending AfC submissions") | category = site.get_category("Pending AfC submissions") | ||||
limit = number + len(self.ignore_list) | |||||
members = category.get_members(use_sql=True, limit=limit) | |||||
members = category.get_members(limit=number + len(self.ignore_list)) | |||||
urls = [member.url for member in members if member.title not in self.ignore_list] | urls = [member.url for member in members if member.title not in self.ignore_list] | ||||
pages = ", ".join(urls[:number]) | pages = ", ".join(urls[:number]) | ||||
self.reply(data, "{0} pending AfC subs: {1}".format(number, pages)) | self.reply(data, "{0} pending AfC subs: {1}".format(number, pages)) |
@@ -31,7 +31,9 @@ This module contains all exceptions used by EarwigBot:: | |||||
| +-- BrokenSocketError | | +-- BrokenSocketError | ||||
+-- WikiToolsetError | +-- WikiToolsetError | ||||
+-- SiteNotFoundError | +-- SiteNotFoundError | ||||
+-- SiteAPIError | |||||
+-- NoServiceError | |||||
+-- APIError | |||||
+-- SQLError | |||||
+-- LoginError | +-- LoginError | ||||
+-- NamespaceNotFoundError | +-- NamespaceNotFoundError | ||||
+-- PageNotFoundError | +-- PageNotFoundError | ||||
@@ -45,7 +47,6 @@ This module contains all exceptions used by EarwigBot:: | |||||
| +-- ContentTooBigError | | +-- ContentTooBigError | ||||
| +-- SpamDetectedError | | +-- SpamDetectedError | ||||
| +-- FilteredError | | +-- FilteredError | ||||
+-- SQLError | |||||
+-- CopyvioCheckError | +-- CopyvioCheckError | ||||
+-- UnknownSearchEngineError | +-- UnknownSearchEngineError | ||||
+-- UnsupportedSearchEngineError | +-- UnsupportedSearchEngineError | ||||
@@ -81,7 +82,13 @@ class SiteNotFoundError(WikiToolsetError): | |||||
Raised by :py:class:`~earwigbot.wiki.sitesdb.SitesDB`. | Raised by :py:class:`~earwigbot.wiki.sitesdb.SitesDB`. | ||||
""" | """ | ||||
class SiteAPIError(WikiToolsetError): | |||||
class NoServiceError(WikiToolsetError): | |||||
"""No service is functioning to handle a specific task. | |||||
Raised by :py:meth:`Site.delegate <earwigbot.wiki.site.Site.delegate>`. | |||||
""" | |||||
class APIError(WikiToolsetError): | |||||
"""Couldn't connect to a site's API. | """Couldn't connect to a site's API. | ||||
Perhaps the server doesn't exist, our URL is wrong or incomplete, or | Perhaps the server doesn't exist, our URL is wrong or incomplete, or | ||||
@@ -90,6 +97,12 @@ class SiteAPIError(WikiToolsetError): | |||||
Raised by :py:meth:`Site.api_query <earwigbot.wiki.site.Site.api_query>`. | Raised by :py:meth:`Site.api_query <earwigbot.wiki.site.Site.api_query>`. | ||||
""" | """ | ||||
class SQLError(WikiToolsetError): | |||||
"""Some error involving SQL querying occurred. | |||||
Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`. | |||||
""" | |||||
class LoginError(WikiToolsetError): | class LoginError(WikiToolsetError): | ||||
"""An error occured while trying to login. | """An error occured while trying to login. | ||||
@@ -188,12 +201,6 @@ class FilteredError(EditError): | |||||
:py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`. | :py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`. | ||||
""" | """ | ||||
class SQLError(WikiToolsetError): | |||||
"""Some error involving SQL querying occurred. | |||||
Raised by :py:meth:`Site.sql_query <earwigbot.wiki.site.Site.sql_query>`. | |||||
""" | |||||
class CopyvioCheckError(WikiToolsetError): | class CopyvioCheckError(WikiToolsetError): | ||||
"""An error occured when checking a page for copyright violations. | """An error occured when checking a page for copyright violations. | ||||
@@ -130,7 +130,7 @@ class AFCHistory(Task): | |||||
q_delete = "DELETE FROM page WHERE page_id = ?" | q_delete = "DELETE FROM page WHERE page_id = ?" | ||||
q_update = "UPDATE page SET page_date = ?, page_status = ? WHERE page_id = ?" | q_update = "UPDATE page SET page_date = ?, page_status = ? WHERE page_id = ?" | ||||
q_insert = "INSERT INTO page VALUES (?, ?, ?)" | q_insert = "INSERT INTO page VALUES (?, ?, ?)" | ||||
members = category.get_members(use_sql=True) | |||||
members = category.get_members() | |||||
with self.conn.cursor() as cursor: | with self.conn.cursor() as cursor: | ||||
for title, pageid in members: | for title, pageid in members: | ||||
@@ -271,9 +271,7 @@ class AFCStatistics(Task): | |||||
tracked = [i[0] for i in cursor.fetchall()] | tracked = [i[0] for i in cursor.fetchall()] | ||||
category = self.site.get_category(self.pending_cat) | category = self.site.get_category(self.pending_cat) | ||||
pending = category.get_members(use_sql=True) | |||||
for title, pageid in pending: | |||||
for title, pageid in category.get_members(): | |||||
if title in self.ignore_list: | if title in self.ignore_list: | ||||
continue | continue | ||||
if pageid not in tracked: | if pageid not in tracked: | ||||
@@ -663,7 +661,7 @@ class AFCStatistics(Task): | |||||
return None, None, None | return None, None, None | ||||
try: | try: | ||||
content = self.get_revision_content(revid) | content = self.get_revision_content(revid) | ||||
except exceptions.SiteAPIError: | |||||
except exceptions.APIError: | |||||
msg = "API error interrupted SQL query in get_special() for page (id: {0}, chart: {1})" | msg = "API error interrupted SQL query in get_special() for page (id: {0}, chart: {1})" | ||||
self.logger.exception(msg.format(pageid, chart)) | self.logger.exception(msg.format(pageid, chart)) | ||||
return None, None, None | return None, None, None | ||||
@@ -37,6 +37,13 @@ class Category(Page): | |||||
the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category` | the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category` | ||||
is shorthand, accepting category names without the namespace prefix. | is shorthand, accepting category names without the namespace prefix. | ||||
*Attributes:* | |||||
- :py:attr:`size`: the total number of members in the category | |||||
- :py:attr:`pages`: the number of pages in the category | |||||
- :py:attr:`files`: the number of files in the category | |||||
- :py:attr:`subcats`: the number of subcategories in the category | |||||
*Public methods:* | *Public methods:* | ||||
- :py:meth:`get_members`: iterates over Pages in the category | - :py:meth:`get_members`: iterates over Pages in the category | ||||
@@ -49,7 +56,27 @@ class Category(Page): | |||||
def __str__(self): | def __str__(self): | ||||
"""Return a nice string representation of the Category.""" | """Return a nice string representation of the Category.""" | ||||
return '<Category "{0}" of {1}>'.format(self.title, str(self._site)) | |||||
return '<Category "{0}" of {1}>'.format(self.title, str(self.site)) | |||||
def _get_members_via_api(self, limit, follow): | |||||
"""Iterate over Pages in the category using the API.""" | |||||
params = {"action": "query", "list": "categorymembers", | |||||
"cmtitle": self.title} | |||||
while 1: | |||||
params["cmlimit"] = limit if limit else "max" | |||||
result = self.site.api_query(**params) | |||||
for member in result["query"]["categorymembers"]: | |||||
title = member["title"] | |||||
yield self.site.get_page(title, follow_redirects=follow) | |||||
if "query-continue" in result: | |||||
qcontinue = result["query-continue"]["categorymembers"] | |||||
params["cmcontinue"] = qcontinue["cmcontinue"] | |||||
if limit: | |||||
limit -= len(result["query"]["categorymembers"]) | |||||
else: | |||||
break | |||||
def _get_members_via_sql(self, limit, follow): | def _get_members_via_sql(self, limit, follow): | ||||
"""Iterate over Pages in the category using SQL.""" | """Iterate over Pages in the category using SQL.""" | ||||
@@ -60,55 +87,103 @@ class Category(Page): | |||||
if limit: | if limit: | ||||
query += " LIMIT ?" | query += " LIMIT ?" | ||||
result = self._site.sql_query(query, (title, limit)) | |||||
result = self.site.sql_query(query, (title, limit)) | |||||
else: | else: | ||||
result = self._site.sql_query(query, (title,)) | |||||
result = self.site.sql_query(query, (title,)) | |||||
members = list(result) | members = list(result) | ||||
for row in members: | for row in members: | ||||
base = row[0].replace("_", " ").decode("utf8") | base = row[0].replace("_", " ").decode("utf8") | ||||
namespace = self._site.namespace_id_to_name(row[1]) | |||||
namespace = self.site.namespace_id_to_name(row[1]) | |||||
if namespace: | if namespace: | ||||
title = u":".join((namespace, base)) | title = u":".join((namespace, base)) | ||||
else: # Avoid doing a silly (albeit valid) ":Pagename" thing | else: # Avoid doing a silly (albeit valid) ":Pagename" thing | ||||
title = base | title = base | ||||
yield self._site.get_page(title, follow_redirects=follow, | |||||
yield self.site.get_page(title, follow_redirects=follow, | |||||
pageid=row[2]) | pageid=row[2]) | ||||
def _get_members_via_api(self, limit, follow): | |||||
"""Iterate over Pages in the category using the API.""" | |||||
params = {"action": "query", "list": "categorymembers", | |||||
"cmtitle": self._title} | |||||
def _get_size_via_api(self, member_type): | |||||
"""Return the size of the category using the API.""" | |||||
query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?" | |||||
title = self.title.replace(" ", "_").split(":", 1)[1] | |||||
if member_type == "size": | |||||
result = self.site.sql_query(query, (title,)) | |||||
else: | |||||
query += " AND cl_type = ?" | |||||
result = self.site.sql_query(query, (title, member_type[:-1])) | |||||
return list(result)[0] | |||||
def _get_size_via_sql(self, member_type): | |||||
"""Return the size of the category using SQL.""" | |||||
result = self.site.api_query(action="query", prop="categoryinfo", | |||||
cmtitle=self.title) | |||||
info = result["query"]["pages"].values()[0]["categoryinfo"] | |||||
return info[member_type] | |||||
def _get_size(self, member_type): | |||||
"""Return the size of the category.""" | |||||
services = { | |||||
self.site.SERVICE_API: self._size_via_api, | |||||
self.site.SERVICE_SQL: self._size_via_sql | |||||
} | |||||
return self.site.delegate(services, (member_type,)) | |||||
@property | |||||
def size(self): | |||||
"""The total number of members in the category. | |||||
Includes pages, files, and subcats. Equal to :py:attr:`pages` + | |||||
:py:attr:`files` + :py:attr:`subcats`. This will use either the API or | |||||
SQL depending on which are enabled and the amount of lag on each. This | |||||
is handled by :py:meth:`site.delegate() | |||||
<earwigbot.wiki.site.Site.delegate>`. | |||||
""" | |||||
return self._get_size("size") | |||||
while 1: | |||||
params["cmlimit"] = limit if limit else "max" | |||||
result = self._site.api_query(**params) | |||||
for member in result["query"]["categorymembers"]: | |||||
title = member["title"] | |||||
yield self._site.get_page(title, follow_redirects=follow) | |||||
@property | |||||
def pages(self): | |||||
"""The number of pages in the category. | |||||
if "query-continue" in result: | |||||
qcontinue = result["query-continue"]["categorymembers"] | |||||
params["cmcontinue"] = qcontinue["cmcontinue"] | |||||
if limit: | |||||
limit -= len(result["query"]["categorymembers"]) | |||||
else: | |||||
break | |||||
This will use either the API or SQL depending on which are enabled and | |||||
the amount of lag on each. This is handled by :py:meth:`site.delegate() | |||||
<earwigbot.wiki.site.Site.delegate>`. | |||||
""" | |||||
return self._get_size("pages") | |||||
@property | |||||
def files(self): | |||||
"""The number of files in the category. | |||||
This will use either the API or SQL depending on which are enabled and | |||||
the amount of lag on each. This is handled by :py:meth:`site.delegate() | |||||
<earwigbot.wiki.site.Site.delegate>`. | |||||
""" | |||||
return self._get_size("files") | |||||
def get_members(self, use_sql=False, limit=None, follow_redirects=None): | |||||
@property | |||||
def subcats(self): | |||||
"""The number of subcategories in the category. | |||||
This will use either the API or SQL depending on which are enabled and | |||||
the amount of lag on each. This is handled by :py:meth:`site.delegate() | |||||
<earwigbot.wiki.site.Site.delegate>`. | |||||
""" | |||||
return self._get_size("subcats") | |||||
def get_members(self, limit=None, follow_redirects=None): | |||||
"""Iterate over Pages in the category. | """Iterate over Pages in the category. | ||||
If *use_sql* is ``True``, we will use a SQL query instead of the API. | |||||
Note that pages are retrieved from the API in chunks (by default, in | |||||
500-page chunks for normal users and 5000-page chunks for bots and | |||||
admins), so queries may be made as we go along. If *limit* is given, we | |||||
will provide this many pages, or less if the category is smaller. By | |||||
default, *limit* is ``None``, meaning we will keep iterating over | |||||
members until the category is exhausted. *follow_redirects* is passed | |||||
directly to :py:meth:`site.get_page() | |||||
If *limit* is given, we will provide this many pages, or less if the | |||||
category is smaller. By default, *limit* is ``None``, meaning we will | |||||
keep iterating over members until the category is exhausted. | |||||
*follow_redirects* is passed directly to :py:meth:`site.get_page() | |||||
<earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which | <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which | ||||
will use the value passed to our :py:meth:`__init__`. | will use the value passed to our :py:meth:`__init__`. | ||||
This will use either the API or SQL depending on which are enabled and | |||||
the amount of lag on each. This is handled by :py:meth:`site.delegate() | |||||
<earwigbot.wiki.site.Site.delegate>`. | |||||
.. note:: | .. note:: | ||||
Be careful when iterating over very large categories with no limit. | Be careful when iterating over very large categories with no limit. | ||||
If using the API, at best, you will make one query per 5000 pages, | If using the API, at best, you will make one query per 5000 pages, | ||||
@@ -121,9 +196,10 @@ class Category(Page): | |||||
thousand, in which case the sheer number of titles in memory becomes | thousand, in which case the sheer number of titles in memory becomes | ||||
problematic. | problematic. | ||||
""" | """ | ||||
services = { | |||||
self.site.SERVICE_API: self._get_members_via_api, | |||||
self.site.SERVICE_SQL: self._get_members_via_sql | |||||
} | |||||
if follow_redirects is None: | if follow_redirects is None: | ||||
follow_redirects = self._follow_redirects | follow_redirects = self._follow_redirects | ||||
if use_sql: | |||||
return self._get_members_via_sql(limit, follow_redirects) | |||||
else: | |||||
return self._get_members_via_api(limit, follow_redirects) | |||||
return self.site.delegate(services, (follow_redirects,)) |
@@ -117,7 +117,7 @@ class Page(CopyrightMixIn): | |||||
prefix = self._title.split(":", 1)[0] | prefix = self._title.split(":", 1)[0] | ||||
if prefix != title: # ignore a page that's titled "Category" or "User" | if prefix != title: # ignore a page that's titled "Category" or "User" | ||||
try: | try: | ||||
self._namespace = self._site.namespace_name_to_id(prefix) | |||||
self._namespace = self.site.namespace_name_to_id(prefix) | |||||
except exceptions.NamespaceNotFoundError: | except exceptions.NamespaceNotFoundError: | ||||
self._namespace = 0 | self._namespace = 0 | ||||
else: | else: | ||||
@@ -137,7 +137,7 @@ class Page(CopyrightMixIn): | |||||
def __str__(self): | def __str__(self): | ||||
"""Return a nice string representation of the Page.""" | """Return a nice string representation of the Page.""" | ||||
return '<Page "{0}" of {1}>'.format(self.title, str(self._site)) | |||||
return '<Page "{0}" of {1}>'.format(self.title, str(self.site)) | |||||
def _assert_validity(self): | def _assert_validity(self): | ||||
"""Used to ensure that our page's title is valid. | """Used to ensure that our page's title is valid. | ||||
@@ -199,7 +199,7 @@ class Page(CopyrightMixIn): | |||||
Assuming the API is sound, this should not raise any exceptions. | Assuming the API is sound, this should not raise any exceptions. | ||||
""" | """ | ||||
if not result: | if not result: | ||||
query = self._site.api_query | |||||
query = self.site.api_query | |||||
result = query(action="query", rvprop="user", intoken="edit", | result = query(action="query", rvprop="user", intoken="edit", | ||||
prop="info|revisions", rvlimit=1, rvdir="newer", | prop="info|revisions", rvlimit=1, rvdir="newer", | ||||
titles=self._title, inprop="protection|url") | titles=self._title, inprop="protection|url") | ||||
@@ -263,7 +263,7 @@ class Page(CopyrightMixIn): | |||||
want to force content reloading. | want to force content reloading. | ||||
""" | """ | ||||
if not result: | if not result: | ||||
query = self._site.api_query | |||||
query = self.site.api_query | |||||
result = query(action="query", prop="revisions", rvlimit=1, | result = query(action="query", prop="revisions", rvlimit=1, | ||||
rvprop="content|timestamp", titles=self._title) | rvprop="content|timestamp", titles=self._title) | ||||
@@ -310,8 +310,8 @@ class Page(CopyrightMixIn): | |||||
# Try the API query, catching most errors with our handler: | # Try the API query, catching most errors with our handler: | ||||
try: | try: | ||||
result = self._site.api_query(**params) | |||||
except exceptions.SiteAPIError as error: | |||||
result = self.site.api_query(**params) | |||||
except exceptions.APIError as error: | |||||
if not hasattr(error, "code"): | if not hasattr(error, "code"): | ||||
raise # We can only handle errors with a code attribute | raise # We can only handle errors with a code attribute | ||||
result = self._handle_edit_errors(error, params, tries) | result = self._handle_edit_errors(error, params, tries) | ||||
@@ -375,12 +375,12 @@ class Page(CopyrightMixIn): | |||||
elif error.code in ["noedit-anon", "cantcreate-anon", | elif error.code in ["noedit-anon", "cantcreate-anon", | ||||
"noimageredirect-anon"]: | "noimageredirect-anon"]: | ||||
if not all(self._site._login_info): | |||||
if not all(self.site._login_info): | |||||
# Insufficient login info: | # Insufficient login info: | ||||
raise exceptions.PermissionsError(error.info) | raise exceptions.PermissionsError(error.info) | ||||
if tries == 0: | if tries == 0: | ||||
# We have login info; try to login: | # We have login info; try to login: | ||||
self._site._login(self._site._login_info) | |||||
self.site._login(self.site._login_info) | |||||
self._token = None # Need a new token; old one is invalid now | self._token = None # Need a new token; old one is invalid now | ||||
return self._edit(params=params, tries=1) | return self._edit(params=params, tries=1) | ||||
else: | else: | ||||
@@ -416,13 +416,13 @@ class Page(CopyrightMixIn): | |||||
log in. Otherwise, raise PermissionsError with details. | log in. Otherwise, raise PermissionsError with details. | ||||
""" | """ | ||||
if assertion == "user": | if assertion == "user": | ||||
if not all(self._site._login_info): | |||||
if not all(self.site._login_info): | |||||
# Insufficient login info: | # Insufficient login info: | ||||
e = "AssertEdit: user assertion failed, and no login info was provided." | e = "AssertEdit: user assertion failed, and no login info was provided." | ||||
raise exceptions.PermissionsError(e) | raise exceptions.PermissionsError(e) | ||||
if tries == 0: | if tries == 0: | ||||
# We have login info; try to login: | # We have login info; try to login: | ||||
self._site._login(self._site._login_info) | |||||
self.site._login(self.site._login_info) | |||||
self._token = None # Need a new token; old one is invalid now | self._token = None # Need a new token; old one is invalid now | ||||
return self._edit(params=params, tries=1) | return self._edit(params=params, tries=1) | ||||
else: | else: | ||||
@@ -476,7 +476,7 @@ class Page(CopyrightMixIn): | |||||
Makes an API query only if we haven't already made one and the *pageid* | Makes an API query only if we haven't already made one and the *pageid* | ||||
parameter to :py:meth:`__init__` was left as ``None``, which should be | parameter to :py:meth:`__init__` was left as ``None``, which should be | ||||
true for all cases except when pages are returned by an SQL generator | true for all cases except when pages are returned by an SQL generator | ||||
(like :py:meth:`category.get_members(use_sql=True) | |||||
(like :py:meth:`category.get_members() | |||||
<earwigbot.wiki.category.Category.get_members>`). | <earwigbot.wiki.category.Category.get_members>`). | ||||
Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or | Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or | ||||
@@ -502,8 +502,8 @@ class Page(CopyrightMixIn): | |||||
return self._fullurl | return self._fullurl | ||||
else: | else: | ||||
slug = quote(self._title.replace(" ", "_"), safe="/:") | slug = quote(self._title.replace(" ", "_"), safe="/:") | ||||
path = self._site._article_path.replace("$1", slug) | |||||
return ''.join((self._site.url, path)) | |||||
path = self.site._article_path.replace("$1", slug) | |||||
return ''.join((self.site.url, path)) | |||||
@property | @property | ||||
def namespace(self): | def namespace(self): | ||||
@@ -580,7 +580,7 @@ class Page(CopyrightMixIn): | |||||
otherwise missing or invalid. | otherwise missing or invalid. | ||||
""" | """ | ||||
if self._namespace < 0: | if self._namespace < 0: | ||||
ns = self._site.namespace_id_to_name(self._namespace) | |||||
ns = self.site.namespace_id_to_name(self._namespace) | |||||
e = u"Pages in the {0} namespace can't have talk pages.".format(ns) | e = u"Pages in the {0} namespace can't have talk pages.".format(ns) | ||||
raise exceptions.InvalidPageError(e) | raise exceptions.InvalidPageError(e) | ||||
@@ -594,7 +594,7 @@ class Page(CopyrightMixIn): | |||||
except IndexError: | except IndexError: | ||||
body = self._title | body = self._title | ||||
new_prefix = self._site.namespace_id_to_name(new_ns) | |||||
new_prefix = self.site.namespace_id_to_name(new_ns) | |||||
# If the new page is in namespace 0, don't do ":Title" (it's correct, | # If the new page is in namespace 0, don't do ":Title" (it's correct, | ||||
# but unnecessary), just do "Title": | # but unnecessary), just do "Title": | ||||
@@ -605,7 +605,7 @@ class Page(CopyrightMixIn): | |||||
if follow_redirects is None: | if follow_redirects is None: | ||||
follow_redirects = self._follow_redirects | follow_redirects = self._follow_redirects | ||||
return Page(self._site, new_title, follow_redirects) | |||||
return Page(self.site, new_title, follow_redirects) | |||||
def get(self): | def get(self): | ||||
"""Return page content, which is cached if you try to call get again. | """Return page content, which is cached if you try to call get again. | ||||
@@ -616,7 +616,7 @@ class Page(CopyrightMixIn): | |||||
if self._exists == self.PAGE_UNKNOWN: | if self._exists == self.PAGE_UNKNOWN: | ||||
# Kill two birds with one stone by doing an API query for both our | # Kill two birds with one stone by doing an API query for both our | ||||
# attributes and our page content: | # attributes and our page content: | ||||
query = self._site.api_query | |||||
query = self.site.api_query | |||||
result = query(action="query", rvlimit=1, titles=self._title, | result = query(action="query", rvlimit=1, titles=self._title, | ||||
prop="info|revisions", inprop="protection|url", | prop="info|revisions", inprop="protection|url", | ||||
intoken="edit", rvprop="content|timestamp") | intoken="edit", rvprop="content|timestamp") | ||||
@@ -680,7 +680,7 @@ class Page(CopyrightMixIn): | |||||
if not self._creator: | if not self._creator: | ||||
self._load() | self._load() | ||||
self._assert_existence() | self._assert_existence() | ||||
return self._site.get_user(self._creator) | |||||
return self.site.get_user(self._creator) | |||||
def parse(self): | def parse(self): | ||||
"""Parse the page content for templates, links, etc. | """Parse the page content for templates, links, etc. | ||||
@@ -75,13 +75,17 @@ class Site(object): | |||||
- :py:meth:`api_query`: does an API query with kwargs as params | - :py:meth:`api_query`: does an API query with kwargs as params | ||||
- :py:meth:`sql_query`: does an SQL query and yields its results | - :py:meth:`sql_query`: does an SQL query and yields its results | ||||
- :py:meth:`get_replag`: estimates the database replication lag | |||||
- :py:meth:`get_maxlag`: returns the internal database lag | |||||
- :py:meth:`get_replag`: estimates the external database lag | |||||
- :py:meth:`namespace_id_to_name`: returns names associated with an NS id | - :py:meth:`namespace_id_to_name`: returns names associated with an NS id | ||||
- :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name | - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name | ||||
- :py:meth:`get_page`: returns a Page for the given title | - :py:meth:`get_page`: returns a Page for the given title | ||||
- :py:meth:`get_category`: returns a Category for the given title | - :py:meth:`get_category`: returns a Category for the given title | ||||
- :py:meth:`get_user`: returns a User object for the given name | - :py:meth:`get_user`: returns a User object for the given name | ||||
- :py:meth:`delegate`: controls when the API or SQL is used | |||||
""" | """ | ||||
SERVICE_API = 1 | |||||
SERVICE_SQL = 2 | |||||
def __init__(self, name=None, project=None, lang=None, base_url=None, | def __init__(self, name=None, project=None, lang=None, base_url=None, | ||||
article_path=None, script_path=None, sql=None, | article_path=None, script_path=None, sql=None, | ||||
@@ -124,11 +128,13 @@ class Site(object): | |||||
self._max_retries = 6 | self._max_retries = 6 | ||||
self._last_query_time = 0 | self._last_query_time = 0 | ||||
self._api_lock = Lock() | self._api_lock = Lock() | ||||
self._api_info_cache = {"maxlag": 0, "lastcheck": 0} | |||||
# Attributes used for SQL queries: | # Attributes used for SQL queries: | ||||
self._sql_data = sql | self._sql_data = sql | ||||
self._sql_conn = None | self._sql_conn = None | ||||
self._sql_lock = Lock() | self._sql_lock = Lock() | ||||
self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None} | |||||
# Attribute used in copyright violation checks (see CopyrightMixIn): | # Attribute used in copyright violation checks (see CopyrightMixIn): | ||||
self._search_config = search_config | self._search_config = search_config | ||||
@@ -201,7 +207,7 @@ class Site(object): | |||||
args.append(key + "=" + val) | args.append(key + "=" + val) | ||||
return "&".join(args) | return "&".join(args) | ||||
def _api_query(self, params, tries=0, wait=5): | |||||
def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False): | |||||
"""Do an API query with *params* as a dict of parameters. | """Do an API query with *params* as a dict of parameters. | ||||
See the documentation for :py:meth:`api_query` for full implementation | See the documentation for :py:meth:`api_query` for full implementation | ||||
@@ -215,7 +221,7 @@ class Site(object): | |||||
sleep(wait_time) | sleep(wait_time) | ||||
self._last_query_time = time() | self._last_query_time = time() | ||||
url, data = self._build_api_query(params) | |||||
url, data = self._build_api_query(params, ignore_maxlag) | |||||
self._logger.debug("{0} -> {1}".format(url, data)) | self._logger.debug("{0} -> {1}".format(url, data)) | ||||
try: | try: | ||||
@@ -228,7 +234,7 @@ class Site(object): | |||||
e = e.format(error.code) | e = e.format(error.code) | ||||
else: | else: | ||||
e = "API query failed." | e = "API query failed." | ||||
raise exceptions.SiteAPIError(e) | |||||
raise exceptions.APIError(e) | |||||
result = response.read() | result = response.read() | ||||
if response.headers.get("Content-Encoding") == "gzip": | if response.headers.get("Content-Encoding") == "gzip": | ||||
@@ -238,17 +244,18 @@ class Site(object): | |||||
return self._handle_api_query_result(result, params, tries, wait) | return self._handle_api_query_result(result, params, tries, wait) | ||||
def _build_api_query(self, params): | |||||
def _build_api_query(self, params, ignore_maxlag): | |||||
"""Given API query params, return the URL to query and POST data.""" | """Given API query params, return the URL to query and POST data.""" | ||||
if not self._base_url or self._script_path is None: | if not self._base_url or self._script_path is None: | ||||
e = "Tried to do an API query, but no API URL is known." | e = "Tried to do an API query, but no API URL is known." | ||||
raise exceptions.SiteAPIError(e) | |||||
raise exceptions.APIError(e) | |||||
url = ''.join((self.url, self._script_path, "/api.php")) | url = ''.join((self.url, self._script_path, "/api.php")) | ||||
params["format"] = "json" # This is the only format we understand | params["format"] = "json" # This is the only format we understand | ||||
if self._assert_edit: # If requested, ensure that we're logged in | if self._assert_edit: # If requested, ensure that we're logged in | ||||
params["assert"] = self._assert_edit | params["assert"] = self._assert_edit | ||||
if self._maxlag: # If requested, don't overload the servers | |||||
if self._maxlag and not ignore_maxlag: | |||||
# If requested, don't overload the servers: | |||||
params["maxlag"] = self._maxlag | params["maxlag"] = self._maxlag | ||||
data = self._urlencode_utf8(params) | data = self._urlencode_utf8(params) | ||||
@@ -260,7 +267,7 @@ class Site(object): | |||||
res = loads(result) # Try to parse as a JSON object | res = loads(result) # Try to parse as a JSON object | ||||
except ValueError: | except ValueError: | ||||
e = "API query failed: JSON could not be decoded." | e = "API query failed: JSON could not be decoded." | ||||
raise exceptions.SiteAPIError(e) | |||||
raise exceptions.APIError(e) | |||||
try: | try: | ||||
code = res["error"]["code"] | code = res["error"]["code"] | ||||
@@ -271,7 +278,7 @@ class Site(object): | |||||
if code == "maxlag": # We've been throttled by the server | if code == "maxlag": # We've been throttled by the server | ||||
if tries >= self._max_retries: | if tries >= self._max_retries: | ||||
e = "Maximum number of retries reached ({0})." | e = "Maximum number of retries reached ({0})." | ||||
raise exceptions.SiteAPIError(e.format(self._max_retries)) | |||||
raise exceptions.APIError(e.format(self._max_retries)) | |||||
tries += 1 | tries += 1 | ||||
msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})' | msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})' | ||||
self._logger.info(msg.format(info, wait, tries, self._max_retries)) | self._logger.info(msg.format(info, wait, tries, self._max_retries)) | ||||
@@ -279,7 +286,7 @@ class Site(object): | |||||
return self._api_query(params, tries=tries, wait=wait*2) | return self._api_query(params, tries=tries, wait=wait*2) | ||||
else: # Some unknown error occurred | else: # Some unknown error occurred | ||||
e = 'API query failed: got error "{0}"; server says: "{1}".' | e = 'API query failed: got error "{0}"; server says: "{1}".' | ||||
error = exceptions.SiteAPIError(e.format(code, info)) | |||||
error = exceptions.APIError(e.format(code, info)) | |||||
error.code, error.info = code, info | error.code, error.info = code, info | ||||
raise error | raise error | ||||
@@ -522,6 +529,48 @@ class Site(object): | |||||
self._sql_conn = oursql.connect(**args) | self._sql_conn = oursql.connect(**args) | ||||
def _get_service_order(self):
    """Return a preferred order for using services (e.g. the API and SQL).

    A list is returned, starting with the most preferred service first and
    ending with the least preferred one. Currently, there are only two
    services. SERVICE_API will always be included since the API is expected
    to be always usable. In normal circumstances, self.SERVICE_SQL will be
    first (with the API second), since using SQL directly is easier on the
    servers than making web queries with the API. self.SERVICE_SQL will be
    second if replag is greater than three minutes (a cached value updated
    every two minutes at most), *unless* API lag is also very high.
    self.SERVICE_SQL will not be included in the list if we cannot form a
    proper SQL connection.
    """
    now = time()
    if now - self._sql_info_cache["lastcheck"] > 120:
        # SQL cache is stale (older than two minutes): re-measure replag.
        self._sql_info_cache["lastcheck"] = now
        try:
            self._sql_info_cache["replag"] = sqllag = self.get_replag()
        except (exceptions.SQLError, oursql.Error):
            # SQL is unreachable; remember that and use the API only.
            self._sql_info_cache["usable"] = False
            return [self.SERVICE_API]
        self._sql_info_cache["usable"] = True
    else:
        if not self._sql_info_cache["usable"]:
            return [self.SERVICE_API]
        # BUG FIX: read the cached replag here; previously *sqllag* was
        # never bound on this path, raising NameError at the check below.
        sqllag = self._sql_info_cache["replag"]

    if sqllag > 180:
        # Replication is over three minutes behind; prefer the API unless
        # the API itself is proportionally even more lagged.
        if not self._maxlag:
            return [self.SERVICE_API, self.SERVICE_SQL]
        if now - self._api_info_cache["lastcheck"] > 120:
            self._api_info_cache["lastcheck"] = now
            try:
                self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
            except exceptions.APIError:
                self._api_info_cache["maxlag"] = apilag = 0
        else:
            # BUG FIX: read the cached API lag here; previously *apilag*
            # was never bound on this path, raising NameError below.
            apilag = self._api_info_cache["maxlag"]
        if sqllag / (180.0 / self._maxlag) < apilag:
            return [self.SERVICE_SQL, self.SERVICE_API]
        return [self.SERVICE_API, self.SERVICE_SQL]
    return [self.SERVICE_SQL, self.SERVICE_API]
@property | @property | ||||
def name(self): | def name(self): | ||||
"""The Site's name (or "wikiid" in the API), like ``"enwiki"``.""" | """The Site's name (or "wikiid" in the API), like ``"enwiki"``.""" | ||||
@@ -559,7 +608,7 @@ class Site(object): | |||||
This will first attempt to construct an API url from | This will first attempt to construct an API url from | ||||
:py:attr:`self._base_url` and :py:attr:`self._script_path`. We need | :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need | ||||
both of these, or else we'll raise | both of these, or else we'll raise | ||||
:py:exc:`~earwigbot.exceptions.SiteAPIError`. If | |||||
:py:exc:`~earwigbot.exceptions.APIError`. If | |||||
:py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki | :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki | ||||
1.18), we'll choose HTTPS only if :py:attr:`self._user_https` is | 1.18), we'll choose HTTPS only if :py:attr:`self._user_https` is | ||||
``True``, otherwise HTTP. | ``True``, otherwise HTTP. | ||||
@@ -578,7 +627,7 @@ class Site(object): | |||||
load it as a JSON object, and return it. | load it as a JSON object, and return it. | ||||
If our request failed for some reason, we'll raise | If our request failed for some reason, we'll raise | ||||
:py:exc:`~earwigbot.exceptions.SiteAPIError` with details. If that | |||||
:py:exc:`~earwigbot.exceptions.APIError` with details. If that | |||||
reason was due to maxlag, we'll sleep for a bit and then repeat the | reason was due to maxlag, we'll sleep for a bit and then repeat the | ||||
query until we exceed :py:attr:`self._max_retries`. | query until we exceed :py:attr:`self._max_retries`. | ||||
@@ -635,8 +684,30 @@ class Site(object): | |||||
for result in cur: | for result in cur: | ||||
yield result | yield result | ||||
def get_maxlag(self, showall=False):
    """Return the internal database replication lag in seconds.

    In a typical setup, this reports replication lag *within* the WMF's
    cluster, *not* the external replication lag affecting the Toolserver
    (for that, see :py:meth:`get_replag`). It pairs naturally with the
    ``maxlag`` API query parameter (added by config), which causes queries
    to be halted and retried when lag is too high, usually above five
    seconds.

    If *showall* is given, a list of the lag for every server in the
    cluster is returned instead of just the highest one.
    """
    query = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"}
    if showall:
        query["sishowalldb"] = 1
    # Bypass maxlag here, or a lagged server would block us from ever
    # reading its own lag value.
    with self._api_lock:
        res = self._api_query(query, ignore_maxlag=True)
    lagdata = res["query"]["dbrepllag"]
    if showall:
        return [server["lag"] for server in lagdata]
    return lagdata[0]["lag"]
def get_replag(self): | def get_replag(self): | ||||
"""Return the estimated database replication lag in seconds. | |||||
"""Return the estimated external database replication lag in seconds. | |||||
Requires SQL access. This function only makes sense on a replicated | Requires SQL access. This function only makes sense on a replicated | ||||
database (e.g. the Wikimedia Toolserver) and on a wiki that receives a | database (e.g. the Wikimedia Toolserver) and on a wiki that receives a | ||||
@@ -739,3 +810,29 @@ class Site(object): | |||||
else: | else: | ||||
username = self._get_username() | username = self._get_username() | ||||
return User(self, username) | return User(self, username) | ||||
def delegate(self, services, args=None, kwargs=None):
    """Delegate a task to either the API or SQL depending on conditions.

    *services* should be a dictionary in which the key is the service name
    (:py:attr:`self.SERVICE_API <SERVICE_API>` or
    :py:attr:`self.SERVICE_SQL <SERVICE_SQL>`), and the value is the
    function to call for this service. All functions will be passed the
    same arguments the tuple *args* and the dict **kwargs**, which are both
    empty by default. The service order is determined by
    :py:meth:`_get_service_order`.

    Not every service needs an entry in the dictionary. Will raise
    :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate
    service cannot be found.
    """
    call_args = args or ()
    call_kwargs = kwargs or {}
    # Walk services from most to least preferred; use the first one the
    # caller actually provided a handler for.
    for service in self._get_service_order():
        try:
            handler = services[service]
        except KeyError:
            continue
        return handler(*call_args, **call_kwargs)
    raise exceptions.NoServiceError(services)
@@ -82,7 +82,7 @@ class User(object): | |||||
def __str__(self): | def __str__(self): | ||||
"""Return a nice string representation of the User.""" | """Return a nice string representation of the User.""" | ||||
return '<User "{0}" of {1}>'.format(self._name, str(self._site)) | |||||
return '<User "{0}" of {1}>'.format(self.name, str(self.site)) | |||||
def _get_attribute(self, attr): | def _get_attribute(self, attr): | ||||
"""Internally used to get an attribute by name. | """Internally used to get an attribute by name. | ||||
@@ -107,8 +107,8 @@ class User(object): | |||||
is not defined. This defines it. | is not defined. This defines it. | ||||
""" | """ | ||||
props = "blockinfo|groups|rights|editcount|registration|emailable|gender" | props = "blockinfo|groups|rights|editcount|registration|emailable|gender" | ||||
result = self._site.api_query(action="query", list="users", | |||||
ususers=self._name, usprop=props) | |||||
result = self.site.api_query(action="query", list="users", | |||||
ususers=self._name, usprop=props) | |||||
res = result["query"]["users"][0] | res = result["query"]["users"][0] | ||||
# normalize our username in case it was entered oddly | # normalize our username in case it was entered oddly | ||||
@@ -275,9 +275,9 @@ class User(object): | |||||
No checks are made to see if it exists or not. Proper site namespace | No checks are made to see if it exists or not. Proper site namespace | ||||
conventions are followed. | conventions are followed. | ||||
""" | """ | ||||
prefix = self._site.namespace_id_to_name(constants.NS_USER) | |||||
prefix = self.site.namespace_id_to_name(constants.NS_USER) | |||||
pagename = ':'.join((prefix, self._name)) | pagename = ':'.join((prefix, self._name)) | ||||
return Page(self._site, pagename) | |||||
return Page(self.site, pagename) | |||||
def get_talkpage(self): | def get_talkpage(self): | ||||
"""Return a Page object representing the user's talkpage. | """Return a Page object representing the user's talkpage. | ||||
@@ -285,6 +285,6 @@ class User(object): | |||||
No checks are made to see if it exists or not. Proper site namespace | No checks are made to see if it exists or not. Proper site namespace | ||||
conventions are followed. | conventions are followed. | ||||
""" | """ | ||||
prefix = self._site.namespace_id_to_name(constants.NS_USER_TALK) | |||||
prefix = self.site.namespace_id_to_name(constants.NS_USER_TALK) | |||||
pagename = ':'.join((prefix, self._name)) | pagename = ':'.join((prefix, self._name)) | ||||
return Page(self._site, pagename) | |||||
return Page(self.site, pagename) |