From 459c252fc7eff4ce1ec67390c55f7a3eeb7d4fae Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 21 Sep 2014 20:14:25 -0500 Subject: [PATCH] Support new CSRF token API. --- earwigbot/wiki/page.py | 52 +++++++++++++++++---------------------------- earwigbot/wiki/site.py | 57 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 59 insertions(+), 50 deletions(-) diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 2c8cba2..d509d96 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -116,7 +116,6 @@ class Page(CopyvioMixIn): self._creator = None # Attributes used for editing/deleting/protecting/etc: - self._token = None self._basetimestamp = None self._starttimestamp = None @@ -199,18 +198,18 @@ class Page(CopyvioMixIn): """Load various data from the API in a single query. Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl, - ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, - ._token, and ._starttimestamp using the API. It will do a query of - its own unless *result* is provided, in which case we'll pretend - *result* is what the query returned. + ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, and + ._starttimestamp using the API. It will do a query of its own unless + *result* is provided, in which case we'll pretend *result* is what the + query returned. Assuming the API is sound, this should not raise any exceptions. """ if not result: query = self.site.api_query - result = query(action="query", rvprop="user", intoken="edit", - prop="info|revisions", rvlimit=1, rvdir="newer", - titles=self._title, inprop="protection|url") + result = query(action="query", prop="info|revisions", + inprop="protection|url", rvprop="user", rvlimit=1, + rvdir="newer", titles=self._title) res = result["query"]["pages"].values()[0] @@ -233,13 +232,7 @@ class Page(CopyvioMixIn): self._fullurl = res["fullurl"] self._protection = res["protection"] - - try: - self._token = res["edittoken"] - except KeyError: - pass - else: - self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) + self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) # We've determined the namespace and talkpage status in __init__() # based on the title, but now we can be sure: @@ -291,13 +284,6 @@ class Page(CopyvioMixIn): in _handle_edit_errors(). We'll then throw these back as subclasses of EditError. """ - # Try to get our edit token, and die if we can't: - if not self._token: - self._load_attributes() - if not self._token: - e = "You don't have permission to edit this page." - raise exceptions.PermissionsError(e) - # Weed out invalid pages before we get too far: self._assert_validity() @@ -306,8 +292,7 @@ class Page(CopyvioMixIn): params = self._build_edit_params(text, summary, minor, bot, force, section, captcha_id, captcha_word) else: # Make sure we have the right token: - params["token"] = self._token - self._token = None # Token now invalid + params["token"] = self.site.get_token() # Try the API query, catching most errors with our handler: try: @@ -332,8 +317,9 @@ class Page(CopyvioMixIn): """Given some keyword arguments, build an API edit query string.""" unitxt = text.encode("utf8") if isinstance(text, unicode) else text hashed = md5(unitxt).hexdigest() # Checksum to ensure text is correct - params = {"action": "edit", "title": self._title, "text": text, - "token": self._token, "summary": summary, "md5": hashed} + params = { + "action": "edit", "title": self._title, "text": text, + "token": self.site.get_token(), "summary": summary, "md5": hashed} if section: params["section"] = section @@ -378,13 +364,13 @@ class Page(CopyvioMixIn): self._exists = self.PAGE_UNKNOWN raise exceptions.EditConflictError(error.info) elif error.code == "badtoken" and retry: - params["token"] = self.site.get_token("edit") + params["token"] = self.site.get_token(force=True) try: return self.site.api_query(**params) - except exceptions.APIError as error: - if not hasattr(error, "code"): + except exceptions.APIError as err: + if not hasattr(err, "code"): raise # We can only handle errors with a code attribute - return self._handle_edit_errors(error, params, retry=False) + return self._handle_edit_errors(err, params, retry=False) elif error.code in ["emptypage", "emptynewsection"]: raise exceptions.NoContentError(error.info) elif error.code == "contenttoobig": @@ -577,7 +563,7 @@ class Page(CopyvioMixIn): query = self.site.api_query result = query(action="query", rvlimit=1, titles=self._title, prop="info|revisions", inprop="protection|url", - intoken="edit", rvprop="content|timestamp") + rvprop="content|timestamp") self._load_attributes(result=result) self._assert_existence() self._load_content(result=result) @@ -610,7 +596,7 @@ class Page(CopyvioMixIn): :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a redirect. """ - re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" + re_redirect = r"^\s*\#\s*redirect\s*\[\[(.*?)\]\]" content = self.get() try: return re.findall(re_redirect, content, flags=re.I)[0] @@ -709,7 +695,7 @@ class Page(CopyvioMixIn): username = username.lower() optouts = [optout.lower() for optout in optouts] if optouts else [] - r_bots = "\{\{\s*(no)?bots\s*(\||\}\})" + r_bots = r"\{\{\s*(no)?bots\s*(\||\}\})" filter = self.parse().ifilter_templates(recursive=True, matches=r_bots) for template in filter: if template.has_param("deny"): diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 0800bc3..a3ebe52 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -83,6 +83,8 @@ class Site(object): """ SERVICE_API = 1 SERVICE_SQL = 2 + SPECIAL_TOKENS = ["deleteglobalaccount", "patrol", "rollback", + "setglobalaccountstatus", "userrights", "watch"] def __init__(self, name=None, project=None, lang=None, base_url=None, article_path=None, script_path=None, sql=None, @@ -124,6 +126,7 @@ class Site(object): self._wait_between_queries = wait_between_queries self._max_retries = 6 self._last_query_time = 0 + self._tokens = {} self._api_lock = RLock() self._api_info_cache = {"maxlag": 0, "lastcheck": 0} @@ -252,13 +255,25 @@ class Site(object): return self._handle_api_result(result, params, tries, wait, ae_retry) + def _request_csrf_token(self, params): + """If possible, add a request for a CSRF token to an API query.""" + if params.get("action") == "query": + if params.get("meta"): + if "tokens" not in params["meta"].split("|"): + params["meta"] += "|tokens" + else: + params["meta"] = "tokens" + if params.get("type"): + if "csrf" not in params["type"].split("|"): + params["type"] += "|csrf" + def _build_api_query(self, params, ignore_maxlag, no_assert): """Given API query params, return the URL to query and POST data.""" if not self._base_url or self._script_path is None: e = "Tried to do an API query, but no API URL is known." raise exceptions.APIError(e) - url = ''.join((self.url, self._script_path, "/api.php")) + url = self.url + self._script_path + "/api.php" params["format"] = "json" # This is the only format we understand if self._assert_edit and not no_assert: # If requested, ensure that we're logged in @@ -266,6 +281,9 @@ class Site(object): if self._maxlag and not ignore_maxlag: # If requested, don't overload the servers: params["maxlag"] = self._maxlag + if "csrf" not in self._tokens: + # If we don't have a CSRF token, try to fetch one: + self._request_csrf_token(params) data = self._urlencode_utf8(params) return url, data @@ -282,6 +300,9 @@ class Site(object): code = res["error"]["code"] info = res["error"]["info"] except (TypeError, KeyError): # If there's no error code/info, return + if "query" in res and "tokens" in res["query"]: + for name, token in res["query"]["tokens"].iteritems(): + self._tokens[name.split("token")[0]] = token return res if code == "maxlag": # We've been throttled by the server @@ -326,7 +347,7 @@ class Site(object): # All attributes to be loaded, except _namespaces, which is a special # case because it requires additional params in the API query: attrs = [self._name, self._project, self._lang, self._base_url, - self._article_path, self._script_path] + self._article_path, self._script_path] params = {"action": "query", "meta": "siteinfo", "siprop": "general"} @@ -485,6 +506,7 @@ class Site(object): from our first request, and *attempt* is to prevent getting stuck in a loop if MediaWiki isn't acting right. """ + self._tokens.clear() name, password = login params = {"action": "login", "lgname": name, "lgpassword": password} @@ -764,25 +786,26 @@ class Site(object): result = list(self.sql_query(query)) return int(result[0][0]) - def get_token(self, action): + def get_token(self, action=None, force=False): """Return a token for a data-modifying API action. - *action* must be one of the types listed on - . If it's given as a union - of types separated by |, then the function will return a dictionary - of tokens instead of a single one. + In general, this will be a CSRF token, unless *action* is in a special + list of non-CSRF tokens. Tokens are cached for the session (until + :meth:`_login` is called again); set *force* to ``True`` to force a new + token to be fetched. - Raises :py:exc:`~earwigbot.exceptions.PermissionsError` if we don't - have permissions for the requested action(s), or they are invalid. - Raises :py:exc:`~earwigbot.exceptions.APIError` if there was some other - API issue. + Raises :exc:`.APIError` if there was an API issue. """ - res = self.api_query(action="tokens", type=action) - if "warnings" in res and "tokens" in res["warnings"]: - raise exceptions.PermissionsError(res["warnings"]["tokens"]["*"]) - if "|" in action: - return res["tokens"] - return res["tokens"].values()[0] + if action not in self.SPECIAL_TOKENS: + action = "csrf" + if action in self._tokens and not force: + return self._tokens[action] + + res = self.api_query(action="query", meta="tokens", type=action) + if action not in self._tokens: + err = "Tried to fetch a {0} token, but API returned: {1}" + raise exceptions.APIError(err.format(action, res)) + return self._tokens[action] def namespace_id_to_name(self, ns_id, all=False): """Given a namespace ID, returns associated namespace names.