From 459c252fc7eff4ce1ec67390c55f7a3eeb7d4fae Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Sun, 21 Sep 2014 20:14:25 -0500
Subject: [PATCH] Support new CSRF token API.

---
 earwigbot/wiki/page.py | 52 +++++++++++++++++----------------------------
 earwigbot/wiki/site.py | 57 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index 2c8cba2..d509d96 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -116,7 +116,6 @@ class Page(CopyvioMixIn):
         self._creator = None
 
         # Attributes used for editing/deleting/protecting/etc:
-        self._token = None
         self._basetimestamp = None
         self._starttimestamp = None
 
@@ -199,18 +198,18 @@ class Page(CopyvioMixIn):
         """Load various data from the API in a single query.
 
         Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl,
-        ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid,
-        ._token, and ._starttimestamp using the API. It will do a query of
-        its own unless *result* is provided, in which case we'll pretend
-        *result* is what the query returned.
+        ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, and
+        ._starttimestamp using the API. It will do a query of its own unless
+        *result* is provided, in which case we'll pretend *result* is what the
+        query returned.
 
         Assuming the API is sound, this should not raise any exceptions.
         """
         if not result:
             query = self.site.api_query
-            result = query(action="query", rvprop="user", intoken="edit",
-                           prop="info|revisions", rvlimit=1, rvdir="newer",
-                           titles=self._title, inprop="protection|url")
+            result = query(action="query", prop="info|revisions",
+                           inprop="protection|url", rvprop="user", rvlimit=1,
+                           rvdir="newer", titles=self._title)
 
         res = result["query"]["pages"].values()[0]
 
@@ -233,13 +232,7 @@ class Page(CopyvioMixIn):
 
         self._fullurl = res["fullurl"]
         self._protection = res["protection"]
-
-        try:
-            self._token = res["edittoken"]
-        except KeyError:
-            pass
-        else:
-            self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())
+        self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())
 
         # We've determined the namespace and talkpage status in __init__()
         # based on the title, but now we can be sure:
@@ -291,13 +284,6 @@ class Page(CopyvioMixIn):
         in _handle_edit_errors(). We'll then throw these back as subclasses of
         EditError.
         """
-        # Try to get our edit token, and die if we can't:
-        if not self._token:
-            self._load_attributes()
-        if not self._token:
-            e = "You don't have permission to edit this page."
-            raise exceptions.PermissionsError(e)
-
         # Weed out invalid pages before we get too far:
         self._assert_validity()
 
@@ -306,8 +292,7 @@ class Page(CopyvioMixIn):
             params = self._build_edit_params(text, summary, minor, bot, force,
                                              section, captcha_id, captcha_word)
         else: # Make sure we have the right token:
-            params["token"] = self._token
-        self._token = None  # Token now invalid
+            params["token"] = self.site.get_token()
 
         # Try the API query, catching most errors with our handler:
         try:
@@ -332,8 +317,9 @@ class Page(CopyvioMixIn):
         """Given some keyword arguments, build an API edit query string."""
         unitxt = text.encode("utf8") if isinstance(text, unicode) else text
         hashed = md5(unitxt).hexdigest()  # Checksum to ensure text is correct
-        params = {"action": "edit", "title": self._title, "text": text,
-                  "token": self._token, "summary": summary, "md5": hashed}
+        params = {
+            "action": "edit", "title": self._title, "text": text,
+            "token": self.site.get_token(), "summary": summary, "md5": hashed}
 
         if section:
             params["section"] = section
@@ -378,13 +364,13 @@ class Page(CopyvioMixIn):
             self._exists = self.PAGE_UNKNOWN
             raise exceptions.EditConflictError(error.info)
         elif error.code == "badtoken" and retry:
-            params["token"] = self.site.get_token("edit")
+            params["token"] = self.site.get_token(force=True)
             try:
                 return self.site.api_query(**params)
-            except exceptions.APIError as error:
-                if not hasattr(error, "code"):
+            except exceptions.APIError as err:
+                if not hasattr(err, "code"):
                     raise  # We can only handle errors with a code attribute
-                return self._handle_edit_errors(error, params, retry=False)
+                return self._handle_edit_errors(err, params, retry=False)
         elif error.code in ["emptypage", "emptynewsection"]:
             raise exceptions.NoContentError(error.info)
         elif error.code == "contenttoobig":
@@ -577,7 +563,7 @@ class Page(CopyvioMixIn):
             query = self.site.api_query
             result = query(action="query", rvlimit=1, titles=self._title,
                            prop="info|revisions", inprop="protection|url",
-                           intoken="edit", rvprop="content|timestamp")
+                           rvprop="content|timestamp")
             self._load_attributes(result=result)
             self._assert_existence()
             self._load_content(result=result)
@@ -610,7 +596,7 @@ class Page(CopyvioMixIn):
         :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
         redirect.
         """
-        re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
+        re_redirect = r"^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
         content = self.get()
         try:
             return re.findall(re_redirect, content, flags=re.I)[0]
@@ -709,7 +695,7 @@ class Page(CopyvioMixIn):
         username = username.lower()
         optouts = [optout.lower() for optout in optouts] if optouts else []
 
-        r_bots = "\{\{\s*(no)?bots\s*(\||\}\})"
+        r_bots = r"\{\{\s*(no)?bots\s*(\||\}\})"
         filter = self.parse().ifilter_templates(recursive=True, matches=r_bots)
         for template in filter:
             if template.has_param("deny"):
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 0800bc3..a3ebe52 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -83,6 +83,8 @@ class Site(object):
     """
     SERVICE_API = 1
     SERVICE_SQL = 2
+    SPECIAL_TOKENS = ["deleteglobalaccount", "patrol", "rollback",
+                      "setglobalaccountstatus", "userrights", "watch"]
 
     def __init__(self, name=None, project=None, lang=None, base_url=None,
                  article_path=None, script_path=None, sql=None,
@@ -124,6 +126,7 @@ class Site(object):
         self._wait_between_queries = wait_between_queries
         self._max_retries = 6
         self._last_query_time = 0
+        self._tokens = {}
         self._api_lock = RLock()
         self._api_info_cache = {"maxlag": 0, "lastcheck": 0}
 
@@ -252,13 +255,25 @@ class Site(object):
 
         return self._handle_api_result(result, params, tries, wait, ae_retry)
 
+    def _request_csrf_token(self, params):
+        """Add a CSRF token request to *params* in place, if it is a query."""
+        if params.get("action") == "query":
+            if params.get("meta"):  # keep any meta modules already requested
+                if "tokens" not in params["meta"].split("|"):
+                    params["meta"] += "|tokens"
+            else:
+                params["meta"] = "tokens"
+            if params.get("type"):  # an absent/empty type is left untouched
+                if "csrf" not in params["type"].split("|"):
+                    params["type"] += "|csrf"
+
     def _build_api_query(self, params, ignore_maxlag, no_assert):
         """Given API query params, return the URL to query and POST data."""
         if not self._base_url or self._script_path is None:
             e = "Tried to do an API query, but no API URL is known."
             raise exceptions.APIError(e)
 
-        url = ''.join((self.url, self._script_path, "/api.php"))
+        url = self.url + self._script_path + "/api.php"
         params["format"] = "json"  # This is the only format we understand
         if self._assert_edit and not no_assert:
             # If requested, ensure that we're logged in
@@ -266,6 +281,9 @@ class Site(object):
         if self._maxlag and not ignore_maxlag:
             # If requested, don't overload the servers:
             params["maxlag"] = self._maxlag
+        if "csrf" not in self._tokens:
+            # If we don't have a CSRF token, try to fetch one:
+            self._request_csrf_token(params)
 
         data = self._urlencode_utf8(params)
         return url, data
@@ -282,6 +300,9 @@ class Site(object):
             code = res["error"]["code"]
             info = res["error"]["info"]
         except (TypeError, KeyError):  # If there's no error code/info, return
+            if "query" in res and "tokens" in res["query"]:
+                for name, token in res["query"]["tokens"].iteritems():
+                    self._tokens[name.split("token")[0]] = token
             return res
 
         if code == "maxlag":  # We've been throttled by the server
@@ -326,7 +347,7 @@ class Site(object):
         # All attributes to be loaded, except _namespaces, which is a special
         # case because it requires additional params in the API query:
         attrs = [self._name, self._project, self._lang, self._base_url,
-            self._article_path, self._script_path]
+                 self._article_path, self._script_path]
 
         params = {"action": "query", "meta": "siteinfo", "siprop": "general"}
 
@@ -485,6 +506,7 @@ class Site(object):
         from our first request, and *attempt* is to prevent getting stuck in a
         loop if MediaWiki isn't acting right.
         """
+        self._tokens.clear()
         name, password = login
 
         params = {"action": "login", "lgname": name, "lgpassword": password}
@@ -764,25 +786,26 @@ class Site(object):
         result = list(self.sql_query(query))
         return int(result[0][0])
 
-    def get_token(self, action):
+    def get_token(self, action=None, force=False):
         """Return a token for a data-modifying API action.
 
-        *action* must be one of the types listed on
-        <https://www.mediawiki.org/wiki/API:Tokens>. If it's given as a union
-        of types separated by |, then the function will return a dictionary
-        of tokens instead of a single one.
+        Any *action* not listed in :attr:`SPECIAL_TOKENS` (including ``None``)
+        yields the CSRF token. Tokens are cached in ``self._tokens`` until
+        :meth:`_login` clears them; set *force* to ``True`` to query the API
+        again even if a token is already cached.
 
-        Raises :py:exc:`~earwigbot.exceptions.PermissionsError` if we don't
-        have permissions for the requested action(s), or they are invalid.
-        Raises :py:exc:`~earwigbot.exceptions.APIError` if there was some other
-        API issue.
+        Raises :exc:`.APIError` on API errors or if no such token is returned.
         """
-        res = self.api_query(action="tokens", type=action)
-        if "warnings" in res and "tokens" in res["warnings"]:
-            raise exceptions.PermissionsError(res["warnings"]["tokens"]["*"])
-        if "|" in action:
-            return res["tokens"]
-        return res["tokens"].values()[0]
+        if action not in self.SPECIAL_TOKENS:
+            action = "csrf"
+        if action in self._tokens and not force:
+            return self._tokens[action]
+
+        res = self.api_query(action="query", meta="tokens", type=action)
+        if action not in self._tokens:  # NOTE: stale token passes if forced
+            err = "Tried to fetch a {0} token, but API returned: {1}"
+            raise exceptions.APIError(err.format(action, res))
+        return self._tokens[action]
 
     def namespace_id_to_name(self, ns_id, all=False):
         """Given a namespace ID, returns associated namespace names.