A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
# Copyright (C) 2009-2024 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from http.cookiejar import CookieJar
from json import dumps
from logging import NullHandler, getLogger
from os.path import expanduser
from threading import RLock
from time import sleep, time
from urllib.parse import unquote_plus, urlparse

import requests
from requests_oauthlib import OAuth1

from earwigbot import exceptions, importer
from earwigbot.wiki import constants
from earwigbot.wiki.category import Category
from earwigbot.wiki.page import Page
from earwigbot.wiki.user import User

pymysql = importer.new("pymysql")

__all__ = ["Site"]


class Site:
    """
    **EarwigBot: Wiki Toolset: Site**

    Represents a site, with support for API queries and returning
    :py:class:`~earwigbot.wiki.page.Page`,
    :py:class:`~earwigbot.wiki.user.User`,
    and :py:class:`~earwigbot.wiki.category.Category` objects. The constructor
    takes a bunch of arguments, but you probably won't need to call it
    directly; :py:meth:`wiki.get_site()
    <earwigbot.wiki.sitesdb.SitesDB.get_site>` for returning :py:class:`Site`
    instances, :py:meth:`wiki.add_site()
    <earwigbot.wiki.sitesdb.SitesDB.add_site>` for adding new ones to our
    database, and :py:meth:`wiki.remove_site()
    <earwigbot.wiki.sitesdb.SitesDB.remove_site>` for removing old ones from
    our database should suffice.

    *Attributes:*

    - :py:attr:`name`: the site's name (or "wikiid"), like ``"enwiki"``
    - :py:attr:`project`: the site's project name, like ``"wikipedia"``
    - :py:attr:`lang`: the site's language code, like ``"en"``
    - :py:attr:`domain`: the site's web domain, like ``"en.wikipedia.org"``
    - :py:attr:`url`: the site's URL, like ``"https://en.wikipedia.org"``

    *Public methods:*

    - :py:meth:`api_query`: does an API query with kwargs as params
    - :py:meth:`sql_query`: does an SQL query and yields its results
    - :py:meth:`get_maxlag`: returns the internal database lag
    - :py:meth:`get_replag`: estimates the external database lag
    - :py:meth:`get_token`: gets a token for a specific API action
    - :py:meth:`namespace_id_to_name`: returns names associated with an NS id
    - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name
    - :py:meth:`get_page`: returns a Page for the given title
    - :py:meth:`get_category`: returns a Category for the given title
    - :py:meth:`get_user`: returns a User object for the given name
    - :py:meth:`delegate`: controls when the API or SQL is used
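
    Typical usage (a hedged sketch; assumes a running Bot instance ``bot``
    whose config knows about the English Wikipedia)::

        >>> site = bot.wiki.get_site("enwiki")
        >>> page = site.get_page("Main Page")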
    """

    SERVICE_API = 1
    SERVICE_SQL = 2

    SPECIAL_TOKENS = [
        "createaccount",
        "deleteglobalaccount",
        "login",
        "patrol",
        "rollback",
        "setglobalaccountstatus",
        "userrights",
        "watch",
    ]

    def __init__(
        self,
        name=None,
        project=None,
        lang=None,
        base_url=None,
        article_path=None,
        script_path=None,
        sql=None,
        namespaces=None,
        login=(None, None),
        oauth=None,
        cookiejar=None,
        user_agent=None,
        use_https=True,
        assert_edit=None,
        maxlag=None,
        wait_between_queries=1,
        logger=None,
        search_config=None,
    ):
        """Constructor for new Site instances.

        This probably isn't necessary to call yourself unless you're building a
        Site that's not in your config and you don't want to add it - normally
        all you need is wiki.get_site(name), which creates the Site for you
        based on your config file and the sites database. We accept a bunch of
        kwargs, but the only ones you really "need" are *base_url* and
        *script_path*; this is enough to figure out an API url. *login*, a
        tuple of (username, password), can be used to log in using the legacy
        BotPasswords system; otherwise, a dict of OAuth info should be provided
        to *oauth*. *cookiejar* will be used to store cookies, and we'll use a
        normal CookieJar if none is given.

        First, we'll store the given arguments as attributes, then set up our
        requests session. We'll load any of the attributes that weren't given
        from the API, and then log in if a username/pass was given and we
        aren't already logged in.
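
        Example of direct construction (a hedged sketch; the URL and script
        path are the well-known values for the English Wikipedia, shown here
        purely for illustration)::

            >>> site = Site(base_url="https://en.wikipedia.org", script_path="/w")
            >>> site.name
            'enwiki'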
        """
        # Attributes referring to site information, filled in by an API query
        # if they are missing (and an API url can be determined):
        self._name = name
        self._project = project
        self._lang = lang
        self._base_url = base_url
        self._article_path = article_path
        self._script_path = script_path
        self._namespaces = namespaces

        # Attributes used for API queries:
        self._use_https = use_https
        self._assert_edit = assert_edit
        self._maxlag = maxlag
        self._wait_between_queries = wait_between_queries
        self._max_retries = 6
        self._last_query_time = 0
        self._tokens = {}
        self._api_lock = RLock()
        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

        # Attributes used for SQL queries:
        if sql:
            self._sql_data = sql
        else:
            self._sql_data = {}
        self._sql_conn = None
        self._sql_lock = RLock()
        self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None}

        # Attribute used in copyright violation checks (see CopyrightMixIn):
        if search_config:
            self._search_config = search_config
        else:
            self._search_config = {}

        # Set up cookiejar and requests session for making API queries:
        if cookiejar is not None:
            self._cookiejar = cookiejar
        else:
            self._cookiejar = CookieJar()
        self._last_cookiejar_save = None
        if not user_agent:
            user_agent = constants.USER_AGENT  # Set default UA
        self._oauth = oauth
        self._session = requests.Session()
        self._session.cookies = self._cookiejar
        self._session.headers["User-Agent"] = user_agent
        if oauth:
            self._session.auth = OAuth1(
                oauth["consumer_token"],
                oauth["consumer_secret"],
                oauth["access_token"],
                oauth["access_secret"],
            )

        # Set up our internal logger:
        if logger:
            self._logger = logger
        else:  # Just set up a null logger to eat up our messages:
            self._logger = getLogger("earwigbot.wiki")
            self._logger.addHandler(NullHandler())

        # Get all of the above attributes that were not specified as arguments:
        self._load_attributes()

        # If we have a name/pass and the API says we're not logged in, log in:
        self._login_info = name, password = login
        if not self._oauth and name and password:
            logged_in_as = self._get_username_from_cookies()
            if not logged_in_as or name.replace("_", " ") != logged_in_as:
                self._login(login)

    def __repr__(self):
        """Return the canonical string representation of the Site."""
        res = ", ".join(
            (
                "Site(name={_name!r}",
                "project={_project!r}",
                "lang={_lang!r}",
                "base_url={_base_url!r}",
                "article_path={_article_path!r}",
                "script_path={_script_path!r}",
                "use_https={_use_https!r}",
                "assert_edit={_assert_edit!r}",
                "maxlag={_maxlag!r}",
                "sql={_sql_data!r}",
                "login={0}",
                "oauth={1}",
                "user_agent={3!r}",
                "cookiejar={2})",
            )
        )
        name, password = self._login_info
        login = "({}, {})".format(repr(name), "hidden" if password else None)
        oauth = "hidden" if self._oauth else None
        cookies = self._cookiejar.__class__.__name__
        if hasattr(self._cookiejar, "filename"):
            cookies += "({!r})".format(getattr(self._cookiejar, "filename"))
        else:
            cookies += "()"
        agent = self.user_agent
        return res.format(login, oauth, cookies, agent, **self.__dict__)

    def __str__(self):
        """Return a nice string representation of the Site."""
        res = "<Site {0} ({1}:{2}) at {3}>"
        return res.format(self.name, self.project, self.lang, self.domain)

    def _unicodeify(self, value, encoding="utf8"):
        """Return input as unicode if it's not unicode to begin with."""
        if isinstance(value, str):
            return value
        return str(value, encoding)

    def _api_query(
        self,
        params,
        tries=0,
        wait=5,
        ignore_maxlag=False,
        no_assert=False,
        ae_retry=True,
    ):
        """Do an API query with *params* as a dict of parameters.

        See the documentation for :py:meth:`api_query` for full implementation
        details. *tries*, *wait*, and *ignore_maxlag* are for maxlag;
        *no_assert* and *ae_retry* are for AssertEdit.
        """
        since_last_query = time() - self._last_query_time  # Throttling support
        if since_last_query < self._wait_between_queries:
            wait_time = self._wait_between_queries - since_last_query
            log = f"Throttled: waiting {round(wait_time, 2)} seconds"
            self._logger.debug(log)
            sleep(wait_time)
        self._last_query_time = time()

        url, params = self._build_api_query(params, ignore_maxlag, no_assert)
        if "lgpassword" in params:
            self._logger.debug(f"{url} -> <hidden>")
        else:
            data = dumps(params)
            if len(data) > 1000:
                self._logger.debug(f"{url} -> {data[:997]}...")
            else:
                self._logger.debug(f"{url} -> {data}")

        try:
            response = self._session.post(url, data=params)
            response.raise_for_status()
        except requests.RequestException as exc:
            raise exceptions.APIError(f"API query failed: {exc}")

        return self._handle_api_result(response, params, tries, wait, ae_retry)

    def _request_csrf_token(self, params):
        """If possible, add a request for a CSRF token to an API query."""
        if params.get("action") == "query":
            if params.get("meta"):
                if "tokens" not in params["meta"].split("|"):
                    params["meta"] += "|tokens"
            else:
                params["meta"] = "tokens"
            if params.get("type"):
                if "csrf" not in params["type"].split("|"):
                    params["type"] += "|csrf"

    def _build_api_query(self, params, ignore_maxlag, no_assert):
        """Given API query params, return the URL to query and POST data."""
        if not self._base_url or self._script_path is None:
            e = "Tried to do an API query, but no API URL is known."
            raise exceptions.APIError(e)

        url = self.url + self._script_path + "/api.php"
        params["format"] = "json"  # This is the only format we understand
        if self._assert_edit and not no_assert:
            # If requested, ensure that we're logged in
            params["assert"] = self._assert_edit
        if self._maxlag and not ignore_maxlag:
            # If requested, don't overload the servers:
            params["maxlag"] = self._maxlag
        if "csrf" not in self._tokens:
            # If we don't have a CSRF token, try to fetch one:
            self._request_csrf_token(params)
        return url, params

    def _handle_api_result(self, response, params, tries, wait, ae_retry):
        """Given an API query response, attempt to return useful data."""
        try:
            res = response.json()
        except ValueError:
            e = "API query failed: JSON could not be decoded."
            raise exceptions.APIError(e)

        if "warnings" in res:
            for name, value in res["warnings"].items():
                try:
                    warning = value["warnings"]
                except KeyError:
                    try:
                        warning = value["*"]
                    except KeyError:
                        warning = value
                self._logger.warning("API warning: %s: %s", name, warning)

        if self._should_save_cookiejar():
            self._save_cookiejar()

        try:
            code = res["error"]["code"]
            info = res["error"]["info"]
        except (TypeError, KeyError):  # If there's no error code/info, return
            if "query" in res and "tokens" in res["query"]:
                for name, token in res["query"]["tokens"].items():
                    self._tokens[name.split("token")[0]] = token
            return res

        if code == "maxlag":  # We've been throttled by the server
            if tries >= self._max_retries:
                e = "Maximum number of retries reached ({0})."
                raise exceptions.APIError(e.format(self._max_retries))
            tries += 1
            msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})'
            self._logger.info(msg.format(info, wait, tries, self._max_retries))
            sleep(wait)
            return self._api_query(params, tries, wait * 2, ae_retry=ae_retry)
        elif code in ["assertuserfailed", "assertbotfailed"]:  # AssertEdit
            if ae_retry and all(self._login_info) and not self._oauth:
                # Try to log in if we got logged out:
                self._login(self._login_info)
                if "token" in params:  # Fetch a new one; this is invalid now
                    params["token"] = self.get_token(params["action"])
                return self._api_query(params, tries, wait, ae_retry=False)
            if not all(self._login_info) and not self._oauth:
                e = "Assertion failed, and no login info was provided."
            elif code == "assertbotfailed":
                e = "Bot assertion failed: we don't have a bot flag!"
            else:
                e = "User assertion failed due to an unknown issue. Cookie or OAuth problem?"
            raise exceptions.PermissionsError("AssertEdit: " + e)
        else:  # Some unknown error occurred
            e = 'API query failed: got error "{0}"; server says: "{1}".'
            error = exceptions.APIError(e.format(code, info))
            error.code, error.info = code, info
            raise error

    def _load_attributes(self, force=False):
        """Load data about our Site from the API.

        This function is called by __init__() when one of the site attributes
        was not given as a keyword argument. We'll do an API query to get the
        missing data, but only if there actually *is* missing data.

        Additionally, you can call this with *force* set to True to forcibly
        reload all attributes.
        """
        # All attributes to be loaded, except _namespaces, which is a special
        # case because it requires additional params in the API query:
        attrs = [
            self._name,
            self._project,
            self._lang,
            self._base_url,
            self._article_path,
            self._script_path,
        ]

        params = {"action": "query", "meta": "siteinfo", "siprop": "general"}

        if not self._namespaces or force:
            params["siprop"] += "|namespaces|namespacealiases"
            with self._api_lock:
                result = self._api_query(params, no_assert=True)
            self._load_namespaces(result)
        elif all(attrs):  # Everything is already specified and we're not told
            return  # to force a reload, so do nothing
        else:  # We're only loading attributes other than _namespaces
            with self._api_lock:
                result = self._api_query(params, no_assert=True)

        res = result["query"]["general"]
        self._name = res["wikiid"]
        self._project = res["sitename"].lower()
        self._lang = res["lang"]
        self._base_url = res["server"]
        self._article_path = res["articlepath"]
        self._script_path = res["scriptpath"]

    def _load_namespaces(self, result):
        """Fill self._namespaces with a dict of namespace IDs and names.

        Called by _load_attributes() with API data as *result* when
        self._namespaces was not given as a kwarg to __init__().
        """
        self._namespaces = {}

        for namespace in result["query"]["namespaces"].values():
            ns_id = namespace["id"]
            name = namespace["*"]
            try:
                canonical = namespace["canonical"]
            except KeyError:
                self._namespaces[ns_id] = [name]
            else:
                if name != canonical:
                    self._namespaces[ns_id] = [name, canonical]
                else:
                    self._namespaces[ns_id] = [name]

        for namespace in result["query"]["namespacealiases"]:
            ns_id = namespace["id"]
            alias = namespace["*"]
            self._namespaces[ns_id].append(alias)

    def _get_cookie(self, name, domain):
        """Return the named cookie unless it is expired or doesn't exist."""
        for cookie in self._cookiejar:
            if cookie.name == name and cookie.domain == domain:
                if cookie.is_expired():
                    break
                return cookie

    def _get_username_from_cookies(self):
        """Try to return our username based solely on cookies.

        First, we'll look for a cookie named self._name + "Token", like
        "enwikiToken". If it exists and isn't expired, we'll assume it's valid
        and try to return the value of the cookie self._name + "UserName" (like
        "enwikiUserName"). This should work fine on wikis without single-user
        login.

        If `enwikiToken` doesn't exist, we'll try to find a cookie named
        `centralauth_Token`. If this exists and is not expired, we'll try to
        return the value of `centralauth_User`.

        If we didn't get any matches, we'll return None. Our goal here isn't to
        return the most likely username, or what we *want* our username to be
        (for that, we'd do self._login_info[0]), but rather to get our current
        username without an unnecessary ?action=query&meta=userinfo API query.
        """
        name = "".join((self._name, "Token"))
        cookie = self._get_cookie(name, self.domain)

        if cookie:
            name = "".join((self._name, "UserName"))
            user_name = self._get_cookie(name, self.domain)
            if user_name:
                return unquote_plus(user_name.value)

        for cookie in self._cookiejar:
            if cookie.name != "centralauth_Token" or cookie.is_expired():
                continue
            base = cookie.domain
            if base.startswith(".") and not cookie.domain_initial_dot:
                base = base[1:]
            if self.domain.endswith(base):
                user_name = self._get_cookie("centralauth_User", cookie.domain)
                if user_name:
                    return unquote_plus(user_name.value)

    def _get_username_from_api(self):
        """Do a simple API query to get our username and return it.

        This is a reliable way to make sure we are actually logged in, because
        it doesn't deal with annoying cookie logic, but it results in an API
        query that is unnecessary in some cases.

        Called by _get_username() (in turn called by get_user() with no
        username argument) when cookie lookup fails, probably indicating that
        we are logged out.
        """
        result = self.api_query(action="query", meta="userinfo")
        return result["query"]["userinfo"]["name"]

    def _get_username(self):
        """Return the name of the current user, whether logged in or not.

        First, we'll try to deduce it solely from cookies, to avoid an
        unnecessary API query. For the cookie-detection method, see
        _get_username_from_cookies()'s docs.

        If our username isn't in cookies, then we're either using OAuth or
        we're probably not logged in, or something fishy is going on (like
        forced logout). If we're using OAuth and a username was configured,
        assume it is accurate and use it. Otherwise, do a single API query for
        our username (or IP address) and return that.
        """
        name = self._get_username_from_cookies()
        if name:
            return name
        if self._oauth and self._login_info[0]:
            return self._login_info[0]
        return self._get_username_from_api()

    def _should_save_cookiejar(self):
        """Return a bool indicating whether we should save the cookiejar.

        This is True if we haven't saved the cookiejar yet this session, or if
        our last save was over a day ago.
        """
        max_staleness = 60 * 60 * 24  # 1 day
        if not self._last_cookiejar_save:
            return True
        return time() - self._last_cookiejar_save > max_staleness

    def _save_cookiejar(self):
        """Try to save our cookiejar after doing a (normal) login or logout.

        Calls the standard .save() method with no filename. Don't fret if our
        cookiejar doesn't support saving (CookieJar raises AttributeError,
        FileCookieJar raises NotImplementedError) or no default filename was
        given (LWPCookieJar and MozillaCookieJar raise ValueError).
        """
        if hasattr(self._cookiejar, "save"):
            try:
                getattr(self._cookiejar, "save")()
            except (NotImplementedError, ValueError):
                pass
        self._last_cookiejar_save = time()

    def _login(self, login):
        """Safely log in through the API.

        Normally, this is called by __init__() if a username and password have
        been provided and no valid login cookies were found. The only other
        time it needs to be called is when those cookies expire, which is done
        automatically by api_query() if a query fails.

        *login* is a (username, password) tuple.

        Raises LoginError on login errors (duh), like bad passwords and
        nonexistent usernames.
        """
        self._tokens.clear()
        name, password = login

        params = {"action": "query", "meta": "tokens", "type": "login"}
        with self._api_lock:
            result = self._api_query(params, no_assert=True)
        try:
            token = result["query"]["tokens"]["logintoken"]
        except KeyError:
            raise exceptions.LoginError("Couldn't get login token")

        params = {
            "action": "login",
            "lgname": name,
            "lgpassword": password,
            "lgtoken": token,
        }
        with self._api_lock:
            result = self._api_query(params, no_assert=True)

        res = result["login"]["result"]
        if res == "Success":
            self._tokens.clear()
            self._save_cookiejar()
            return
        if res == "Illegal":
            e = "The provided username is illegal."
        elif res == "NotExists":
            e = "The provided username does not exist."
        elif res == "EmptyPass":
            e = "No password was given."
        elif res == "WrongPass" or res == "WrongPluginPass":
            e = "The given password is incorrect."
        else:
            e = f"Couldn't login; server says '{res}'."
        raise exceptions.LoginError(e)

    def _logout(self):
        """Safely log out through the API.

        We'll do a simple API request (api.php?action=logout), clear our
        cookiejar (which probably contains now-invalidated cookies) and try to
        save it, if it supports that sort of thing.
        """
        self.api_query(action="logout")
        self._cookiejar.clear()
        self._save_cookiejar()

    def _sql_connect(self, **kwargs):
        """Attempt to establish a connection with this site's SQL database.

        pymysql.connect() will be called with self._sql_data as its kwargs.
        Any kwargs given to this function will be passed to connect() and will
        have precedence over the config file.

        Will raise SQLError() if the module "pymysql" is not available. pymysql
        may raise its own exceptions (e.g. pymysql.InterfaceError) if it cannot
        establish a connection.
        """
        args = self._sql_data
        for key, value in kwargs.items():
            args[key] = value
        if (
            "read_default_file" not in args
            and "user" not in args
            and "passwd" not in args
        ):
            args["read_default_file"] = expanduser("~/.my.cnf")
        elif "read_default_file" in args:
            args["read_default_file"] = expanduser(args["read_default_file"])
        if "autoping" not in args:
            args["autoping"] = True
        if "autoreconnect" not in args:
            args["autoreconnect"] = True

        try:
            self._sql_conn = pymysql.connect(**args)
        except ImportError:
            e = "SQL querying requires the 'pymysql' package: https://pymysql.readthedocs.io/"
            raise exceptions.SQLError(e)

    def _get_service_order(self):
        """Return a preferred order for using services (e.g. the API and SQL).

        A list is returned, starting with the most preferred service first and
        ending with the least preferred one. Currently, there are only two
        services. SERVICE_API will always be included since the API is expected
        to be always usable. In normal circumstances, self.SERVICE_SQL will be
        first (with the API second), since using SQL directly is easier on the
        servers than making web queries with the API. self.SERVICE_SQL will be
        second if replag is greater than five minutes (a cached value updated
        at most once every two minutes), *unless* API lag is also very high.
        self.SERVICE_SQL will not be included in the list if we cannot form a
        proper SQL connection.
        """
        now = time()
        if now - self._sql_info_cache["lastcheck"] > 120:
            self._sql_info_cache["lastcheck"] = now
            try:
                try:
                    self._sql_info_cache["replag"] = sqllag = self.get_replag()
                except pymysql.Error as exc:
                    raise exceptions.SQLError(str(exc))
            except (exceptions.SQLError, ImportError):
                self._sql_info_cache["usable"] = False
                return [self.SERVICE_API]
            self._sql_info_cache["usable"] = True
        else:
            if not self._sql_info_cache["usable"]:
                return [self.SERVICE_API]
            sqllag = self._sql_info_cache["replag"]

        if sqllag > 300:
            if not self._maxlag:
                return [self.SERVICE_API, self.SERVICE_SQL]
            if now - self._api_info_cache["lastcheck"] > 300:
                self._api_info_cache["lastcheck"] = now
                try:
                    self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
                except exceptions.APIError:
                    self._api_info_cache["maxlag"] = apilag = 0
            else:
                apilag = self._api_info_cache["maxlag"]
            if apilag > self._maxlag:
                return [self.SERVICE_SQL, self.SERVICE_API]
            return [self.SERVICE_API, self.SERVICE_SQL]

        return [self.SERVICE_SQL, self.SERVICE_API]

    @property
    def name(self):
        """The Site's name (or "wikiid" in the API), like ``"enwiki"``."""
        return self._name

    @property
    def project(self):
        """The Site's project name in lowercase, like ``"wikipedia"``."""
        return self._project

    @property
    def lang(self):
        """The Site's language code, like ``"en"`` or ``"es"``."""
        return self._lang

    @property
    def domain(self):
        """The Site's web domain, like ``"en.wikipedia.org"``."""
        return urlparse(self._base_url).netloc

    @property
    def url(self):
        """The Site's full base URL, like ``"https://en.wikipedia.org"``."""
        url = self._base_url
        if url.startswith("//"):  # Protocol-relative URLs from 1.18
            if self._use_https:
                url = "https:" + url
            else:
                url = "http:" + url
        return url

    @property
    def user_agent(self):
        """The User-Agent header sent to the API by the requests session."""
        return self._session.headers["User-Agent"]

    def api_query(self, **kwargs):
        """Do an API query with `kwargs` as the parameters.

        This will first attempt to construct an API url from
        :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need
        both of these, or else we'll raise
        :py:exc:`~earwigbot.exceptions.APIError`. If
        :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki
        1.18), we'll choose HTTPS only if :py:attr:`self._use_https` is
        ``True``, otherwise HTTP.

        We'll encode the given params, adding ``format=json`` along the way, as
        well as ``&assert=`` and ``&maxlag=`` based on
        :py:attr:`self._assert_edit` and :py:attr:`_maxlag` respectively.
        Additionally, we'll sleep a bit if the last query was made fewer than
        :py:attr:`self._wait_between_queries` seconds ago. The request is made
        through :py:attr:`self._session`, which has cookie support
        (:py:attr:`self._cookiejar`) and a ``User-Agent``
        (:py:const:`earwigbot.wiki.constants.USER_AGENT`).

        Assuming everything went well, we'll gunzip the data (if compressed),
        load it as a JSON object, and return it.

        If our request failed for some reason, we'll raise
        :py:exc:`~earwigbot.exceptions.APIError` with details. If that
        reason was due to maxlag, we'll sleep for a bit and then repeat the
        query until we exceed :py:attr:`self._max_retries`.

        There is helpful MediaWiki API documentation at `MediaWiki.org
        <https://www.mediawiki.org/wiki/API>`_.
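
        Example (a hedged sketch; the parameters are standard MediaWiki API
        arguments and the response shown is abbreviated)::

            >>> res = site.api_query(action="query", meta="siteinfo", siprop="general")
            >>> res["query"]["general"]["sitename"]
            'Wikipedia'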
        """
        with self._api_lock:
            return self._api_query(kwargs)

    def sql_query(
        self,
        query,
        params=(),
        plain_query=False,
        dict_cursor=False,
        cursor_class=None,
        buffsize=1024,
    ):
        """Do an SQL query and yield its results.

        If *plain_query* is ``True``, we will force an unparameterized query.
        Specifying both *params* and *plain_query* will cause an error. If
        *dict_cursor* is ``True``, we will use
        :py:class:`pymysql.cursors.DictCursor` as our cursor, otherwise the
        default :py:class:`pymysql.cursors.Cursor`. If *cursor_class* is given,
        it will override this option.

        *buffsize* is the size of each memory-buffered group of results, to
        reduce the number of conversations with the database; it is passed to
        :py:meth:`cursor.fetchmany() <pymysql.cursors.Cursor.fetchmany>`. If
        set to ``0``, all results will be buffered in memory at once (this
        uses :py:meth:`fetchall() <pymysql.cursors.Cursor.fetchall>`). If set
        to ``1``, it is equivalent to using
        :py:meth:`fetchone() <pymysql.cursors.Cursor.fetchone>`.

        Example usage::

            >>> query = "SELECT user_id, user_registration FROM user WHERE user_name = %s"
            >>> params = ("The Earwig",)
            >>> result1 = site.sql_query(query, params)
            >>> result2 = site.sql_query(query, params, dict_cursor=True)
            >>> for row in result1: print(row)
            (7418060, '20080703215134')
            >>> for row in result2: print(row)
            {'user_id': 7418060, 'user_registration': '20080703215134'}

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
        :py:exc:`pymysql.InterfaceError`, ...) if there were problems with the
        query.

        See :py:meth:`_sql_connect` for information on how a connection is
        acquired. Also relevant is `pymysql's documentation
        <https://pymysql.readthedocs.io/>`_ for details on that package.
        """
        if not cursor_class:
            if dict_cursor:
                cursor_class = pymysql.cursors.DictCursor
            else:
                cursor_class = pymysql.cursors.Cursor
        klass = cursor_class

        with self._sql_lock:
            if not self._sql_conn:
                self._sql_connect()
            with self._sql_conn.cursor(klass) as cur:
                # pymysql's Cursor.execute() only takes (query, args), so honor
                # *plain_query* by skipping parameter substitution entirely:
                if plain_query:
                    if params:
                        raise ValueError("Cannot use both params and plain_query")
                    cur.execute(query)
                else:
                    cur.execute(query, params)
                if buffsize:
                    while True:
                        group = cur.fetchmany(buffsize)
                        if not group:
                            return
                        for result in group:
                            yield result
                for result in cur.fetchall():
                    yield result

    def get_maxlag(self, showall=False):
        """Return the internal database replication lag in seconds.

        In a typical setup, this function returns the replication lag *within*
        the WMF's cluster, *not* external replication lag affecting the
        Toolserver (see :py:meth:`get_replag` for that). This is useful when
        combined with the ``maxlag`` API query param (added by config), in
        which case queries will be halted and retried if the lag is too high,
        usually above five seconds.

        With *showall*, will return a list of the lag for all servers in the
        cluster, not just the one with the highest lag.
        """
        params = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"}
        if showall:
            params["sishowalldb"] = 1
        with self._api_lock:
            result = self._api_query(params, ignore_maxlag=True)
        if showall:
            return [server["lag"] for server in result["query"]["dbrepllag"]]
        return result["query"]["dbrepllag"][0]["lag"]

    def get_replag(self):
        """Return the estimated external database replication lag in seconds.

        Requires SQL access. This function only makes sense on a replicated
        database (e.g. the Wikimedia Toolserver) and on a wiki that receives a
        large number of edits (ideally, at least one per second), or the result
        may be larger than expected, since it works by subtracting the current
        time from the timestamp of the latest recent changes event.

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        pymysql's exceptions (:py:exc:`pymysql.ProgrammingError`,
        :py:exc:`pymysql.InterfaceError`, ...) if there were problems.
        """
        query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM
                   recentchanges ORDER BY rc_timestamp DESC LIMIT 1"""
        result = list(self.sql_query(query))
        return int(result[0][0])

    def get_token(self, action=None, force=False):
        """Return a token for a data-modifying API action.

        In general, this will be a CSRF token, unless *action* is in a special
        list of non-CSRF tokens. Tokens are cached for the session (until
        :meth:`_login` is called again); set *force* to ``True`` to force a new
        token to be fetched.

        Raises :exc:`.APIError` if there was an API issue.
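
        Example (a hedged sketch; the page title and edit text are purely
        illustrative)::

            >>> token = site.get_token()  # CSRF token, good for most actions
            >>> site.api_query(action="edit", title="Project:Sandbox",
            ...                appendtext="test", token=token)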
        """
        if action not in self.SPECIAL_TOKENS:
            action = "csrf"
        if action in self._tokens and not force:
            return self._tokens[action]

        res = self.api_query(action="query", meta="tokens", type=action)
        if action not in self._tokens:
            err = "Tried to fetch a {0} token, but API returned: {1}"
            raise exceptions.APIError(err.format(action, res))
        return self._tokens[action]

    def namespace_id_to_name(self, ns_id, all=False):
        """Given a namespace ID, returns associated namespace names.

        If *all* is ``False`` (default), we'll return the first name in the
        list, which is usually the localized version. Otherwise, we'll return
        the entire list, which includes the canonical name. For example, this
        returns ``u"Wikipedia"`` if *ns_id* = ``4`` and *all* is ``False`` on
        ``enwiki``; returns ``[u"Wikipedia", u"Project", u"WP"]`` if *ns_id* =
        ``4`` and *all* is ``True``.

        Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the ID
        is not found.
        """
        try:
            if all:
                return self._namespaces[ns_id]
            else:
                return self._namespaces[ns_id][0]
        except KeyError:
            e = f"There is no namespace with id {ns_id}."
            raise exceptions.NamespaceNotFoundError(e)

    def namespace_name_to_id(self, name):
        """Given a namespace name, returns the associated ID.

        Like :py:meth:`namespace_id_to_name`, but reversed. Case is ignored,
        because namespaces are assumed to be case-insensitive.

        Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the
        name is not found.
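
        Example (a hedged sketch, mirroring the ``enwiki`` values used in
        :py:meth:`namespace_id_to_name`'s docstring)::

            >>> site.namespace_name_to_id("Wikipedia")
            4
            >>> site.namespace_name_to_id("wp")
            4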
        """
        lname = name.lower()
        for ns_id, names in self._namespaces.items():
            lnames = [n.lower() for n in names]  # Be case-insensitive
            if lname in lnames:
                return ns_id

        e = f"There is no namespace with name '{name}'."
        raise exceptions.NamespaceNotFoundError(e)

    def get_page(self, title, follow_redirects=False, pageid=None):
        """Return a :py:class:`Page` object for the given title.

        *follow_redirects* is passed directly to
        :py:class:`~earwigbot.wiki.page.Page`'s constructor. Also, this will
        return a :py:class:`~earwigbot.wiki.category.Category` object instead
        if the given title is in the category namespace. As
        :py:class:`~earwigbot.wiki.category.Category` is a subclass of
        :py:class:`~earwigbot.wiki.page.Page`, this should not cause problems.

        Note that this doesn't do any direct checks for existence or
        redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
        provide that.
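
        Example (a hedged sketch; the titles are illustrative)::

            >>> page = site.get_page("Main Page")
            >>> cat = site.get_page("Category:Living people")  # a Category object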
        """
        title = self._unicodeify(title)
        prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
        prefix = title.split(":", 1)[0]
        if prefix != title:  # Avoid a page that is simply "Category"
            if prefix in prefixes:
                return Category(self, title, follow_redirects, pageid, self._logger)
        return Page(self, title, follow_redirects, pageid, self._logger)

    def get_category(self, catname, follow_redirects=False, pageid=None):
        """Return a :py:class:`Category` object for the given category name.

        *catname* should be given *without* a namespace prefix. This method is
        really just shorthand for :py:meth:`get_page("Category:" + catname)
        <get_page>`.
        """
        catname = self._unicodeify(catname)
        prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
        pagename = ":".join((prefix, catname))
        return Category(self, pagename, follow_redirects, pageid, self._logger)

    def get_user(self, username=None):
        """Return a :py:class:`User` object for the given username.

        If *username* is left as ``None``, then a
        :py:class:`~earwigbot.wiki.user.User` object representing the currently
        logged-in (or anonymous!) user is returned.
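
        Example (a hedged sketch; the username is illustrative)::

            >>> me = site.get_user()               # the currently logged-in user
            >>> someone = site.get_user("Jimbo Wales")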
        """
        if username:
            username = self._unicodeify(username)
        else:
            username = self._get_username()
        return User(self, username, self._logger)

    def delegate(self, services, args=None, kwargs=None):
        """Delegate a task to either the API or SQL depending on conditions.

        *services* should be a dictionary in which the key is the service name
        (:py:attr:`self.SERVICE_API <SERVICE_API>` or
        :py:attr:`self.SERVICE_SQL <SERVICE_SQL>`), and the value is the
        function to call for this service. All functions will be passed the
        same arguments: the tuple *args* and the dict *kwargs*, which are both
        empty by default. The service order is determined by
        :py:meth:`_get_service_order`.

        Not every service needs an entry in the dictionary. Will raise
        :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate
        service cannot be found.
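
        Example (a hedged sketch; ``count_via_api`` and ``count_via_sql`` are
        hypothetical callables, not part of this class)::

            >>> services = {
            ...     site.SERVICE_API: count_via_api,
            ...     site.SERVICE_SQL: count_via_sql,
            ... }
            >>> total = site.delegate(services, args=("Category:Living people",))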
        """
        if not args:
            args = ()
        if not kwargs:
            kwargs = {}

        order = self._get_service_order()
        for srv in order:
            if srv in services:
                try:
                    return services[srv](*args, **kwargs)
                except exceptions.ServiceError:
                    continue
        raise exceptions.NoServiceError(services)