A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

856 行
36 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from cookielib import CookieJar
  23. from gzip import GzipFile
  24. from json import loads
  25. from logging import getLogger, NullHandler
  26. from os.path import expanduser
  27. from re import escape as re_escape, match as re_match
  28. from StringIO import StringIO
  29. from threading import Lock
  30. from time import sleep, time
  31. from urllib import quote_plus
  32. from urllib2 import build_opener, HTTPCookieProcessor, URLError
  33. from urlparse import urlparse
  34. try:
  35. import oursql
  36. except ImportError:
  37. oursql = None
  38. from earwigbot import exceptions
  39. from earwigbot.wiki import constants
  40. from earwigbot.wiki.category import Category
  41. from earwigbot.wiki.page import Page
  42. from earwigbot.wiki.user import User
  43. __all__ = ["Site"]
  44. class Site(object):
  45. """
  46. **EarwigBot: Wiki Toolset: Site**
  47. Represents a site, with support for API queries and returning
  48. :py:class:`~earwigbot.wiki.page.Page`,
  49. :py:class:`~earwigbot.wiki.user.User`,
  50. and :py:class:`~earwigbot.wiki.category.Category` objects. The constructor
  51. takes a bunch of arguments and you probably won't need to call it directly,
  52. rather :py:meth:`wiki.get_site() <earwigbot.wiki.sitesdb.SitesDB.get_site>`
  53. for returning :py:class:`Site`
  54. instances, :py:meth:`wiki.add_site()
  55. <earwigbot.wiki.sitesdb.SitesDB.add_site>` for adding new ones to our
  56. database, and :py:meth:`wiki.remove_site()
  57. <earwigbot.wiki.sitesdb.SitesDB.remove_site>` for removing old ones from
  58. our database, should suffice.
  59. *Attributes:*
  60. - :py:attr:`name`: the site's name (or "wikiid"), like ``"enwiki"``
  61. - :py:attr:`project`: the site's project name, like ``"wikipedia"``
  62. - :py:attr:`lang`: the site's language code, like ``"en"``
  63. - :py:attr:`domain`: the site's web domain, like ``"en.wikipedia.org"``
  64. - :py:attr:`url`: the site's URL, like ``"https://en.wikipedia.org"``
  65. *Public methods:*
  66. - :py:meth:`api_query`: does an API query with kwargs as params
  67. - :py:meth:`sql_query`: does an SQL query and yields its results
  68. - :py:meth:`get_maxlag`: returns the internal database lag
  69. - :py:meth:`get_replag`: estimates the external database lag
  70. - :py:meth:`namespace_id_to_name`: returns names associated with an NS id
  71. - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name
  72. - :py:meth:`get_page`: returns a Page for the given title
  73. - :py:meth:`get_category`: returns a Category for the given title
  74. - :py:meth:`get_user`: returns a User object for the given name
  75. - :py:meth:`delegate`: controls when the API or SQL is used
  76. """
  77. SERVICE_API = 1
  78. SERVICE_SQL = 2
  79. def __init__(self, name=None, project=None, lang=None, base_url=None,
  80. article_path=None, script_path=None, sql=None,
  81. namespaces=None, login=(None, None), cookiejar=None,
  82. user_agent=None, use_https=False, assert_edit=None,
  83. maxlag=None, wait_between_queries=3, logger=None,
  84. search_config=None):
  85. """Constructor for new Site instances.
  86. This probably isn't necessary to call yourself unless you're building a
  87. Site that's not in your config and you don't want to add it - normally
  88. all you need is wiki.get_site(name), which creates the Site for you
  89. based on your config file and the sites database. We accept a bunch of
  90. kwargs, but the only ones you really "need" are *base_url* and
  91. *script_path*; this is enough to figure out an API url. *login*, a
  92. tuple of (username, password), is highly recommended. *cookiejar* will
  93. be used to store cookies, and we'll use a normal CookieJar if none is
  94. given.
  95. First, we'll store the given arguments as attributes, then set up our
  96. URL opener. We'll load any of the attributes that weren't given from
  97. the API, and then log in if a username/pass was given and we aren't
  98. already logged in.
  99. """
  100. # Attributes referring to site information, filled in by an API query
  101. # if they are missing (and an API url can be determined):
  102. self._name = name
  103. self._project = project
  104. self._lang = lang
  105. self._base_url = base_url
  106. self._article_path = article_path
  107. self._script_path = script_path
  108. self._namespaces = namespaces
  109. # Attributes used for API queries:
  110. self._use_https = use_https
  111. self._assert_edit = assert_edit
  112. self._maxlag = maxlag
  113. self._wait_between_queries = wait_between_queries
  114. self._max_retries = 6
  115. self._last_query_time = 0
  116. self._api_lock = Lock()
  117. self._api_info_cache = {"maxlag": 0, "lastcheck": 0}
  118. # Attributes used for SQL queries:
  119. if sql:
  120. self._sql_data = sql
  121. else:
  122. self._sql_data = {}
  123. self._sql_conn = None
  124. self._sql_lock = Lock()
  125. self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None}
  126. # Attribute used in copyright violation checks (see CopyrightMixIn):
  127. if search_config:
  128. self._search_config = search_config
  129. else:
  130. self._search_config = {}
  131. # Set up cookiejar and URL opener for making API queries:
  132. if cookiejar is not None:
  133. self._cookiejar = cookiejar
  134. else:
  135. self._cookiejar = CookieJar()
  136. if not user_agent:
  137. user_agent = constants.USER_AGENT # Set default UA
  138. self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
  139. self._opener.addheaders = [("User-Agent", user_agent),
  140. ("Accept-Encoding", "gzip")]
  141. # Set up our internal logger:
  142. if logger:
  143. self._logger = logger
  144. else: # Just set up a null logger to eat up our messages:
  145. self._logger = getLogger("earwigbot.wiki")
  146. self._logger.addHandler(NullHandler())
  147. # Get all of the above attributes that were not specified as arguments:
  148. self._load_attributes()
  149. # If we have a name/pass and the API says we're not logged in, log in:
  150. self._login_info = name, password = login
  151. if name and password:
  152. logged_in_as = self._get_username_from_cookies()
  153. if not logged_in_as or name != logged_in_as:
  154. self._login(login)
  155. def __repr__(self):
  156. """Return the canonical string representation of the Site."""
  157. res = ", ".join((
  158. "Site(name={_name!r}", "project={_project!r}", "lang={_lang!r}",
  159. "base_url={_base_url!r}", "article_path={_article_path!r}",
  160. "script_path={_script_path!r}", "use_https={_use_https!r}",
  161. "assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}",
  162. "sql={_sql_data!r}", "login={0}", "user_agent={2!r}",
  163. "cookiejar={1})"))
  164. name, password = self._login_info
  165. login = "({0}, {1})".format(repr(name), "hidden" if password else None)
  166. cookies = self._cookiejar.__class__.__name__
  167. if hasattr(self._cookiejar, "filename"):
  168. cookies += "({0!r})".format(getattr(self._cookiejar, "filename"))
  169. else:
  170. cookies += "()"
  171. agent = self._opener.addheaders[0][1]
  172. return res.format(login, cookies, agent, **self.__dict__)
  173. def __str__(self):
  174. """Return a nice string representation of the Site."""
  175. res = "<Site {0} ({1}:{2}) at {3}>"
  176. return res.format(self.name, self.project, self.lang, self.domain)
  177. def _unicodeify(self, value, encoding="utf8"):
  178. """Return input as unicode if it's not unicode to begin with."""
  179. if isinstance(value, unicode):
  180. return value
  181. return unicode(value, encoding)
  182. def _urlencode_utf8(self, params):
  183. """Implement urllib.urlencode() with support for unicode input."""
  184. enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s)
  185. args = []
  186. for key, val in params.iteritems():
  187. key = quote_plus(enc(key))
  188. val = quote_plus(enc(val))
  189. args.append(key + "=" + val)
  190. return "&".join(args)
  191. def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False):
  192. """Do an API query with *params* as a dict of parameters.
  193. See the documentation for :py:meth:`api_query` for full implementation
  194. details.
  195. """
  196. since_last_query = time() - self._last_query_time # Throttling support
  197. if since_last_query < self._wait_between_queries:
  198. wait_time = self._wait_between_queries - since_last_query
  199. log = "Throttled: waiting {0} seconds".format(round(wait_time, 2))
  200. self._logger.debug(log)
  201. sleep(wait_time)
  202. self._last_query_time = time()
  203. url, data = self._build_api_query(params, ignore_maxlag)
  204. if "lgpassword" in params:
  205. self._logger.debug("{0} -> <hidden>".format(url))
  206. else:
  207. self._logger.debug("{0} -> {1}".format(url, data))
  208. try:
  209. response = self._opener.open(url, data)
  210. except URLError as error:
  211. if hasattr(error, "reason"):
  212. e = "API query failed: {0}.".format(error.reason)
  213. elif hasattr(error, "code"):
  214. e = "API query failed: got an error code of {0}."
  215. e = e.format(error.code)
  216. else:
  217. e = "API query failed."
  218. raise exceptions.APIError(e)
  219. result = response.read()
  220. if response.headers.get("Content-Encoding") == "gzip":
  221. stream = StringIO(result)
  222. gzipper = GzipFile(fileobj=stream)
  223. result = gzipper.read()
  224. return self._handle_api_query_result(result, params, tries, wait)
  225. def _build_api_query(self, params, ignore_maxlag):
  226. """Given API query params, return the URL to query and POST data."""
  227. if not self._base_url or self._script_path is None:
  228. e = "Tried to do an API query, but no API URL is known."
  229. raise exceptions.APIError(e)
  230. url = ''.join((self.url, self._script_path, "/api.php"))
  231. params["format"] = "json" # This is the only format we understand
  232. if self._assert_edit: # If requested, ensure that we're logged in
  233. params["assert"] = self._assert_edit
  234. if self._maxlag and not ignore_maxlag:
  235. # If requested, don't overload the servers:
  236. params["maxlag"] = self._maxlag
  237. data = self._urlencode_utf8(params)
  238. return url, data
  239. def _handle_api_query_result(self, result, params, tries, wait):
  240. """Given the result of an API query, attempt to return useful data."""
  241. try:
  242. res = loads(result) # Try to parse as a JSON object
  243. except ValueError:
  244. e = "API query failed: JSON could not be decoded."
  245. raise exceptions.APIError(e)
  246. try:
  247. code = res["error"]["code"]
  248. info = res["error"]["info"]
  249. except (TypeError, KeyError): # Having these keys indicates a problem
  250. return res # All is well; return the decoded JSON
  251. if code == "maxlag": # We've been throttled by the server
  252. if tries >= self._max_retries:
  253. e = "Maximum number of retries reached ({0})."
  254. raise exceptions.APIError(e.format(self._max_retries))
  255. tries += 1
  256. msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})'
  257. self._logger.info(msg.format(info, wait, tries, self._max_retries))
  258. sleep(wait)
  259. return self._api_query(params, tries=tries, wait=wait*2)
  260. else: # Some unknown error occurred
  261. e = 'API query failed: got error "{0}"; server says: "{1}".'
  262. error = exceptions.APIError(e.format(code, info))
  263. error.code, error.info = code, info
  264. raise error
  265. def _load_attributes(self, force=False):
  266. """Load data about our Site from the API.
  267. This function is called by __init__() when one of the site attributes
  268. was not given as a keyword argument. We'll do an API query to get the
  269. missing data, but only if there actually *is* missing data.
  270. Additionally, you can call this with *force* set to True to forcibly
  271. reload all attributes.
  272. """
  273. # All attributes to be loaded, except _namespaces, which is a special
  274. # case because it requires additional params in the API query:
  275. attrs = [self._name, self._project, self._lang, self._base_url,
  276. self._article_path, self._script_path]
  277. params = {"action": "query", "meta": "siteinfo", "siprop": "general"}
  278. if not self._namespaces or force:
  279. params["siprop"] += "|namespaces|namespacealiases"
  280. result = self.api_query(**params)
  281. self._load_namespaces(result)
  282. elif all(attrs): # Everything is already specified and we're not told
  283. return # to force a reload, so do nothing
  284. else: # We're only loading attributes other than _namespaces
  285. result = self.api_query(**params)
  286. res = result["query"]["general"]
  287. self._name = res["wikiid"]
  288. self._project = res["sitename"].lower()
  289. self._lang = res["lang"]
  290. self._base_url = res["server"]
  291. self._article_path = res["articlepath"]
  292. self._script_path = res["scriptpath"]
  293. def _load_namespaces(self, result):
  294. """Fill self._namespaces with a dict of namespace IDs and names.
  295. Called by _load_attributes() with API data as *result* when
  296. self._namespaces was not given as an kwarg to __init__().
  297. """
  298. self._namespaces = {}
  299. for namespace in result["query"]["namespaces"].values():
  300. ns_id = namespace["id"]
  301. name = namespace["*"]
  302. try:
  303. canonical = namespace["canonical"]
  304. except KeyError:
  305. self._namespaces[ns_id] = [name]
  306. else:
  307. if name != canonical:
  308. self._namespaces[ns_id] = [name, canonical]
  309. else:
  310. self._namespaces[ns_id] = [name]
  311. for namespace in result["query"]["namespacealiases"]:
  312. ns_id = namespace["id"]
  313. alias = namespace["*"]
  314. self._namespaces[ns_id].append(alias)
  315. def _get_cookie(self, name, domain):
  316. """Return the named cookie unless it is expired or doesn't exist."""
  317. for cookie in self._cookiejar:
  318. if cookie.name == name and cookie.domain == domain:
  319. if cookie.is_expired():
  320. break
  321. return cookie
  322. def _get_username_from_cookies(self):
  323. """Try to return our username based solely on cookies.
  324. First, we'll look for a cookie named self._name + "Token", like
  325. "enwikiToken". If it exists and isn't expired, we'll assume it's valid
  326. and try to return the value of the cookie self._name + "UserName" (like
  327. "enwikiUserName"). This should work fine on wikis without single-user
  328. login.
  329. If `enwikiToken` doesn't exist, we'll try to find a cookie named
  330. `centralauth_Token`. If this exists and is not expired, we'll try to
  331. return the value of `centralauth_User`.
  332. If we didn't get any matches, we'll return None. Our goal here isn't to
  333. return the most likely username, or what we *want* our username to be
  334. (for that, we'd do self._login_info[0]), but rather to get our current
  335. username without an unnecessary ?action=query&meta=userinfo API query.
  336. """
  337. name = ''.join((self._name, "Token"))
  338. cookie = self._get_cookie(name, self.domain)
  339. if cookie:
  340. name = ''.join((self._name, "UserName"))
  341. user_name = self._get_cookie(name, self.domain)
  342. if user_name:
  343. return user_name.value
  344. name = "centralauth_Token"
  345. for cookie in self._cookiejar:
  346. if not cookie.domain_initial_dot or cookie.is_expired():
  347. continue
  348. if cookie.name != name:
  349. continue
  350. # Build a regex that will match domains this cookie affects:
  351. search = ''.join(("(.*?)", re_escape(cookie.domain)))
  352. if re_match(search, self.domain): # Test it against our site
  353. user_name = self._get_cookie("centralauth_User", cookie.domain)
  354. if user_name:
  355. return user_name.value
  356. def _get_username_from_api(self):
  357. """Do a simple API query to get our username and return it.
  358. This is a reliable way to make sure we are actually logged in, because
  359. it doesn't deal with annoying cookie logic, but it results in an API
  360. query that is unnecessary in some cases.
  361. Called by _get_username() (in turn called by get_user() with no
  362. username argument) when cookie lookup fails, probably indicating that
  363. we are logged out.
  364. """
  365. result = self.api_query(action="query", meta="userinfo")
  366. return result["query"]["userinfo"]["name"]
  367. def _get_username(self):
  368. """Return the name of the current user, whether logged in or not.
  369. First, we'll try to deduce it solely from cookies, to avoid an
  370. unnecessary API query. For the cookie-detection method, see
  371. _get_username_from_cookies()'s docs.
  372. If our username isn't in cookies, then we're probably not logged in, or
  373. something fishy is going on (like forced logout). In this case, do a
  374. single API query for our username (or IP address) and return that.
  375. """
  376. name = self._get_username_from_cookies()
  377. if name:
  378. return name
  379. return self._get_username_from_api()
  380. def _save_cookiejar(self):
  381. """Try to save our cookiejar after doing a (normal) login or logout.
  382. Calls the standard .save() method with no filename. Don't fret if our
  383. cookiejar doesn't support saving (CookieJar raises AttributeError,
  384. FileCookieJar raises NotImplementedError) or no default filename was
  385. given (LWPCookieJar and MozillaCookieJar raise ValueError).
  386. """
  387. if hasattr(self._cookiejar, "save"):
  388. try:
  389. getattr(self._cookiejar, "save")()
  390. except (NotImplementedError, ValueError):
  391. pass
  392. def _login(self, login, token=None, attempt=0):
  393. """Safely login through the API.
  394. Normally, this is called by __init__() if a username and password have
  395. been provided and no valid login cookies were found. The only other
  396. time it needs to be called is when those cookies expire, which is done
  397. automatically by api_query() if a query fails.
  398. Recent versions of MediaWiki's API have fixed a CSRF vulnerability,
  399. requiring login to be done in two separate requests. If the response
  400. from from our initial request is "NeedToken", we'll do another one with
  401. the token. If login is successful, we'll try to save our cookiejar.
  402. Raises LoginError on login errors (duh), like bad passwords and
  403. nonexistent usernames.
  404. *login* is a (username, password) tuple. *token* is the token returned
  405. from our first request, and *attempt* is to prevent getting stuck in a
  406. loop if MediaWiki isn't acting right.
  407. """
  408. name, password = login
  409. if token:
  410. result = self.api_query(action="login", lgname=name,
  411. lgpassword=password, lgtoken=token)
  412. else:
  413. result = self.api_query(action="login", lgname=name,
  414. lgpassword=password)
  415. res = result["login"]["result"]
  416. if res == "Success":
  417. self._save_cookiejar()
  418. elif res == "NeedToken" and attempt == 0:
  419. token = result["login"]["token"]
  420. return self._login(login, token, attempt=1)
  421. else:
  422. if res == "Illegal":
  423. e = "The provided username is illegal."
  424. elif res == "NotExists":
  425. e = "The provided username does not exist."
  426. elif res == "EmptyPass":
  427. e = "No password was given."
  428. elif res == "WrongPass" or res == "WrongPluginPass":
  429. e = "The given password is incorrect."
  430. else:
  431. e = "Couldn't login; server says '{0}'.".format(res)
  432. raise exceptions.LoginError(e)
  433. def _logout(self):
  434. """Safely logout through the API.
  435. We'll do a simple API request (api.php?action=logout), clear our
  436. cookiejar (which probably contains now-invalidated cookies) and try to
  437. save it, if it supports that sort of thing.
  438. """
  439. self.api_query(action="logout")
  440. self._cookiejar.clear()
  441. self._save_cookiejar()
  442. def _sql_connect(self, **kwargs):
  443. """Attempt to establish a connection with this site's SQL database.
  444. oursql.connect() will be called with self._sql_data as its kwargs.
  445. Any kwargs given to this function will be passed to connect() and will
  446. have precedence over the config file.
  447. Will raise SQLError() if the module "oursql" is not available. oursql
  448. may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot
  449. establish a connection.
  450. """
  451. if not oursql:
  452. e = "Module 'oursql' is required for SQL queries."
  453. raise exceptions.SQLError(e)
  454. args = self._sql_data
  455. for key, value in kwargs.iteritems():
  456. args[key] = value
  457. if "read_default_file" not in args and "user" not in args and "passwd" not in args:
  458. args["read_default_file"] = expanduser("~/.my.cnf")
  459. if "autoping" not in args:
  460. args["autoping"] = True
  461. if "autoreconnect" not in args:
  462. args["autoreconnect"] = True
  463. self._sql_conn = oursql.connect(**args)
  464. def _get_service_order(self):
  465. """Return a preferred order for using services (e.g. the API and SQL).
  466. A list is returned, starting with the most preferred service first and
  467. ending with the least preferred one. Currently, there are only two
  468. services. SERVICE_API will always be included since the API is expected
  469. to be always usable. In normal circumstances, self.SERVICE_SQL will be
  470. first (with the API second), since using SQL directly is easier on the
  471. servers than making web queries with the API. self.SERVICE_SQL will be
  472. second if replag is greater than three minutes (a cached value updated
  473. every two minutes at most), *unless* API lag is also very high.
  474. self.SERVICE_SQL will not be included in the list if we cannot form a
  475. proper SQL connection.
  476. """
  477. now = time()
  478. if now - self._sql_info_cache["lastcheck"] > 120:
  479. self._sql_info_cache["lastcheck"] = now
  480. try:
  481. self._sql_info_cache["replag"] = sqllag = self.get_replag()
  482. except (exceptions.SQLError, oursql.Error):
  483. self._sql_info_cache["usable"] = False
  484. return [self.SERVICE_API]
  485. self._sql_info_cache["usable"] = True
  486. else:
  487. if not self._sql_info_cache["usable"]:
  488. return [self.SERVICE_API]
  489. sqllag = self._sql_info_cache["replag"]
  490. if sqllag > 300:
  491. if not self._maxlag:
  492. return [self.SERVICE_API, self.SERVICE_SQL]
  493. if now - self._api_info_cache["lastcheck"] > 300:
  494. self._api_info_cache["lastcheck"] = now
  495. try:
  496. self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
  497. except exceptions.APIError:
  498. self._api_info_cache["maxlag"] = apilag = 0
  499. else:
  500. apilag = self._api_info_cache["maxlag"]
  501. if apilag > self._maxlag:
  502. return [self.SERVICE_SQL, self.SERVICE_API]
  503. return [self.SERVICE_API, self.SERVICE_SQL]
  504. return [self.SERVICE_SQL, self.SERVICE_API]
  505. @property
  506. def name(self):
  507. """The Site's name (or "wikiid" in the API), like ``"enwiki"``."""
  508. return self._name
  509. @property
  510. def project(self):
  511. """The Site's project name in lowercase, like ``"wikipedia"``."""
  512. return self._project
  513. @property
  514. def lang(self):
  515. """The Site's language code, like ``"en"`` or ``"es"``."""
  516. return self._lang
  517. @property
  518. def domain(self):
  519. """The Site's web domain, like ``"en.wikipedia.org"``."""
  520. return urlparse(self._base_url).netloc
  521. @property
  522. def url(self):
  523. """The Site's full base URL, like ``"https://en.wikipedia.org"``."""
  524. url = self._base_url
  525. if url.startswith("//"): # Protocol-relative URLs from 1.18
  526. if self._use_https:
  527. url = "https:" + url
  528. else:
  529. url = "http:" + url
  530. return url
  531. def api_query(self, **kwargs):
  532. """Do an API query with `kwargs` as the parameters.
  533. This will first attempt to construct an API url from
  534. :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need
  535. both of these, or else we'll raise
  536. :py:exc:`~earwigbot.exceptions.APIError`. If
  537. :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki
  538. 1.18), we'll choose HTTPS only if :py:attr:`self._user_https` is
  539. ``True``, otherwise HTTP.
  540. We'll encode the given params, adding ``format=json`` along the way, as
  541. well as ``&assert=`` and ``&maxlag=`` based on
  542. :py:attr:`self._assert_edit` and :py:attr:`_maxlag` respectively.
  543. Additionally, we'll sleep a bit if the last query was made fewer than
  544. :py:attr:`self._wait_between_queries` seconds ago. The request is made
  545. through :py:attr:`self._opener`, which has cookie support
  546. (:py:attr:`self._cookiejar`), a ``User-Agent``
  547. (:py:const:`earwigbot.wiki.constants.USER_AGENT`), and
  548. ``Accept-Encoding`` set to ``"gzip"``.
  549. Assuming everything went well, we'll gunzip the data (if compressed),
  550. load it as a JSON object, and return it.
  551. If our request failed for some reason, we'll raise
  552. :py:exc:`~earwigbot.exceptions.APIError` with details. If that
  553. reason was due to maxlag, we'll sleep for a bit and then repeat the
  554. query until we exceed :py:attr:`self._max_retries`.
  555. There is helpful MediaWiki API documentation at `MediaWiki.org
  556. <http://www.mediawiki.org/wiki/API>`_.
  557. """
  558. with self._api_lock:
  559. return self._api_query(kwargs)
  560. def sql_query(self, query, params=(), plain_query=False, dict_cursor=False,
  561. cursor_class=None, show_table=False):
  562. """Do an SQL query and yield its results.
  563. If *plain_query* is ``True``, we will force an unparameterized query.
  564. Specifying both *params* and *plain_query* will cause an error. If
  565. *dict_cursor* is ``True``, we will use :py:class:`oursql.DictCursor` as
  566. our cursor, otherwise the default :py:class:`oursql.Cursor`. If
  567. *cursor_class* is given, it will override this option. If *show_table*
  568. is True, the name of the table will be prepended to the name of the
  569. column. This will mainly affect an :py:class:`~oursql.DictCursor`.
  570. Example usage::
  571. >>> query = "SELECT user_id, user_registration FROM user WHERE user_name = ?"
  572. >>> params = ("The Earwig",)
  573. >>> result1 = site.sql_query(query, params)
  574. >>> result2 = site.sql_query(query, params, dict_cursor=True)
  575. >>> for row in result1: print row
  576. (7418060L, '20080703215134')
  577. >>> for row in result2: print row
  578. {'user_id': 7418060L, 'user_registration': '20080703215134'}
  579. This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
  580. oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
  581. :py:exc:`oursql.InterfaceError`, ...) if there were problems with the
  582. query.
  583. See :py:meth:`_sql_connect` for information on how a connection is
  584. acquired. Also relevant is `oursql's documentation
  585. <http://packages.python.org/oursql>`_ for details on that package.
  586. """
  587. if not cursor_class:
  588. if dict_cursor:
  589. cursor_class = oursql.DictCursor
  590. else:
  591. cursor_class = oursql.Cursor
  592. klass = cursor_class
  593. with self._sql_lock:
  594. if not self._sql_conn:
  595. self._sql_connect()
  596. with self._sql_conn.cursor(klass, show_table=show_table) as cur:
  597. cur.execute(query, params, plain_query)
  598. for result in cur:
  599. yield result
  600. def get_maxlag(self, showall=False):
  601. """Return the internal database replication lag in seconds.
  602. In a typical setup, this function returns the replication lag *within*
  603. the WMF's cluster, *not* external replication lag affecting the
  604. Toolserver (see :py:meth:`get_replag` for that). This is useful when
  605. combined with the ``maxlag`` API query param (added by config), in
  606. which queries will be halted and retried if the lag is too high,
  607. usually above five seconds.
  608. With *showall*, will return a list of the lag for all servers in the
  609. cluster, not just the one with the highest lag.
  610. """
  611. params = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"}
  612. if showall:
  613. params["sishowalldb"] = 1
  614. with self._api_lock:
  615. result = self._api_query(params, ignore_maxlag=True)
  616. if showall:
  617. return [server["lag"] for server in result["query"]["dbrepllag"]]
  618. return result["query"]["dbrepllag"][0]["lag"]
  619. def get_replag(self):
  620. """Return the estimated external database replication lag in seconds.
  621. Requires SQL access. This function only makes sense on a replicated
  622. database (e.g. the Wikimedia Toolserver) and on a wiki that receives a
  623. large number of edits (ideally, at least one per second), or the result
  624. may be larger than expected, since it works by subtracting the current
  625. time from the timestamp of the latest recent changes event.
  626. This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
  627. oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
  628. :py:exc:`oursql.InterfaceError`, ...) if there were problems.
  629. """
  630. query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM
  631. recentchanges ORDER BY rc_timestamp DESC LIMIT 1"""
  632. result = list(self.sql_query(query))
  633. return result[0][0]
  634. def namespace_id_to_name(self, ns_id, all=False):
  635. """Given a namespace ID, returns associated namespace names.
  636. If *all* is ``False`` (default), we'll return the first name in the
  637. list, which is usually the localized version. Otherwise, we'll return
  638. the entire list, which includes the canonical name. For example, this
  639. returns ``u"Wikipedia"`` if *ns_id* = ``4`` and *all* is ``False`` on
  640. ``enwiki``; returns ``[u"Wikipedia", u"Project", u"WP"]`` if *ns_id* =
  641. ``4`` and *all* is ``True``.
  642. Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the ID
  643. is not found.
  644. """
  645. try:
  646. if all:
  647. return self._namespaces[ns_id]
  648. else:
  649. return self._namespaces[ns_id][0]
  650. except KeyError:
  651. e = "There is no namespace with id {0}.".format(ns_id)
  652. raise exceptions.NamespaceNotFoundError(e)
  653. def namespace_name_to_id(self, name):
  654. """Given a namespace name, returns the associated ID.
  655. Like :py:meth:`namespace_id_to_name`, but reversed. Case is ignored,
  656. because namespaces are assumed to be case-insensitive.
  657. Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the
  658. name is not found.
  659. """
  660. lname = name.lower()
  661. for ns_id, names in self._namespaces.items():
  662. lnames = [n.lower() for n in names] # Be case-insensitive
  663. if lname in lnames:
  664. return ns_id
  665. e = "There is no namespace with name '{0}'.".format(name)
  666. raise exceptions.NamespaceNotFoundError(e)
  667. def get_page(self, title, follow_redirects=False, pageid=None):
  668. """Return a :py:class:`Page` object for the given title.
  669. *follow_redirects* is passed directly to
  670. :py:class:`~earwigbot.wiki.page.Page`'s constructor. Also, this will
  671. return a :py:class:`~earwigbot.wiki.category.Category` object instead
  672. if the given title is in the category namespace. As
  673. :py:class:`~earwigbot.wiki.category.Category` is a subclass of
  674. :py:class:`~earwigbot.wiki.page.Page`, this should not cause problems.
  675. Note that this doesn't do any direct checks for existence or
  676. redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
  677. provide that.
  678. """
  679. title = self._unicodeify(title)
  680. prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
  681. prefix = title.split(":", 1)[0]
  682. if prefix != title: # Avoid a page that is simply "Category"
  683. if prefix in prefixes:
  684. return Category(self, title, follow_redirects, pageid,
  685. self._logger)
  686. return Page(self, title, follow_redirects, pageid, self._logger)
  687. def get_category(self, catname, follow_redirects=False, pageid=None):
  688. """Return a :py:class:`Category` object for the given category name.
  689. *catname* should be given *without* a namespace prefix. This method is
  690. really just shorthand for :py:meth:`get_page("Category:" + catname)
  691. <get_page>`.
  692. """
  693. catname = self._unicodeify(catname)
  694. prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
  695. pagename = u':'.join((prefix, catname))
  696. return Category(self, pagename, follow_redirects, pageid, self._logger)
  697. def get_user(self, username=None):
  698. """Return a :py:class:`User` object for the given username.
  699. If *username* is left as ``None``, then a
  700. :py:class:`~earwigbot.wiki.user.User` object representing the currently
  701. logged-in (or anonymous!) user is returned.
  702. """
  703. if username:
  704. username = self._unicodeify(username)
  705. else:
  706. username = self._get_username()
  707. return User(self, username, self._logger)
  708. def delegate(self, services, args=None, kwargs=None):
  709. """Delegate a task to either the API or SQL depending on conditions.
  710. *services* should be a dictionary in which the key is the service name
  711. (:py:attr:`self.SERVICE_API <SERVICE_API>` or
  712. :py:attr:`self.SERVICE_SQL <SERVICE_SQL>`), and the value is the
  713. function to call for this service. All functions will be passed the
  714. same arguments the tuple *args* and the dict **kwargs**, which are both
  715. empty by default. The service order is determined by
  716. :py:meth:`_get_service_order`.
  717. Not every service needs an entry in the dictionary. Will raise
  718. :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate
  719. service cannot be found.
  720. """
  721. if not args:
  722. args = ()
  723. if not kwargs:
  724. kwargs = {}
  725. order = self._get_service_order()
  726. for srv in order:
  727. if srv in services:
  728. try:
  729. return services[srv](*args, **kwargs)
  730. except exceptions.ServiceError:
  731. continue
  732. raise exceptions.NoServiceError(services)