A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. # -*- coding: utf-8 -*-
  2. from cookielib import CookieJar
  3. from gzip import GzipFile
  4. from json import loads
  5. from re import escape as re_escape, match as re_match
  6. from StringIO import StringIO
  7. from urllib import unquote_plus, urlencode
  8. from urllib2 import build_opener, HTTPCookieProcessor, URLError
  9. from urlparse import urlparse
  10. from wiki.category import Category
  11. from wiki.constants import *
  12. from wiki.exceptions import *
  13. from wiki.page import Page
  14. from wiki.user import User
  15. class Site(object):
  16. """
  17. EarwigBot's Wiki Toolset: Site Class
  18. Represents a Site, with support for API queries and returning Pages, Users,
  19. and Categories. The constructor takes a bunch of arguments and you probably
  20. won't need to call it directly, rather tools.get_site() for returning Site
  21. instances, tools.add_site() for adding new ones to config, and
  22. tools.del_site() for removing old ones from config, should suffice.
  23. Public methods:
  24. name -- returns our name (or "wikiid"), like "enwiki"
  25. project -- returns our project name, like "wikipedia"
  26. lang -- returns our language code, like "en"
  27. domain -- returns our web domain, like "en.wikipedia.org"
  28. api_query -- does an API query with the given kwargs as params
  29. namespace_id_to_name -- given a namespace ID, returns associated name(s)
  30. namespace_name_to_id -- given a namespace name, returns associated id
  31. get_page -- returns a Page object for the given title
  32. get_category -- returns a Category object for the given title
  33. get_user -- returns a User object for the given username
  34. """
  35. def __init__(self, name=None, project=None, lang=None, base_url=None,
  36. article_path=None, script_path=None, sql=(None, None),
  37. namespaces=None, login=(None, None), cookiejar=None,
  38. user_agent=None):
  39. """Constructor for new Site instances.
  40. This probably isn't necessary to call yourself unless you're building a
  41. Site that's not in your config and you don't want to add it - normally
  42. all you need is tools.get_site(name), which creates the Site for you
  43. based on your config file. We accept a bunch of kwargs, but the only
  44. ones you really "need" are `base_url` and `script_path` - this is
  45. enough to figure out an API url. `login`, a tuple of
  46. (username, password), is highly recommended. `cookiejar` will be used
  47. to store cookies, and we'll use a normal CookieJar if none is given.
  48. First, we'll store the given arguments as attributes, then set up our
  49. URL opener. We'll load any of the attributes that weren't given from
  50. the API, and then log in if a username/pass was given and we aren't
  51. already logged in.
  52. """
  53. # Attributes referring to site information, filled in by an API query
  54. # if they are missing (and an API url can be determined):
  55. self._name = name
  56. self._project = project
  57. self._lang = lang
  58. self._base_url = base_url
  59. self._article_path = article_path
  60. self._script_path = script_path
  61. self._sql = sql
  62. self._namespaces = namespaces
  63. # Set up cookiejar and URL opener for making API queries:
  64. if cookiejar is not None:
  65. self._cookiejar = cookiejar
  66. else:
  67. self._cookiejar = CookieJar()
  68. if user_agent is None:
  69. user_agent = USER_AGENT # Set default UA from wiki.constants
  70. self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
  71. self._opener.addheaders = [("User-Agent", user_agent),
  72. ("Accept-Encoding", "gzip")]
  73. # Get all of the above attributes that were not specified as arguments:
  74. self._load_attributes()
  75. # If we have a name/pass and the API says we're not logged in, log in:
  76. self._login_info = name, password = login
  77. if name is not None and password is not None:
  78. logged_in_as = self._get_username_from_cookies()
  79. if logged_in_as is None or name != logged_in_as:
  80. self._login(login)
  81. def _api_query(self, params):
  82. """Do an API query with `params` as a dict of parameters.
  83. This will first attempt to construct an API url from self._base_url and
  84. self._script_path. We need both of these, or else we'll raise
  85. SiteAPIError.
  86. We'll encode the given params, adding format=json along the way, and
  87. make the request through self._opener, which has built-in cookie
  88. support via self._cookiejar, a User-Agent (wiki.constants.USER_AGENT),
  89. and Accept-Encoding set to "gzip".
  90. Assuming everything went well, we'll gunzip the data (if compressed),
  91. load it as a JSON object, and return it.
  92. If our request failed, we'll raise SiteAPIError with details.
  93. There's helpful MediaWiki API documentation at
  94. <http://www.mediawiki.org/wiki/API>.
  95. """
  96. if self._base_url is None or self._script_path is None:
  97. e = "Tried to do an API query, but no API URL is known."
  98. raise SiteAPIError(e)
  99. url = ''.join((self._base_url, self._script_path, "/api.php"))
  100. params["format"] = "json" # This is the only format we understand
  101. data = urlencode(params)
  102. print url, data # debug code
  103. try:
  104. response = self._opener.open(url, data)
  105. except URLError as error:
  106. if hasattr(error, "reason"):
  107. e = "API query at {0} failed because {1}."
  108. e = e.format(error.geturl, error.reason)
  109. elif hasattr(error, "code"):
  110. e = "API query at {0} failed; got an error code of {1}."
  111. e = e.format(error.geturl, error.code)
  112. else:
  113. e = "API query failed."
  114. raise SiteAPIError(e)
  115. else:
  116. result = response.read()
  117. if response.headers.get("Content-Encoding") == "gzip":
  118. stream = StringIO(result)
  119. gzipper = GzipFile(fileobj=stream)
  120. result = gzipper.read()
  121. return loads(result) # Parse as a JSON object
  122. def _load_attributes(self, force=False):
  123. """Load data about our Site from the API.
  124. This function is called by __init__() when one of the site attributes
  125. was not given as a keyword argument. We'll do an API query to get the
  126. missing data, but only if there actually *is* missing data.
  127. Additionally, you can call this with `force=True` to forcibly reload
  128. all attributes.
  129. """
  130. # All attributes to be loaded, except _namespaces, which is a special
  131. # case because it requires additional params in the API query:
  132. attrs = [self._name, self._project, self._lang, self._base_url,
  133. self._article_path, self._script_path]
  134. params = {"action": "query", "meta": "siteinfo"}
  135. if not self._namespaces or force:
  136. params["siprop"] = "general|namespaces|namespacealiases"
  137. result = self._api_query(params)
  138. self._load_namespaces(result)
  139. elif all(attrs): # Everything is already specified and we're not told
  140. return # to force a reload, so do nothing
  141. else: # We're only loading attributes other than _namespaces
  142. params["siprop"] = "general"
  143. result = self._api_query(params)
  144. res = result["query"]["general"]
  145. self._name = res["wikiid"]
  146. self._project = res["sitename"].lower()
  147. self._lang = res["lang"]
  148. self._base_url = res["server"]
  149. self._article_path = res["articlepath"]
  150. self._script_path = res["scriptpath"]
  151. def _load_namespaces(self, result):
  152. """Fill self._namespaces with a dict of namespace IDs and names.
  153. Called by _load_attributes() with API data as `result` when
  154. self._namespaces was not given as an kwarg to __init__().
  155. """
  156. self._namespaces = {}
  157. for namespace in result["query"]["namespaces"].values():
  158. ns_id = namespace["id"]
  159. name = namespace["*"]
  160. try:
  161. canonical = namespace["canonical"]
  162. except KeyError:
  163. self._namespaces[ns_id] = [name]
  164. else:
  165. if name != canonical:
  166. self._namespaces[ns_id] = [name, canonical]
  167. else:
  168. self._namespaces[ns_id] = [name]
  169. for namespace in result["query"]["namespacealiases"]:
  170. ns_id = namespace["id"]
  171. alias = namespace["*"]
  172. self._namespaces[ns_id].append(alias)
  173. def _get_cookie(self, name, domain):
  174. """Return the named cookie unless it is expired or doesn't exist."""
  175. for cookie in self._cookiejar:
  176. if cookie.name == name and cookie.domain == domain:
  177. if cookie.is_expired():
  178. break
  179. return cookie
  180. def _get_username_from_cookies(self):
  181. """Try to return our username based solely on cookies.
  182. First, we'll look for a cookie named self._name + "Token", like
  183. "enwikiToken". If it exists and isn't expired, we'll assume it's valid
  184. and try to return the value of the cookie self._name + "UserName" (like
  185. "enwikiUserName"). This should work fine on wikis without single-user
  186. login.
  187. If `enwikiToken` doesn't exist, we'll try to find a cookie named
  188. `centralauth_Token`. If this exists and is not expired, we'll try to
  189. return the value of `centralauth_User`.
  190. If we didn't get any matches, we'll return None. Our goal here isn't to
  191. return the most likely username, or what we *want* our username to be
  192. (for that, we'd do self._login_info[0]), but rather to get our current
  193. username without an unnecessary ?action=query&meta=userinfo API query.
  194. """
  195. domain = self.domain()
  196. name = ''.join((self._name, "Token"))
  197. cookie = self._get_cookie(name, domain)
  198. if cookie is not None:
  199. name = ''.join((self._name, "UserName"))
  200. user_name = self._get_cookie(name, domain)
  201. if user_name is not None:
  202. return user_name.value
  203. name = "centralauth_Token"
  204. for cookie in self._cookiejar:
  205. if cookie.domain_initial_dot is False or cookie.is_expired():
  206. continue
  207. if cookie.name != name:
  208. continue
  209. # Build a regex that will match domains this cookie affects:
  210. search = ''.join(("(.*?)", re_escape(cookie.domain)))
  211. if re_match(search, domain): # Test it against our site
  212. user_name = self._get_cookie("centralauth_User", cookie.domain)
  213. if user_name is not None:
  214. return user_name.value
  215. def _get_username_from_api(self):
  216. """Do a simple API query to get our username and return it.
  217. This is a reliable way to make sure we are actually logged in, because
  218. it doesn't deal with annoying cookie logic, but it results in an API
  219. query that is unnecessary in some cases.
  220. Called by _get_username() (in turn called by get_user() with no
  221. username argument) when cookie lookup fails, probably indicating that
  222. we are logged out.
  223. """
  224. params = {"action": "query", "meta": "userinfo"}
  225. result = self._api_query(params)
  226. return result["query"]["userinfo"]["name"]
  227. def _get_username(self):
  228. """Return the name of the current user, whether logged in or not.
  229. First, we'll try to deduce it solely from cookies, to avoid an
  230. unnecessary API query. For the cookie-detection method, see
  231. _get_username_from_cookies()'s docs.
  232. If our username isn't in cookies, then we're probably not logged in, or
  233. something fishy is going on (like forced logout). In this case, do a
  234. single API query for our username (or IP address) and return that.
  235. """
  236. name = self._get_username_from_cookies()
  237. if name is not None:
  238. return name
  239. return self._get_username_from_api()
  240. def _save_cookiejar(self):
  241. """Try to save our cookiejar after doing a (normal) login or logout.
  242. Calls the standard .save() method with no filename. Don't fret if our
  243. cookiejar doesn't support saving (CookieJar raises AttributeError,
  244. FileCookieJar raises NotImplementedError) or no default filename was
  245. given (LWPCookieJar and MozillaCookieJar raise ValueError).
  246. """
  247. try:
  248. self._cookiejar.save()
  249. except (AttributeError, NotImplementedError, ValueError):
  250. pass
  251. def _login(self, login, token=None, attempt=0):
  252. """Safely login through the API.
  253. Normally, this is called by __init__() if a username and password have
  254. been provided and no valid login cookies were found. The only other
  255. time it needs to be called is when those cookies expire, which is done
  256. automatically by api_query() if a query fails.
  257. Recent versions of MediaWiki's API have fixed a CSRF vulnerability,
  258. requiring login to be done in two separate requests. If the response
  259. from from our initial request is "NeedToken", we'll do another one with
  260. the token. If login is successful, we'll try to save our cookiejar.
  261. Raises LoginError on login errors (duh), like bad passwords and
  262. nonexistent usernames.
  263. `login` is a (username, password) tuple. `token` is the token returned
  264. from our first request, and `attempt` is to prevent getting stuck in a
  265. loop if MediaWiki isn't acting right.
  266. """
  267. name, password = login
  268. params = {"action": "login", "lgname": name, "lgpassword": password}
  269. if token is not None:
  270. params["lgtoken"] = token
  271. result = self._api_query(params)
  272. res = result["login"]["result"]
  273. if res == "Success":
  274. self._save_cookiejar()
  275. elif res == "NeedToken" and attempt == 0:
  276. token = result["login"]["token"]
  277. return self._login(login, token, attempt=1)
  278. else:
  279. if res == "Illegal":
  280. e = "The provided username is illegal."
  281. elif res == "NotExists":
  282. e = "The provided username does not exist."
  283. elif res == "EmptyPass":
  284. e = "No password was given."
  285. elif res == "WrongPass" or res == "WrongPluginPass":
  286. e = "The given password is incorrect."
  287. else:
  288. e = "Couldn't login; server says '{0}'.".format(res)
  289. raise LoginError(e)
  290. def _logout(self):
  291. """Safely logout through the API.
  292. We'll do a simple API request (api.php?action=logout), clear our
  293. cookiejar (which probably contains now-invalidated cookies) and try to
  294. save it, if it supports that sort of thing.
  295. """
  296. params = {"action": "logout"}
  297. self._api_query(params)
  298. self._cookiejar.clear()
  299. self._save_cookiejar()
  300. def api_query(self, **kwargs):
  301. """Do an API query with `kwargs` as the parameters.
  302. See _api_query()'s documentation for details.
  303. """
  304. return self._api_query(kwargs)
  305. def name(self):
  306. """Returns the Site's name (or "wikiid" in the API), like "enwiki"."""
  307. return self._name
  308. def project(self):
  309. """Returns the Site's project name in lowercase, like "wikipedia"."""
  310. return self._project
  311. def lang(self):
  312. """Returns the Site's language code, like "en" or "es"."""
  313. return self._lang
  314. def domain(self):
  315. """Returns the Site's web domain, like "en.wikipedia.org"."""
  316. return urlparse(self._base_url).netloc
  317. def namespace_id_to_name(self, ns_id, all=False):
  318. """Given a namespace ID, returns associated namespace names.
  319. If all is False (default), we'll return the first name in the list,
  320. which is usually the localized version. Otherwise, we'll return the
  321. entire list, which includes the canonical name.
  322. For example, returns u"Wikipedia" if ns_id=4 and all=False on enwiki;
  323. returns [u"Wikipedia", u"Project"] if ns_id=4 and all=True.
  324. Raises NamespaceNotFoundError if the ID is not found.
  325. """
  326. try:
  327. if all:
  328. return self._namespaces[ns_id]
  329. else:
  330. return self._namespaces[ns_id][0]
  331. except KeyError:
  332. e = "There is no namespace with id {0}.".format(ns_id)
  333. raise NamespaceNotFoundError(e)
  334. def namespace_name_to_id(self, name):
  335. """Given a namespace name, returns the associated ID.
  336. Like namespace_id_to_name(), but reversed. Case is ignored, because
  337. namespaces are assumed to be case-insensitive.
  338. Raises NamespaceNotFoundError if the name is not found.
  339. """
  340. lname = name.lower()
  341. for ns_id, names in self._namespaces.items():
  342. lnames = [n.lower() for n in names] # Be case-insensitive
  343. if lname in lnames:
  344. return ns_id
  345. e = "There is no namespace with name '{0}'.".format(name)
  346. raise NamespaceNotFoundError(e)
  347. def get_page(self, title, follow_redirects=False):
  348. """Returns a Page object for the given title (pagename).
  349. Will return a Category object instead if the given title is in the
  350. category namespace. As Category is a subclass of Page, this should not
  351. cause problems.
  352. Note that this doesn't do any direct checks for existence or
  353. redirect-following - Page's methods provide that.
  354. """
  355. prefixes = self.namespace_id_to_name(NS_CATEGORY, all=True)
  356. prefix = title.split(":", 1)[0]
  357. if prefix != title: # Avoid a page that is simply "Category"
  358. if prefix in prefixes:
  359. return Category(self, title, follow_redirects)
  360. return Page(self, title, follow_redirects)
  361. def get_category(self, catname, follow_redirects=False):
  362. """Returns a Category object for the given category name.
  363. `catname` should be given *without* a namespace prefix. This method is
  364. really just shorthand for get_page("Category:" + catname).
  365. """
  366. prefix = self.namespace_id_to_name(NS_CATEGORY)
  367. pagename = ':'.join((prefix, catname))
  368. return Category(self, pagename, follow_redirects)
  369. def get_user(self, username=None):
  370. """Returns a User object for the given username.
  371. If `username` is left as None, then a User object representing the
  372. currently logged-in (or anonymous!) user is returned.
  373. """
  374. if username is None:
  375. username = self._get_username()
  376. return User(self, username)