A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

358 lines
13 KiB

  1. # -*- coding: utf-8 -*-
  2. from cookielib import CookieJar
  3. from gzip import GzipFile
  4. from json import loads
  5. from re import escape as re_escape, match as re_match
  6. from StringIO import StringIO
  7. from urllib import unquote_plus, urlencode
  8. from urllib2 import build_opener, HTTPCookieProcessor, URLError
  9. from urlparse import urlparse
  10. from wiki.tools.category import Category
  11. from wiki.tools.constants import *
  12. from wiki.tools.exceptions import *
  13. from wiki.tools.page import Page
  14. from wiki.tools.user import User
  15. class Site(object):
  16. """
  17. EarwigBot's Wiki Toolset: Site Class
  18. """
  19. def __init__(self, name=None, project=None, lang=None, base_url=None,
  20. article_path=None, script_path=None, sql=(None, None),
  21. namespaces=None, login=(None, None), cookiejar=None):
  22. """
  23. Docstring needed
  24. """
  25. # attributes referring to site information, filled in by an API query
  26. # if they are missing (and an API url can be determined)
  27. self._name = name
  28. self._project = project
  29. self._lang = lang
  30. self._base_url = base_url
  31. self._article_path = article_path
  32. self._script_path = script_path
  33. self._sql = sql
  34. self._namespaces = namespaces
  35. # set up cookiejar and URL opener for making API queries
  36. if cookiejar is not None:
  37. self._cookiejar = cookiejar
  38. else:
  39. self._cookiejar = CookieJar()
  40. self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
  41. self._opener.addheaders = [("User-Agent", USER_AGENT),
  42. ("Accept-Encoding", "gzip")]
  43. # get all of the above attributes that were not specified as arguments
  44. self._load_attributes()
  45. # if we have a name/pass and the API says we're not logged in, log in
  46. self._login_info = name, password = login
  47. if name is not None and password is not None:
  48. logged_in_as = self._get_username_from_cookies()
  49. if logged_in_as is None or name != logged_in_as:
  50. self._login(login)
  51. def _load_attributes(self, force=False):
  52. """
  53. Docstring needed
  54. """
  55. # all attributes to be loaded, except _namespaces, which is a special
  56. # case because it requires additional params in the API query
  57. attrs = [self._name, self._project, self._lang, self._base_url,
  58. self._article_path, self._script_path]
  59. params = {"action": "query", "meta": "siteinfo"}
  60. if self._namespaces is None or force:
  61. params["siprop"] = "general|namespaces|namespacealiases"
  62. result = self.api_query(params)
  63. self._load_namespaces(result)
  64. elif all(attrs): # everything is already specified and we're not told
  65. return # to force a reload, so do nothing
  66. else: # we're only loading attributes other than _namespaces
  67. params["siprop"] = "general"
  68. result = self.api_query(params)
  69. res = result["query"]["general"]
  70. self._name = res["wikiid"]
  71. self._project = res["sitename"].lower()
  72. self._lang = res["lang"]
  73. self._base_url = res["server"]
  74. self._article_path = res["articlepath"]
  75. self._script_path = res["scriptpath"]
  76. def _load_namespaces(self, result):
  77. """
  78. Docstring needed
  79. """
  80. self._namespaces = {}
  81. for namespace in result["query"]["namespaces"].values():
  82. ns_id = namespace["id"]
  83. name = namespace["*"]
  84. try:
  85. canonical = namespace["canonical"]
  86. except KeyError:
  87. self._namespaces[ns_id] = [name]
  88. else:
  89. if name != canonical:
  90. self._namespaces[ns_id] = [name, canonical]
  91. else:
  92. self._namespaces[ns_id] = [name]
  93. for namespace in result["query"]["namespacealiases"]:
  94. ns_id = namespace["id"]
  95. alias = namespace["*"]
  96. self._namespaces[ns_id].append(alias)
  97. def _get_cookie(self, name, domain):
  98. """Return the cookie `name` in `domain`, unless it is expired. Return
  99. None if no cookie was found.
  100. """
  101. for cookie in self._cookiejar:
  102. if cookie.name == name and cookie.domain == domain:
  103. if cookie.is_expired():
  104. break
  105. return cookie
  106. return None
  107. def _get_username_from_cookies(self):
  108. """Try to return our username based solely on cookies.
  109. First, we'll look for a cookie named self._name + "Token", like
  110. "enwikiToken". If it exists and isn't expired, we'll assume it's valid
  111. and try to return the value of the cookie self._name + "UserName" (like
  112. "enwikiUserName"). This should work fine on wikis without single-user
  113. login.
  114. If `enwikiToken` doesn't exist, we'll try to find a cookie named
  115. `centralauth_Token`. If this exists and is not expired, we'll try to
  116. return the value of `centralauth_User`.
  117. If we didn't get any matches, we'll return None. Our goal here isn't to
  118. return the most likely username, or what we *want* our username to be
  119. (for that, we'd do self._login_info[0]), but rather to get our current
  120. username without an unnecessary ?action=query&meta=userinfo API query.
  121. """
  122. domain = self.domain()
  123. name = ''.join((self._name, "Token"))
  124. cookie = self._get_cookie(name, domain)
  125. if cookie is not None:
  126. name = ''.join((self._name, "UserName"))
  127. user_name = self._get_cookie(name, domain)
  128. if user_name is not None:
  129. return user_name.value
  130. name = "centralauth_Token"
  131. for cookie in self._cookiejar:
  132. if cookie.domain_initial_dot is False or cookie.is_expired():
  133. continue
  134. if cookie.name != name:
  135. continue
  136. # build a regex that will match domains this cookie affects
  137. search = ''.join(("(.*?)", re_escape(cookie.domain)))
  138. if re_match(search, domain): # test it against our site
  139. user_name = self._get_cookie("centralauth_User", cookie.domain)
  140. if user_name is not None:
  141. return user_name.value
  142. return None
  143. def _get_username_from_api(self):
  144. """Do a simple API query to get our username and return it.
  145. This is a reliable way to make sure we are actually logged in, because
  146. it doesn't deal with annoying cookie logic, but it results in an API
  147. query that is unnecessary in many cases.
  148. Called by _get_username() (in turn called by get_user() with no
  149. username argument) when cookie lookup fails, probably indicating that
  150. we are logged out.
  151. """
  152. params = {"action": "query", "meta": "userinfo"}
  153. result = self.api_query(params)
  154. return result["query"]["userinfo"]["name"]
  155. def _get_username(self):
  156. """Return the name of the current user, whether logged in or not.
  157. First, we'll try to deduce it solely from cookies, to avoid an
  158. unnecessary API query. For the cookie-detection method, see
  159. _get_username_from_cookies()'s docs.
  160. If our username isn't in cookies, then we're probably not logged in, or
  161. something fishy is going on (like forced logout). In this case, do a
  162. single API query for our username (or IP address) and return that.
  163. """
  164. name = self._get_username_from_cookies()
  165. if name is not None:
  166. return name
  167. return self._get_username_from_api()
  168. def _save_cookiejar(self):
  169. """Try to save our cookiejar after doing a (normal) login or logout.
  170. Calls the standard .save() method with no filename. Don't fret if our
  171. cookiejar doesn't support saving (CookieJar raises AttributeError,
  172. FileCookieJar raises NotImplementedError) or no default filename was
  173. given (LWPCookieJar and MozillaCookieJar raise ValueError).
  174. """
  175. try:
  176. self._cookiejar.save()
  177. except (AttributeError, NotImplementedError, ValueError):
  178. pass
  179. def _login(self, login, token=None, attempt=0):
  180. """
  181. Docstring needed
  182. """
  183. name, password = login
  184. params = {"action": "login", "lgname": name, "lgpassword": password}
  185. if token is not None:
  186. params["lgtoken"] = token
  187. result = self.api_query(params)
  188. res = result["login"]["result"]
  189. if res == "Success":
  190. self._save_cookiejar()
  191. elif res == "NeedToken" and attempt == 0:
  192. token = result["login"]["token"]
  193. return self._login(login, token, attempt=1)
  194. else:
  195. if res == "Illegal":
  196. e = "The provided username is illegal."
  197. elif res == "NotExists":
  198. e = "The provided username does not exist."
  199. elif res == "EmptyPass":
  200. e = "No password was given."
  201. elif res == "WrongPass" or res == "WrongPluginPass":
  202. e = "The given password is incorrect."
  203. else:
  204. e = "Couldn't login; server says '{0}'.".format(res)
  205. raise LoginError(e)
  206. def _logout(self):
  207. """
  208. Docstring needed
  209. """
  210. params = {"action": "logout"}
  211. self.api_query(params)
  212. self._cookiejar.clear()
  213. self._save_cookiejar()
  214. def api_query(self, params):
  215. """
  216. Docstring needed
  217. """
  218. if self._base_url is None or self._script_path is None:
  219. e = "Tried to do an API query, but no API URL is known."
  220. raise SiteAPIError(e)
  221. url = ''.join((self._base_url, self._script_path, "/api.php"))
  222. params["format"] = "json" # this is the only format we understand
  223. data = urlencode(params)
  224. print url, data # debug code
  225. try:
  226. response = self._opener.open(url, data)
  227. except URLError as error:
  228. if hasattr(error, "reason"):
  229. e = "API query at {0} failed because {1}."
  230. e = e.format(error.geturl, error.reason)
  231. elif hasattr(error, "code"):
  232. e = "API query at {0} failed; got an error code of {1}."
  233. e = e.format(error.geturl, error.code)
  234. else:
  235. e = "API query failed."
  236. raise SiteAPIError(e)
  237. else:
  238. result = response.read()
  239. if response.headers.get("Content-Encoding") == "gzip":
  240. stream = StringIO(result)
  241. gzipper = GzipFile(fileobj=stream)
  242. result = gzipper.read()
  243. return loads(result) # parse as a JSON object
  244. def name(self):
  245. """
  246. Docstring needed
  247. """
  248. return self._name
  249. def project(self):
  250. """
  251. Docstring needed
  252. """
  253. return self._project
  254. def lang(self):
  255. """
  256. Docstring needed
  257. """
  258. return self._lang
  259. def domain(self):
  260. """
  261. Docstring needed
  262. """
  263. return urlparse(self._base_url).netloc
  264. def namespace_id_to_name(self, ns_id, all=False):
  265. """
  266. Docstring needed
  267. """
  268. try:
  269. if all:
  270. return self._namespaces[ns_id]
  271. else:
  272. return self._namespaces[ns_id][0]
  273. except KeyError:
  274. e = "There is no namespace with id {0}.".format(ns_id)
  275. raise NamespaceNotFoundError(e)
  276. def namespace_name_to_id(self, name):
  277. """
  278. Docstring needed
  279. """
  280. lname = name.lower()
  281. for ns_id, names in self._namespaces.items():
  282. lnames = [n.lower() for n in names] # be case-insensitive
  283. if lname in lnames:
  284. return ns_id
  285. e = "There is no namespace with name '{0}'.".format(name)
  286. raise NamespaceNotFoundError(e)
  287. def get_page(self, pagename):
  288. """
  289. Docstring needed
  290. """
  291. prefixes = self.namespace_id_to_name(NS_CATEGORY, all=True)
  292. prefix = pagename.split(":", 1)[0]
  293. if prefix != pagename: # avoid a page that is simply "Category"
  294. if prefix in prefixes:
  295. return Category(self, pagename)
  296. return Page(self, pagename)
  297. def get_category(self, catname):
  298. """
  299. Docstring needed
  300. """
  301. prefix = self.namespace_id_to_name(NS_CATEGORY)
  302. pagename = "{0}:{1}".format(prefix, catname)
  303. return Category(self, pagename)
  304. def get_user(self, username=None):
  305. """
  306. Docstring needed
  307. """
  308. if username is None:
  309. username = self._get_username()
  310. return User(self, username)