A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

351 lines
12 KiB

  1. # -*- coding: utf-8 -*-
  2. from cookielib import CookieJar
  3. from json import loads
  4. from re import escape as re_escape, match as re_match
  5. from urllib import unquote_plus, urlencode
  6. from urllib2 import build_opener, HTTPCookieProcessor, URLError
  7. from urlparse import urlparse
  8. from wiki.tools.category import Category
  9. from wiki.tools.constants import *
  10. from wiki.tools.exceptions import *
  11. from wiki.tools.page import Page
  12. from wiki.tools.user import User
  13. class Site(object):
  14. """
  15. EarwigBot's Wiki Toolset: Site Class
  16. """
  17. def __init__(self, name=None, project=None, lang=None, base_url=None,
  18. article_path=None, script_path=None, sql=(None, None),
  19. namespaces=None, login=(None, None), cookiejar=None):
  20. """
  21. Docstring needed
  22. """
  23. # attributes referring to site information, filled in by an API query
  24. # if they are missing (and an API url can be determined)
  25. self._name = name
  26. self._project = project
  27. self._lang = lang
  28. self._base_url = base_url
  29. self._article_path = article_path
  30. self._script_path = script_path
  31. self._sql = sql
  32. self._namespaces = namespaces
  33. # set up cookiejar and URL opener for making API queries
  34. if cookiejar is not None:
  35. self._cookiejar = cookiejar
  36. else:
  37. self._cookiejar = CookieJar()
  38. self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
  39. self._opener.addheaders = [('User-agent', USER_AGENT)]
  40. # get all of the above attributes that were not specified as arguments
  41. self._load_attributes()
  42. # if we have a name/pass and the API says we're not logged in, log in
  43. self._login_info = name, password = login
  44. if name is not None and password is not None:
  45. logged_in_as = self._get_username_from_cookies()
  46. if logged_in_as is None or name != logged_in_as:
  47. self._login(login)
  48. def _load_attributes(self, force=False):
  49. """
  50. Docstring needed
  51. """
  52. # all attributes to be loaded, except _namespaces, which is a special
  53. # case because it requires additional params in the API query
  54. attrs = [self._name, self._project, self._lang, self._base_url,
  55. self._article_path, self._script_path]
  56. params = {"action": "query", "meta": "siteinfo"}
  57. if self._namespaces is None or force:
  58. params["siprop"] = "general|namespaces|namespacealiases"
  59. result = self.api_query(params)
  60. self._load_namespaces(result)
  61. elif all(attrs): # everything is already specified and we're not told
  62. return # to force a reload, so do nothing
  63. else: # we're only loading attributes other than _namespaces
  64. params["siprop"] = "general"
  65. result = self.api_query(params)
  66. res = result["query"]["general"]
  67. self._name = res["wikiid"]
  68. self._project = res["sitename"].lower()
  69. self._lang = res["lang"]
  70. self._base_url = res["server"]
  71. self._article_path = res["articlepath"]
  72. self._script_path = res["scriptpath"]
  73. def _load_namespaces(self, result):
  74. """
  75. Docstring needed
  76. """
  77. self._namespaces = {}
  78. for namespace in result["query"]["namespaces"].values():
  79. ns_id = namespace["id"]
  80. name = namespace["*"]
  81. try:
  82. canonical = namespace["canonical"]
  83. except KeyError:
  84. self._namespaces[ns_id] = [name]
  85. else:
  86. if name != canonical:
  87. self._namespaces[ns_id] = [name, canonical]
  88. else:
  89. self._namespaces[ns_id] = [name]
  90. for namespace in result["query"]["namespacealiases"]:
  91. ns_id = namespace["id"]
  92. alias = namespace["*"]
  93. self._namespaces[ns_id].append(alias)
  94. def _get_cookie(self, name, domain):
  95. """Return the cookie `name` in `domain`, unless it is expired. Return
  96. None if no cookie was found.
  97. """
  98. for cookie in self._cookiejar:
  99. if cookie.name == name and cookie.domain == domain:
  100. if cookie.is_expired():
  101. break
  102. return cookie
  103. return None
  104. def _get_username_from_cookies(self):
  105. """Try to return our username based solely on cookies.
  106. First, we'll look for a cookie named self._name + "Token", like
  107. "enwikiToken". If it exists and isn't expired, we'll assume it's valid
  108. and try to return the value of the cookie self._name + "UserName" (like
  109. "enwikiUserName"). This should work fine on wikis without single-user
  110. login.
  111. If `enwikiToken` doesn't exist, we'll try to find a cookie named
  112. `centralauth_Token`. If this exists and is not expired, we'll try to
  113. return the value of `centralauth_User`.
  114. If we didn't get any matches, we'll return None. Our goal here isn't to
  115. return the most likely username, or what we *want* our username to be
  116. (for that, we'd do self._login_info[0]), but rather to get our current
  117. username without an unnecessary ?action=query&meta=userinfo API query.
  118. """
  119. domain = self.domain()
  120. name = ''.join((self._name, "Token"))
  121. cookie = self._get_cookie(name, domain)
  122. if cookie is not None:
  123. name = ''.join((self._name, "UserName"))
  124. user_name = self._get_cookie(name, domain)
  125. if user_name is not None:
  126. return user_name.value
  127. name = "centralauth_Token"
  128. for cookie in self._cookiejar:
  129. if cookie.domain_initial_dot is False or cookie.is_expired():
  130. continue
  131. if cookie.name != name:
  132. continue
  133. # build a regex that will match domains this cookie affects
  134. search = ''.join(("(.*?)", re_escape(cookie.domain)))
  135. if re_match(search, domain): # test it against our site
  136. user_name = self._get_cookie("centralauth_User", cookie.domain)
  137. if user_name is not None:
  138. return user_name.value
  139. return None
  140. def _get_username_from_api(self):
  141. """Do a simple API query to get our username and return it.
  142. This is a reliable way to make sure we are actually logged in, because
  143. it doesn't deal with annoying cookie logic, but it results in an API
  144. query that is unnecessary in many cases.
  145. Called by _get_username() (in turn called by get_user() with no
  146. username argument) when cookie lookup fails, probably indicating that
  147. we are logged out.
  148. """
  149. params = {"action": "query", "meta": "userinfo"}
  150. result = self.api_query(params)
  151. return result["query"]["userinfo"]["name"]
  152. def _get_username(self):
  153. """Return the name of the current user, whether logged in or not.
  154. First, we'll try to deduce it solely from cookies, to avoid an
  155. unnecessary API query. For the cookie-detection method, see
  156. _get_username_from_cookies()'s docs.
  157. If our username isn't in cookies, then we're probably not logged in, or
  158. something fishy is going on (like forced logout). In this case, do a
  159. single API query for our username (or IP address) and return that.
  160. """
  161. name = self._get_username_from_cookies()
  162. if name is not None:
  163. return name
  164. return self._get_username_from_api()
  165. def _save_cookiejar(self):
  166. """Try to save our cookiejar after doing a (normal) login or logout.
  167. Calls the standard .save() method with no filename. Don't fret if our
  168. cookiejar doesn't support saving (CookieJar raises AttributeError,
  169. FileCookieJar raises NotImplementedError) or no default filename was
  170. given (LWPCookieJar and MozillaCookieJar raise ValueError).
  171. """
  172. try:
  173. self._cookiejar.save()
  174. except (AttributeError, NotImplementedError, ValueError):
  175. pass
  176. def _login(self, login, token=None, attempt=0):
  177. """
  178. Docstring needed
  179. """
  180. name, password = login
  181. params = {"action": "login", "lgname": name, "lgpassword": password}
  182. if token is not None:
  183. params["lgtoken"] = token
  184. result = self.api_query(params)
  185. res = result["login"]["result"]
  186. if res == "Success":
  187. self._save_cookiejar()
  188. elif res == "NeedToken" and attempt == 0:
  189. token = result["login"]["token"]
  190. return self._login(login, token, attempt=1)
  191. else:
  192. if res == "Illegal":
  193. e = "The provided username is illegal."
  194. elif res == "NotExists":
  195. e = "The provided username does not exist."
  196. elif res == "EmptyPass":
  197. e = "No password was given."
  198. elif res == "WrongPass" or res == "WrongPluginPass":
  199. e = "The given password is incorrect."
  200. else:
  201. e = "Couldn't login; server says '{0}'.".format(res)
  202. raise LoginError(e)
  203. def _logout(self):
  204. """
  205. Docstring needed
  206. """
  207. params = {"action": "logout"}
  208. self.api_query(params)
  209. self._cookiejar.clear()
  210. self._save_cookiejar()
  211. def api_query(self, params):
  212. """
  213. Docstring needed
  214. """
  215. if self._base_url is None or self._script_path is None:
  216. e = "Tried to do an API query, but no API URL is known."
  217. raise SiteAPIError(e)
  218. url = ''.join((self._base_url, self._script_path, "/api.php"))
  219. params["format"] = "json" # this is the only format we understand
  220. data = urlencode(params)
  221. print url, data # debug code
  222. try:
  223. response = self._opener.open(url, data)
  224. except URLError as error:
  225. if hasattr(error, "reason"):
  226. e = "API query at {0} failed because {1}."
  227. e = e.format(error.geturl, error.reason)
  228. elif hasattr(error, "code"):
  229. e = "API query at {0} failed; got an error code of {1}."
  230. e = e.format(error.geturl, error.code)
  231. else:
  232. e = "API query failed."
  233. raise SiteAPIError(e)
  234. else:
  235. result = response.read()
  236. return loads(result) # parse as a JSON object
  237. def name(self):
  238. """
  239. Docstring needed
  240. """
  241. return self._name
  242. def project(self):
  243. """
  244. Docstring needed
  245. """
  246. return self._project
  247. def lang(self):
  248. """
  249. Docstring needed
  250. """
  251. return self._lang
  252. def domain(self):
  253. """
  254. Docstring needed
  255. """
  256. return urlparse(self._base_url).netloc
  257. def namespace_id_to_name(self, ns_id, all=False):
  258. """
  259. Docstring needed
  260. """
  261. try:
  262. if all:
  263. return self._namespaces[ns_id]
  264. else:
  265. return self._namespaces[ns_id][0]
  266. except KeyError:
  267. e = "There is no namespace with id {0}.".format(ns_id)
  268. raise NamespaceNotFoundError(e)
  269. def namespace_name_to_id(self, name):
  270. """
  271. Docstring needed
  272. """
  273. lname = name.lower()
  274. for ns_id, names in self._namespaces.items():
  275. lnames = [n.lower() for n in names] # be case-insensitive
  276. if lname in lnames:
  277. return ns_id
  278. e = "There is no namespace with name '{0}'.".format(name)
  279. raise NamespaceNotFoundError(e)
  280. def get_page(self, pagename):
  281. """
  282. Docstring needed
  283. """
  284. prefixes = self.namespace_id_to_name(NS_CATEGORY, all=True)
  285. prefix = pagename.split(":", 1)[0]
  286. if prefix != pagename: # avoid a page that is simply "Category"
  287. if prefix in prefixes:
  288. return Category(self, pagename)
  289. return Page(self, pagename)
  290. def get_category(self, catname):
  291. """
  292. Docstring needed
  293. """
  294. prefix = self.namespace_id_to_name(NS_CATEGORY)
  295. pagename = "{0}:{1}".format(prefix, catname)
  296. return Category(self, pagename)
  297. def get_user(self, username=None):
  298. """
  299. Docstring needed
  300. """
  301. if username is None:
  302. username = self._get_username()
  303. return User(self, username)