A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

656 lines
26 KiB

  1. # -*- coding: utf-8 -*-
  2. from hashlib import md5
  3. import re
  4. from time import gmtime, strftime
  5. from urllib import quote
  6. from wiki.exceptions import *
  7. class Page(object):
  8. """
  9. EarwigBot's Wiki Toolset: Page Class
  10. Represents a Page on a given Site. Has methods for getting information
  11. about the page, getting page content, and so on. Category is a subclass of
  12. Page with additional methods.
  13. Public methods:
  14. title -- returns the page's title, or pagename
  15. exists -- returns whether the page exists
  16. pageid -- returns an integer ID representing the page
  17. url -- returns the page's URL
  18. namespace -- returns the page's namespace as an integer
  19. protection -- returns the page's current protection status
  20. creator -- returns the page's creator (first user to edit)
  21. is_talkpage -- returns True if the page is a talkpage, else False
  22. is_redirect -- returns True if the page is a redirect, else False
  23. toggle_talk -- returns a content page's talk page, or vice versa
  24. get -- returns page content
  25. get_redirect_target -- if the page is a redirect, returns its destination
  26. edit -- replaces the page's content or creates a new page
  27. add_section -- add a new section at the bottom of the page
  28. """
  29. def __init__(self, site, title, follow_redirects=False):
  30. """Constructor for new Page instances.
  31. Takes three arguments: a Site object, the Page's title (or pagename),
  32. and whether or not to follow redirects (optional, defaults to False).
  33. As with User, site.get_page() is preferred. Site's method has support
  34. for a default `follow_redirects` value in our config, while __init__
  35. always defaults to False.
  36. __init__ will not do any API queries, but it will use basic namespace
  37. logic to determine our namespace ID and if we are a talkpage.
  38. """
  39. self._site = site
  40. self._title = title.strip()
  41. self._follow_redirects = self._keep_following = follow_redirects
  42. self._exists = 0
  43. self._pageid = None
  44. self._is_redirect = None
  45. self._lastrevid = None
  46. self._protection = None
  47. self._fullurl = None
  48. self._content = None
  49. self._creator = None
  50. # Attributes used for editing/deleting/protecting/etc:
  51. self._token = None
  52. self._basetimestamp = None
  53. self._starttimestamp = None
  54. # Try to determine the page's namespace using our site's namespace
  55. # converter:
  56. prefix = self._title.split(":", 1)[0]
  57. if prefix != title: # ignore a page that's titled "Category" or "User"
  58. try:
  59. self._namespace = self._site.namespace_name_to_id(prefix)
  60. except NamespaceNotFoundError:
  61. self._namespace = 0
  62. else:
  63. self._namespace = 0
  64. # Is this a talkpage? Talkpages have odd IDs, while content pages have
  65. # even IDs, excluding the "special" namespaces:
  66. if self._namespace < 0:
  67. self._is_talkpage = False
  68. else:
  69. self._is_talkpage = self._namespace % 2 == 1
  70. def __repr__(self):
  71. """Returns the canonical string representation of the Page."""
  72. res = ", ".join(("Page(title={0!r}", "follow_redirects={1!r}",
  73. "site={2!r})"))
  74. return res.format(self._title, self._follow_redirects, self._site)
  75. def __str__(self):
  76. """Returns a nice string representation of the Page."""
  77. return '<Page "{0}" of {1}>'.format(self.title(), str(self._site))
  78. def _force_validity(self):
  79. """Used to ensure that our page's title is valid.
  80. If this method is called when our page is not valid (and after
  81. _load_attributes() has been called), InvalidPageError will be raised.
  82. Note that validity != existence. If a page's title is invalid (e.g, it
  83. contains "[") it will always be invalid, and cannot be edited.
  84. """
  85. if self._exists == 1:
  86. e = "Page '{0}' is invalid.".format(self._title)
  87. raise InvalidPageError(e)
  88. def _force_existence(self):
  89. """Used to ensure that our page exists.
  90. If this method is called when our page doesn't exist (and after
  91. _load_attributes() has been called), PageNotFoundError will be raised.
  92. It will also call _force_validity() beforehand.
  93. """
  94. self._force_validity()
  95. if self._exists == 2:
  96. e = "Page '{0}' does not exist.".format(self._title)
  97. raise PageNotFoundError(e)
  98. def _load_wrapper(self):
  99. """Calls _load_attributes() and follows redirects if we're supposed to.
  100. This method will only follow redirects if follow_redirects=True was
  101. passed to __init__() (perhaps indirectly passed by site.get_page()).
  102. It avoids the API's &redirects param in favor of manual following,
  103. so we can act more realistically (we don't follow double redirects, and
  104. circular redirects don't break us).
  105. This will raise RedirectError if we have a problem following, but that
  106. is a bug and should NOT happen.
  107. If we're following a redirect, this will make a grand total of three
  108. API queries. It's a lot, but each one is quite small.
  109. """
  110. self._load_attributes()
  111. if self._keep_following and self._is_redirect:
  112. self._title = self.get_redirect_target()
  113. self._keep_following = False # don't follow double redirects
  114. self._content = None # reset the content we just loaded
  115. self._load_attributes()
  116. def _load_attributes(self, result=None):
  117. """Loads various data from the API in a single query.
  118. Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl,
  119. ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid,
  120. ._token, and ._starttimestamp using the API. It will do a query of
  121. its own unless `result` is provided, in which case we'll pretend
  122. `result` is what the query returned.
  123. Assuming the API is sound, this should not raise any exceptions.
  124. """
  125. if result is None:
  126. params = {"action": "query", "rvprop": "user", "intoken": "edit",
  127. "prop": "info|revisions", "rvlimit": 1, "rvdir": "newer",
  128. "titles": self._title, "inprop": "protection|url"}
  129. result = self._site._api_query(params)
  130. res = result["query"]["pages"].values()[0]
  131. # Normalize our pagename/title thing:
  132. self._title = res["title"]
  133. try:
  134. res["redirect"]
  135. except KeyError:
  136. self._is_redirect = False
  137. else:
  138. self._is_redirect = True
  139. self._pageid = result["query"]["pages"].keys()[0]
  140. if int(self._pageid) < 0:
  141. try:
  142. res["missing"]
  143. except KeyError:
  144. # If it has a negative ID and it's invalid, then break here,
  145. # because there's no other data for us to get:
  146. self._exists = 1
  147. return
  148. else:
  149. # If it has a negative ID and it's missing; we can still get
  150. # data like the namespace, protection, and URL:
  151. self._exists = 2
  152. else:
  153. self._exists = 3
  154. self._fullurl = res["fullurl"]
  155. self._protection = res["protection"]
  156. try:
  157. self._token = res["edittoken"]
  158. except KeyError:
  159. pass
  160. else:
  161. self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())
  162. # We've determined the namespace and talkpage status in __init__()
  163. # based on the title, but now we can be sure:
  164. self._namespace = res["ns"]
  165. self._is_talkpage = self._namespace % 2 == 1 # talkpages have odd IDs
  166. # These last two fields will only be specified if the page exists:
  167. self._lastrevid = res.get("lastrevid")
  168. try:
  169. self._creator = res['revisions'][0]['user']
  170. except KeyError:
  171. pass
  172. def _load_content(self, result=None):
  173. """Loads current page content from the API.
  174. If `result` is provided, we'll pretend that is the result of an API
  175. query and try to get content from that. Otherwise, we'll do an API
  176. query on our own.
  177. Don't call this directly, ever - use .get(force=True) if you want to
  178. force content reloading.
  179. """
  180. if result is None:
  181. params = {"action": "query", "prop": "revisions", "rvlimit": 1,
  182. "rvprop": "content|timestamp", "titles": self._title}
  183. result = self._site._api_query(params)
  184. res = result["query"]["pages"].values()[0]
  185. try:
  186. self._content = res["revisions"][0]["*"]
  187. self._basetimestamp = res["revisions"][0]["timestamp"]
  188. except KeyError:
  189. # This can only happen if the page was deleted since we last called
  190. # self._load_attributes(). In that case, some of our attributes are
  191. # outdated, so force another self._load_attributes():
  192. self._load_attributes()
  193. self._force_existence()
  194. def _edit(self, params=None, text=None, summary=None, minor=None, bot=None,
  195. force=None, section=None, captcha_id=None, captcha_word=None,
  196. tries=0):
  197. """Edit the page!
  198. If `params` is given, we'll use it as our API query parameters.
  199. Otherwise, we'll build params using the given kwargs via
  200. _build_edit_params().
  201. We'll then try to do the API query, and catch any errors the API raises
  202. in _handle_edit_errors(). We'll then throw these back as subclasses of
  203. EditError.
  204. """
  205. # Try to get our edit token, and die if we can't:
  206. if not self._token:
  207. self._load_attributes()
  208. if not self._token:
  209. e = "You don't have permission to edit this page."
  210. raise PermissionsError(e)
  211. # Weed out invalid pages before we get too far:
  212. self._force_validity()
  213. # Build our API query string:
  214. if not params:
  215. params = self._build_edit_params(text, summary, minor, bot, force,
  216. section, captcha_id, captcha_word)
  217. else: # Make sure we have the right token:
  218. params["token"] = self._token
  219. # Try the API query, catching most errors with our handler:
  220. try:
  221. result = self._site._api_query(params)
  222. except SiteAPIError as error:
  223. if not hasattr(error, "code"):
  224. raise # We can only handle errors with a code attribute
  225. result = self._handle_edit_errors(error, params, tries)
  226. # If everything was successful, reset invalidated attributes:
  227. if result["edit"]["result"] == "Success":
  228. self._content = None
  229. self._basetimestamp = None
  230. self._exists = 0
  231. return
  232. # If we're here, then the edit failed. If it's because of AssertEdit,
  233. # handle that. Otherwise, die - something odd is going on:
  234. try:
  235. assertion = result["edit"]["assert"]
  236. except KeyError:
  237. raise EditError(result["edit"])
  238. self._handle_assert_edit(assertion, params, tries)
  239. def _build_edit_params(self, text, summary, minor, bot, force, section,
  240. captcha_id, captcha_word):
  241. """Given some keyword arguments, build an API edit query string."""
  242. hashed = md5(text).hexdigest() # Checksum to ensure text is correct
  243. params = {"action": "edit", "title": self._title, "text": text,
  244. "token": self._token, "summary": summary, "md5": hashed}
  245. if section:
  246. params["section"] = section
  247. if captcha_id and captcha_word:
  248. params["captchaid"] = captcha_id
  249. params["captchaword"] = captcha_word
  250. if minor:
  251. params["minor"] = "true"
  252. else:
  253. params["notminor"] = "true"
  254. if bot:
  255. params["bot"] = "true"
  256. if not force:
  257. params["starttimestamp"] = self._starttimestamp
  258. if self._basetimestamp:
  259. params["basetimestamp"] = self._basetimestamp
  260. if self._exists == 2:
  261. # Page does not exist; don't edit if it already exists:
  262. params["createonly"] = "true"
  263. else:
  264. params["recreate"] = "true"
  265. return params
  266. def _handle_edit_errors(self, error, params, tries):
  267. """If our edit fails due to some error, try to handle it.
  268. We'll either raise an appropriate exception (for example, if the page
  269. is protected), or we'll try to fix it (for example, if we can't edit
  270. due to being logged out, we'll try to log in).
  271. """
  272. if error.code in ["noedit", "cantcreate", "protectedtitle",
  273. "noimageredirect"]:
  274. raise PermissionsError(error.info)
  275. elif error.code in ["noedit-anon", "cantcreate-anon",
  276. "noimageredirect-anon"]:
  277. if not all(self._site._login_info):
  278. # Insufficient login info:
  279. raise PermissionsError(error.info)
  280. if tries == 0:
  281. # We have login info; try to login:
  282. self._site._login(self._site._login_info)
  283. self._token = None # Need a new token; old one is invalid now
  284. return self._edit(params=params, tries=1)
  285. else:
  286. # We already tried to log in and failed!
  287. e = "Although we should be logged in, we are not. This may be a cookie problem or an odd bug."
  288. raise LoginError(e)
  289. elif error.code in ["editconflict", "pagedeleted", "articleexists"]:
  290. # These attributes are now invalidated:
  291. self._content = None
  292. self._basetimestamp = None
  293. self._exists = 0
  294. raise EditConflictError(error.info)
  295. elif error.code in ["emptypage", "emptynewsection"]:
  296. raise NoContentError(error.info)
  297. elif error.code == "contenttoobig":
  298. raise ContentTooBigError(error.info)
  299. elif error.code == "spamdetected":
  300. raise SpamDetectedError(error.info)
  301. elif error.code == "filtered":
  302. raise FilteredError(error.info)
  303. raise EditError(": ".join((error.code, error.info)))
  304. def _handle_assert_edit(self, assertion, params, tries):
  305. """If we can't edit due to a failed AssertEdit assertion, handle that.
  306. If the assertion was 'user' and we have valid login information, try to
  307. log in. Otherwise, raise PermissionsError with details.
  308. """
  309. if assertion == "user":
  310. if not all(self._site._login_info):
  311. # Insufficient login info:
  312. e = "AssertEdit: user assertion failed, and no login info was provided."
  313. raise PermissionsError(e)
  314. if tries == 0:
  315. # We have login info; try to login:
  316. self._site._login(self._site._login_info)
  317. self._token = None # Need a new token; old one is invalid now
  318. return self._edit(params=params, tries=1)
  319. else:
  320. # We already tried to log in and failed!
  321. e = "Although we should be logged in, we are not. This may be a cookie problem or an odd bug."
  322. raise LoginError(e)
  323. elif assertion == "bot":
  324. e = "AssertEdit: bot assertion failed; we don't have a bot flag!"
  325. raise PermissionsError(e)
  326. # Unknown assertion, maybe "true", "false", or "exists":
  327. e = "AssertEdit: assertion '{0}' failed.".format(assertion)
  328. raise PermissionsError(e)
  329. def title(self, force=False):
  330. """Returns the Page's title, or pagename.
  331. This won't do any API queries on its own unless force is True, in which
  332. case the title will be forcibly reloaded from the API (normalizing it,
  333. and following redirects if follow_redirects=True was passed to
  334. __init__()). Any other methods that do API queries will reload title on
  335. their own, however, like exists() and get().
  336. """
  337. if force:
  338. self._load_wrapper()
  339. return self._title
  340. def exists(self, force=False):
  341. """Returns information about whether the Page exists or not.
  342. The returned "information" is a tuple with two items. The first is a
  343. bool, either True if the page exists or False if it does not. The
  344. second is a string giving more information, either "invalid", (title
  345. is invalid, e.g. it contains "["), "missing", or "exists".
  346. Makes an API query if force is True or if we haven't already made one.
  347. """
  348. cases = {
  349. 0: (None, "unknown"),
  350. 1: (False, "invalid"),
  351. 2: (False, "missing"),
  352. 3: (True, "exists"),
  353. }
  354. if self._exists == 0 or force:
  355. self._load_wrapper()
  356. return cases[self._exists]
  357. def pageid(self, force=False):
  358. """Returns an integer ID representing the Page.
  359. Makes an API query if force is True or if we haven't already made one.
  360. Raises InvalidPageError or PageNotFoundError if the page name is
  361. invalid or the page does not exist, respectively.
  362. """
  363. if self._exists == 0 or force:
  364. self._load_wrapper()
  365. self._force_existence() # missing pages do not have IDs
  366. return self._pageid
  367. def url(self, force=False):
  368. """Returns the page's URL.
  369. Like title(), this won't do any API queries on its own unless force is
  370. True. If the API was never queried for this page, we will attempt to
  371. determine the URL ourselves based on the title.
  372. """
  373. if force:
  374. self._load_wrapper()
  375. if self._fullurl is not None:
  376. return self._fullurl
  377. else:
  378. slug = quote(self._title.replace(" ", "_"), safe="/:")
  379. path = self._site._article_path.replace("$1", slug)
  380. return ''.join((self._site._base_url, path))
  381. def namespace(self, force=False):
  382. """Returns the page's namespace ID (an integer).
  383. Like title(), this won't do any API queries on its own unless force is
  384. True. If the API was never queried for this page, we will attempt to
  385. determine the namespace ourselves based on the title.
  386. """
  387. if force:
  388. self._load_wrapper()
  389. return self._namespace
  390. def protection(self, force=False):
  391. """Returns the page's current protection status.
  392. Makes an API query if force is True or if we haven't already made one.
  393. Raises InvalidPageError if the page name is invalid. Will not raise an
  394. error if the page is missing because those can still be protected.
  395. """
  396. if self._exists == 0 or force:
  397. self._load_wrapper()
  398. self._force_validity() # invalid pages cannot be protected
  399. return self._protection
  400. def creator(self, force=False):
  401. """Returns the page's creator (i.e., the first user to edit the page).
  402. Makes an API query if force is True or if we haven't already made one.
  403. Normally, we can get the creator along with everything else (except
  404. content) in self._load_attributes(). However, due to a limitation in
  405. the API (can't get the editor of one revision and the content of
  406. another at both ends of the history), if our other attributes were only
  407. loaded from get(), we'll have to do another API query. This is done
  408. by calling ourselves again with force=True.
  409. Raises InvalidPageError or PageNotFoundError if the page name is
  410. invalid or the page does not exist, respectively.
  411. """
  412. if self._exists == 0 or force:
  413. self._load_wrapper()
  414. self._force_existence()
  415. if not self._creator and not force:
  416. self.creator(force=True)
  417. return self._creator
  418. def is_talkpage(self, force=False):
  419. """Returns True if the page is a talkpage, else False.
  420. Like title(), this won't do any API queries on its own unless force is
  421. True. If the API was never queried for this page, we will attempt to
  422. determine the talkpage status ourselves based on its namespace ID.
  423. """
  424. if force:
  425. self._load_wrapper()
  426. return self._is_talkpage
  427. def is_redirect(self, force=False):
  428. """Returns True if the page is a redirect, else False.
  429. Makes an API query if force is True or if we haven't already made one.
  430. We will return False even if the page does not exist or is invalid.
  431. """
  432. if self._exists == 0 or force:
  433. self._load_wrapper()
  434. return self._is_redirect
  435. def toggle_talk(self, force=False, follow_redirects=None):
  436. """Returns a content page's talk page, or vice versa.
  437. The title of the new page is determined by namespace logic, not API
  438. queries. We won't make any API queries on our own unless force is True,
  439. and the only reason then would be to forcibly update the title or
  440. follow redirects if we haven't already made an API query.
  441. If `follow_redirects` is anything other than None (the default), it
  442. will be passed to the new Page's __init__(). Otherwise, we'll use the
  443. value passed to our own __init__().
  444. Will raise InvalidPageError if we try to get the talk page of a special
  445. page (in the Special: or Media: namespaces), but we won't raise an
  446. exception if our page is otherwise missing or invalid.
  447. """
  448. if force:
  449. self._load_wrapper()
  450. if self._namespace < 0:
  451. ns = self._site.namespace_id_to_name(self._namespace)
  452. e = "Pages in the {0} namespace can't have talk pages.".format(ns)
  453. raise InvalidPageError(e)
  454. if self._is_talkpage:
  455. new_ns = self._namespace - 1
  456. else:
  457. new_ns = self._namespace + 1
  458. try:
  459. body = self._title.split(":", 1)[1]
  460. except IndexError:
  461. body = self._title
  462. new_prefix = self._site.namespace_id_to_name(new_ns)
  463. # If the new page is in namespace 0, don't do ":Title" (it's correct,
  464. # but unnecessary), just do "Title":
  465. if new_prefix:
  466. new_title = ':'.join((new_prefix, body))
  467. else:
  468. new_title = body
  469. if follow_redirects is None:
  470. follow_redirects = self._follow_redirects
  471. return Page(self._site, new_title, follow_redirects)
  472. def get(self, force=False):
  473. """Returns page content, which is cached if you try to call get again.
  474. Use `force` to forcibly reload page content even if we've already
  475. loaded some. This is good if you want to edit a page multiple times,
  476. and you want to get updated content before you make your second edit.
  477. Raises InvalidPageError or PageNotFoundError if the page name is
  478. invalid or the page does not exist, respectively.
  479. """
  480. if force or self._exists == 0:
  481. # Kill two birds with one stone by doing an API query for both our
  482. # attributes and our page content:
  483. params = {"action": "query", "rvlimit": 1, "titles": self._title,
  484. "prop": "info|revisions", "inprop": "protection|url",
  485. "intoken": "edit", "rvprop": "content|timestamp"}
  486. result = self._site._api_query(params)
  487. self._load_attributes(result=result)
  488. self._force_existence()
  489. self._load_content(result=result)
  490. # Follow redirects if we're told to:
  491. if self._keep_following and self._is_redirect:
  492. self._title = self.get_redirect_target()
  493. self._keep_following = False # don't follow double redirects
  494. self._content = None # reset the content we just loaded
  495. self.get(force=True)
  496. return self._content
  497. # Make sure we're dealing with a real page here. This may be outdated
  498. # if the page was deleted since we last called self._load_attributes(),
  499. # but self._load_content() can handle that:
  500. self._force_existence()
  501. if self._content is None:
  502. self._load_content()
  503. return self._content
  504. def get_redirect_target(self, force=False):
  505. """If the page is a redirect, returns its destination.
  506. Use `force` to forcibly reload content even if we've already loaded
  507. some before. Note that this method calls get() for page content.
  508. Raises InvalidPageError or PageNotFoundError if the page name is
  509. invalid or the page does not exist, respectively. Raises RedirectError
  510. if the page is not a redirect.
  511. """
  512. content = self.get(force)
  513. regexp = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
  514. try:
  515. return re.findall(regexp, content, flags=re.IGNORECASE)[0]
  516. except IndexError:
  517. e = "The page does not appear to have a redirect target."
  518. raise RedirectError(e)
  519. def edit(self, text, summary, minor=False, bot=True, force=False):
  520. """Replaces the page's content or creates a new page.
  521. `text` is the new page content, with `summary` as the edit summary.
  522. If `minor` is True, the edit will be marked as minor. If `bot` is true,
  523. the edit will be marked as a bot edit, but only if we actually have a
  524. bot flag.
  525. Use `force` to push the new content even if there's an edit conflict or
  526. the page was deleted/recreated between getting our edit token and
  527. editing our page. Be careful with this!
  528. """
  529. self._edit(text=text, summary=summary, minor=minor, bot=bot,
  530. force=force)
  531. def add_section(self, text, title, minor=False, bot=True, force=False):
  532. """Adds a new section to the bottom of the page.
  533. The arguments for this are the same as those for edit(), but instead of
  534. providing a summary, you provide a section title.
  535. Likewise, raised exceptions are the same as edit()'s.
  536. This should create the page if it does not already exist, with just the
  537. new section as content.
  538. """
  539. self._edit(text=text, summary=title, minor=minor, bot=bot, force=force,
  540. section="new")