A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

732 satır
30 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from hashlib import md5
  23. from logging import getLogger, NullHandler
  24. import re
  25. from time import gmtime, strftime
  26. from urllib import quote
  27. import mwparserfromhell
  28. from earwigbot import exceptions
  29. from earwigbot.wiki.copyvios import CopyvioMixIn
# Public API of this module: only the Page class is exported.
__all__ = ["Page"]
class Page(CopyvioMixIn):
    """
    **EarwigBot: Wiki Toolset: Page**

    Represents a page on a given :py:class:`~earwigbot.wiki.site.Site`. Has
    methods for getting information about the page, getting page content, and
    so on. :py:class:`~earwigbot.wiki.category.Category` is a subclass of
    :py:class:`Page` with additional methods.

    *Attributes:*

    - :py:attr:`site`:        the page's corresponding Site object
    - :py:attr:`title`:       the page's title, or pagename
    - :py:attr:`exists`:      whether or not the page exists
    - :py:attr:`pageid`:      an integer ID representing the page
    - :py:attr:`url`:         the page's URL
    - :py:attr:`namespace`:   the page's namespace as an integer
    - :py:attr:`protection`:  the page's current protection status
    - :py:attr:`is_talkpage`: ``True`` if this is a talkpage, else ``False``
    - :py:attr:`is_redirect`: ``True`` if this is a redirect, else ``False``

    *Public methods:*

    - :py:meth:`reload`:              forcibly reloads the page's attributes
    - :py:meth:`toggle_talk`:         returns a content page's talk page, or
      vice versa
    - :py:meth:`get`:                 returns the page's content
    - :py:meth:`get_redirect_target`: returns the page's destination if it is
      a redirect
    - :py:meth:`get_creator`:         returns a User object representing the
      first person to edit the page
    - :py:meth:`parse`:               parses the page content for templates,
      links, etc
    - :py:meth:`edit`:                replaces the page's content or creates a
      new page
    - :py:meth:`add_section`:         adds a new section at the bottom of the
      page
    - :py:meth:`check_exclusion`:     checks whether or not we are allowed to
      edit the page, per ``{{bots}}``/``{{nobots}}``

    - :py:meth:`~earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check`:
      checks the page for copyright violations
    - :py:meth:`~earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare`:
      checks the page like :py:meth:`copyvio_check`, but against a specific
      URL
    """

    # Possible values of the ``exists`` attribute:
    PAGE_UNKNOWN = 0  # nothing has been loaded from the API yet
    PAGE_INVALID = 1  # the API gave a negative page ID without "missing"
    PAGE_MISSING = 2  # the API gave a negative page ID flagged "missing"
    PAGE_EXISTS = 3   # the API gave a non-negative page ID
  70. def __init__(self, site, title, follow_redirects=False, pageid=None,
  71. logger=None):
  72. """Constructor for new Page instances.
  73. Takes four arguments: a Site object, the Page's title (or pagename),
  74. whether or not to follow redirects (optional, defaults to False), and
  75. a page ID to supplement the title (optional, defaults to None - i.e.,
  76. we will have to query the API to get it).
  77. As with User, site.get_page() is preferred.
  78. __init__() will not do any API queries, but it will use basic namespace
  79. logic to determine our namespace ID and if we are a talkpage.
  80. """
  81. super(Page, self).__init__(site)
  82. self._site = site
  83. self._title = title.strip()
  84. self._follow_redirects = self._keep_following = follow_redirects
  85. self._pageid = pageid
  86. # Set up our internal logger:
  87. if logger:
  88. self._logger = logger
  89. else: # Just set up a null logger to eat up our messages:
  90. self._logger = getLogger("earwigbot.wiki")
  91. self._logger.addHandler(NullHandler())
  92. # Attributes to be loaded through the API:
  93. self._exists = self.PAGE_UNKNOWN
  94. self._is_redirect = None
  95. self._lastrevid = None
  96. self._protection = None
  97. self._fullurl = None
  98. self._content = None
  99. self._creator = None
  100. # Attributes used for editing/deleting/protecting/etc:
  101. self._token = None
  102. self._basetimestamp = None
  103. self._starttimestamp = None
  104. # Try to determine the page's namespace using our site's namespace
  105. # converter:
  106. prefix = self._title.split(":", 1)[0]
  107. if prefix != title: # ignore a page that's titled "Category" or "User"
  108. try:
  109. self._namespace = self.site.namespace_name_to_id(prefix)
  110. except exceptions.NamespaceNotFoundError:
  111. self._namespace = 0
  112. else:
  113. self._namespace = 0
  114. # Is this a talkpage? Talkpages have odd IDs, while content pages have
  115. # even IDs, excluding the "special" namespaces:
  116. if self._namespace < 0:
  117. self._is_talkpage = False
  118. else:
  119. self._is_talkpage = self._namespace % 2 == 1
  120. def __repr__(self):
  121. """Return the canonical string representation of the Page."""
  122. res = "Page(title={0!r}, follow_redirects={1!r}, site={2!r})"
  123. return res.format(self._title, self._follow_redirects, self._site)
  124. def __str__(self):
  125. """Return a nice string representation of the Page."""
  126. return '<Page "{0}" of {1}>'.format(self.title, str(self.site))
  127. def _assert_validity(self):
  128. """Used to ensure that our page's title is valid.
  129. If this method is called when our page is not valid (and after
  130. _load_attributes() has been called), InvalidPageError will be raised.
  131. Note that validity != existence. If a page's title is invalid (e.g, it
  132. contains "[") it will always be invalid, and cannot be edited.
  133. """
  134. if self._exists == self.PAGE_INVALID:
  135. e = u"Page '{0}' is invalid.".format(self._title)
  136. raise exceptions.InvalidPageError(e)
  137. def _assert_existence(self):
  138. """Used to ensure that our page exists.
  139. If this method is called when our page doesn't exist (and after
  140. _load_attributes() has been called), PageNotFoundError will be raised.
  141. It will also call _assert_validity() beforehand.
  142. """
  143. self._assert_validity()
  144. if self._exists == self.PAGE_MISSING:
  145. e = u"Page '{0}' does not exist.".format(self._title)
  146. raise exceptions.PageNotFoundError(e)
  147. def _load(self):
  148. """Call _load_attributes() and follows redirects if we're supposed to.
  149. This method will only follow redirects if follow_redirects=True was
  150. passed to __init__() (perhaps indirectly passed by site.get_page()).
  151. It avoids the API's &redirects param in favor of manual following,
  152. so we can act more realistically (we don't follow double redirects, and
  153. circular redirects don't break us).
  154. This will raise RedirectError if we have a problem following, but that
  155. is a bug and should NOT happen.
  156. If we're following a redirect, this will make a grand total of three
  157. API queries. It's a lot, but each one is quite small.
  158. """
  159. self._load_attributes()
  160. if self._keep_following and self._is_redirect:
  161. self._title = self.get_redirect_target()
  162. self._keep_following = False # don't follow double redirects
  163. self._content = None # reset the content we just loaded
  164. self._load_attributes()
  165. def _load_attributes(self, result=None):
  166. """Load various data from the API in a single query.
  167. Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl,
  168. ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid,
  169. ._token, and ._starttimestamp using the API. It will do a query of
  170. its own unless *result* is provided, in which case we'll pretend
  171. *result* is what the query returned.
  172. Assuming the API is sound, this should not raise any exceptions.
  173. """
  174. if not result:
  175. query = self.site.api_query
  176. result = query(action="query", rvprop="user", intoken="edit",
  177. prop="info|revisions", rvlimit=1, rvdir="newer",
  178. titles=self._title, inprop="protection|url")
  179. res = result["query"]["pages"].values()[0]
  180. self._title = res["title"] # Normalize our pagename/title
  181. self._is_redirect = "redirect" in res
  182. self._pageid = int(result["query"]["pages"].keys()[0])
  183. if self._pageid < 0:
  184. if "missing" in res:
  185. # If it has a negative ID and it's missing; we can still get
  186. # data like the namespace, protection, and URL:
  187. self._exists = self.PAGE_MISSING
  188. else:
  189. # If it has a negative ID and it's invalid, then break here,
  190. # because there's no other data for us to get:
  191. self._exists = self.PAGE_INVALID
  192. return
  193. else:
  194. self._exists = self.PAGE_EXISTS
  195. self._fullurl = res["fullurl"]
  196. self._protection = res["protection"]
  197. try:
  198. self._token = res["edittoken"]
  199. except KeyError:
  200. pass
  201. else:
  202. self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())
  203. # We've determined the namespace and talkpage status in __init__()
  204. # based on the title, but now we can be sure:
  205. self._namespace = res["ns"]
  206. self._is_talkpage = self._namespace % 2 == 1 # talkpages have odd IDs
  207. # These last two fields will only be specified if the page exists:
  208. self._lastrevid = res.get("lastrevid")
  209. try:
  210. self._creator = res['revisions'][0]['user']
  211. except KeyError:
  212. pass
  213. def _load_content(self, result=None):
  214. """Load current page content from the API.
  215. If *result* is provided, we'll pretend that is the result of an API
  216. query and try to get content from that. Otherwise, we'll do an API
  217. query on our own.
  218. Don't call this directly, ever; use reload() followed by get() if you
  219. want to force content reloading.
  220. """
  221. if not result:
  222. query = self.site.api_query
  223. result = query(action="query", prop="revisions", rvlimit=1,
  224. rvprop="content|timestamp", titles=self._title)
  225. res = result["query"]["pages"].values()[0]
  226. try:
  227. self._content = res["revisions"][0]["*"]
  228. self._basetimestamp = res["revisions"][0]["timestamp"]
  229. except KeyError:
  230. # This can only happen if the page was deleted since we last called
  231. # self._load_attributes(). In that case, some of our attributes are
  232. # outdated, so force another self._load_attributes():
  233. self._load_attributes()
  234. self._assert_existence()
  235. def _edit(self, params=None, text=None, summary=None, minor=None, bot=None,
  236. force=None, section=None, captcha_id=None, captcha_word=None):
  237. """Edit the page!
  238. If *params* is given, we'll use it as our API query parameters.
  239. Otherwise, we'll build params using the given kwargs via
  240. _build_edit_params().
  241. We'll then try to do the API query, and catch any errors the API raises
  242. in _handle_edit_errors(). We'll then throw these back as subclasses of
  243. EditError.
  244. """
  245. # Try to get our edit token, and die if we can't:
  246. if not self._token:
  247. self._load_attributes()
  248. if not self._token:
  249. e = "You don't have permission to edit this page."
  250. raise exceptions.PermissionsError(e)
  251. # Weed out invalid pages before we get too far:
  252. self._assert_validity()
  253. # Build our API query string:
  254. if not params:
  255. params = self._build_edit_params(text, summary, minor, bot, force,
  256. section, captcha_id, captcha_word)
  257. else: # Make sure we have the right token:
  258. params["token"] = self._token
  259. self._token = None # Token now invalid
  260. # Try the API query, catching most errors with our handler:
  261. try:
  262. result = self.site.api_query(**params)
  263. except exceptions.APIError as error:
  264. if not hasattr(error, "code"):
  265. raise # We can only handle errors with a code attribute
  266. result = self._handle_edit_errors(error, params)
  267. # If everything was successful, reset invalidated attributes:
  268. if result["edit"]["result"] == "Success":
  269. self._content = None
  270. self._basetimestamp = None
  271. self._exists = self.PAGE_UNKNOWN
  272. return
  273. # Otherwise, there was some kind of problem. Throw an exception:
  274. raise exceptions.EditError(result["edit"])
  275. def _build_edit_params(self, text, summary, minor, bot, force, section,
  276. captcha_id, captcha_word):
  277. """Given some keyword arguments, build an API edit query string."""
  278. unitxt = text.encode("utf8") if isinstance(text, unicode) else text
  279. hashed = md5(unitxt).hexdigest() # Checksum to ensure text is correct
  280. params = {"action": "edit", "title": self._title, "text": text,
  281. "token": self._token, "summary": summary, "md5": hashed}
  282. if section:
  283. params["section"] = section
  284. if captcha_id and captcha_word:
  285. params["captchaid"] = captcha_id
  286. params["captchaword"] = captcha_word
  287. if minor:
  288. params["minor"] = "true"
  289. else:
  290. params["notminor"] = "true"
  291. if bot:
  292. params["bot"] = "true"
  293. if not force:
  294. params["starttimestamp"] = self._starttimestamp
  295. if self._basetimestamp:
  296. params["basetimestamp"] = self._basetimestamp
  297. if self._exists == self.PAGE_MISSING:
  298. # Page does not exist; don't edit if it already exists:
  299. params["createonly"] = "true"
  300. else:
  301. params["recreate"] = "true"
  302. return params
  303. def _handle_edit_errors(self, error, params, retry=True):
  304. """If our edit fails due to some error, try to handle it.
  305. We'll either raise an appropriate exception (for example, if the page
  306. is protected), or we'll try to fix it (for example, if the token is
  307. invalid, we'll try to get a new one).
  308. """
  309. perms = ["noedit", "noedit-anon", "cantcreate", "cantcreate-anon",
  310. "protectedtitle", "noimageredirect", "noimageredirect-anon",
  311. "blocked"]
  312. if error.code in perms:
  313. raise exceptions.PermissionsError(error.info)
  314. elif error.code in ["editconflict", "pagedeleted", "articleexists"]:
  315. # These attributes are now invalidated:
  316. self._content = None
  317. self._basetimestamp = None
  318. self._exists = self.PAGE_UNKNOWN
  319. raise exceptions.EditConflictError(error.info)
  320. elif error.code == "badtoken" and retry:
  321. params["token"] = self.site.get_token("edit")
  322. try:
  323. return self.site.api_query(**params)
  324. except exceptions.APIError as error:
  325. if not hasattr(error, "code"):
  326. raise # We can only handle errors with a code attribute
  327. result = self._handle_edit_errors(error, params, retry=False)
  328. elif error.code in ["emptypage", "emptynewsection"]:
  329. raise exceptions.NoContentError(error.info)
  330. elif error.code == "contenttoobig":
  331. raise exceptions.ContentTooBigError(error.info)
  332. elif error.code == "spamdetected":
  333. raise exceptions.SpamDetectedError(error.info)
  334. elif error.code == "filtered":
  335. raise exceptions.FilteredError(error.info)
  336. raise exceptions.EditError(": ".join((error.code, error.info)))
    @property
    def site(self):
        """The page's corresponding Site object.

        This is the object that was passed to :py:meth:`__init__`; no API
        queries are made.
        """
        return self._site
    @property
    def title(self):
        """The page's title, or "pagename".

        This won't do any API queries on its own. Any other attributes or
        methods that do API queries will reload the title, however, like
        :py:attr:`exists` and :py:meth:`get`, potentially "normalizing" it or
        following redirects if :py:attr:`self._follow_redirects` is ``True``.
        Until then, this is the title given to :py:meth:`__init__` with
        surrounding whitespace stripped.
        """
        return self._title
  350. @property
  351. def exists(self):
  352. """Whether or not the page exists.
  353. This will be a number; its value does not matter, but it will equal
  354. one of :py:attr:`self.PAGE_INVALID <PAGE_INVALID>`,
  355. :py:attr:`self.PAGE_MISSING <PAGE_MISSING>`, or
  356. :py:attr:`self.PAGE_EXISTS <PAGE_EXISTS>`.
  357. Makes an API query only if we haven't already made one.
  358. """
  359. if self._exists == self.PAGE_UNKNOWN:
  360. self._load()
  361. return self._exists
  362. @property
  363. def pageid(self):
  364. """An integer ID representing the page.
  365. Makes an API query only if we haven't already made one and the *pageid*
  366. parameter to :py:meth:`__init__` was left as ``None``, which should be
  367. true for all cases except when pages are returned by an SQL generator
  368. (like :py:meth:`category.get_members()
  369. <earwigbot.wiki.category.Category.get_members>`).
  370. Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
  371. :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
  372. invalid or the page does not exist, respectively.
  373. """
  374. if self._pageid:
  375. return self._pageid
  376. if self._exists == self.PAGE_UNKNOWN:
  377. self._load()
  378. self._assert_existence() # Missing pages do not have IDs
  379. return self._pageid
  380. @property
  381. def url(self):
  382. """The page's URL.
  383. Like :py:meth:`title`, this won't do any API queries on its own. If the
  384. API was never queried for this page, we will attempt to determine the
  385. URL ourselves based on the title.
  386. """
  387. if self._fullurl:
  388. return self._fullurl
  389. else:
  390. encoded = self._title.encode("utf8").replace(" ", "_")
  391. slug = quote(encoded, safe="/:").decode("utf8")
  392. path = self.site._article_path.replace("$1", slug)
  393. return u"".join((self.site.url, path))
    @property
    def namespace(self):
        """The page's namespace ID (an integer).

        Like :py:meth:`title`, this won't do any API queries on its own. If the
        API was never queried for this page, we will attempt to determine the
        namespace ourselves based on the title (this guess is made in
        :py:meth:`__init__` and falls back to ``0`` when the prefix is not a
        known namespace).
        """
        return self._namespace
  402. @property
  403. def protection(self):
  404. """The page's current protection status.
  405. Makes an API query only if we haven't already made one.
  406. Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` if the page
  407. name is invalid. Won't raise an error if the page is missing because
  408. those can still be create-protected.
  409. """
  410. if self._exists == self.PAGE_UNKNOWN:
  411. self._load()
  412. self._assert_validity() # Invalid pages cannot be protected
  413. return self._protection
    @property
    def is_talkpage(self):
        """``True`` if the page is a talkpage, otherwise ``False``.

        Like :py:meth:`title`, this won't do any API queries on its own. If the
        API was never queried for this page, we will attempt to determine
        whether it is a talkpage ourselves based on its namespace (odd,
        non-negative namespace IDs are talk namespaces).
        """
        return self._is_talkpage
  422. @property
  423. def is_redirect(self):
  424. """``True`` if the page is a redirect, otherwise ``False``.
  425. Makes an API query only if we haven't already made one.
  426. We will return ``False`` even if the page does not exist or is invalid.
  427. """
  428. if self._exists == self.PAGE_UNKNOWN:
  429. self._load()
  430. return self._is_redirect
  431. def reload(self):
  432. """Forcibly reload the page's attributes.
  433. Emphasis on *reload*: this is only necessary if there is reason to
  434. believe they have changed.
  435. """
  436. self._load()
  437. if self._content is not None:
  438. # Only reload content if it has already been loaded:
  439. self._load_content()
  440. def toggle_talk(self, follow_redirects=None):
  441. """Return a content page's talk page, or vice versa.
  442. The title of the new page is determined by namespace logic, not API
  443. queries. We won't make any API queries on our own.
  444. If *follow_redirects* is anything other than ``None`` (the default), it
  445. will be passed to the new :py:class:`~earwigbot.wiki.page.Page`
  446. object's :py:meth:`__init__`. Otherwise, we'll use the value passed to
  447. our own :py:meth:`__init__`.
  448. Will raise :py:exc:`~earwigbot.exceptions.InvalidPageError` if we try
  449. to get the talk page of a special page (in the ``Special:`` or
  450. ``Media:`` namespaces), but we won't raise an exception if our page is
  451. otherwise missing or invalid.
  452. """
  453. if self._namespace < 0:
  454. ns = self.site.namespace_id_to_name(self._namespace)
  455. e = u"Pages in the {0} namespace can't have talk pages.".format(ns)
  456. raise exceptions.InvalidPageError(e)
  457. if self._is_talkpage:
  458. new_ns = self._namespace - 1
  459. else:
  460. new_ns = self._namespace + 1
  461. try:
  462. body = self._title.split(":", 1)[1]
  463. except IndexError:
  464. body = self._title
  465. new_prefix = self.site.namespace_id_to_name(new_ns)
  466. # If the new page is in namespace 0, don't do ":Title" (it's correct,
  467. # but unnecessary), just do "Title":
  468. if new_prefix:
  469. new_title = u":".join((new_prefix, body))
  470. else:
  471. new_title = body
  472. if follow_redirects is None:
  473. follow_redirects = self._follow_redirects
  474. return Page(self.site, new_title, follow_redirects)
  475. def get(self):
  476. """Return page content, which is cached if you try to call get again.
  477. Raises InvalidPageError or PageNotFoundError if the page name is
  478. invalid or the page does not exist, respectively.
  479. """
  480. if self._exists == self.PAGE_UNKNOWN:
  481. # Kill two birds with one stone by doing an API query for both our
  482. # attributes and our page content:
  483. query = self.site.api_query
  484. result = query(action="query", rvlimit=1, titles=self._title,
  485. prop="info|revisions", inprop="protection|url",
  486. intoken="edit", rvprop="content|timestamp")
  487. self._load_attributes(result=result)
  488. self._assert_existence()
  489. self._load_content(result=result)
  490. # Follow redirects if we're told to:
  491. if self._keep_following and self._is_redirect:
  492. self._title = self.get_redirect_target()
  493. self._keep_following = False # Don't follow double redirects
  494. self._exists = self.PAGE_UNKNOWN # Force another API query
  495. self.get()
  496. return self._content
  497. # Make sure we're dealing with a real page here. This may be outdated
  498. # if the page was deleted since we last called self._load_attributes(),
  499. # but self._load_content() can handle that:
  500. self._assert_existence()
  501. if self._content is None:
  502. self._load_content()
  503. return self._content
  504. def get_redirect_target(self):
  505. """If the page is a redirect, return its destination.
  506. Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
  507. :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
  508. invalid or the page does not exist, respectively. Raises
  509. :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
  510. redirect.
  511. """
  512. re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
  513. content = self.get()
  514. try:
  515. return re.findall(re_redirect, content, flags=re.I)[0]
  516. except IndexError:
  517. e = "The page does not appear to have a redirect target."
  518. raise exceptions.RedirectError(e)
  519. def get_creator(self):
  520. """Return the User object for the first person to edit the page.
  521. Makes an API query only if we haven't already made one. Normally, we
  522. can get the creator along with everything else (except content) in
  523. :py:meth:`_load_attributes`. However, due to a limitation in the API
  524. (can't get the editor of one revision and the content of another at
  525. both ends of the history), if our other attributes were only loaded
  526. through :py:meth:`get`, we'll have to do another API query.
  527. Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
  528. :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
  529. invalid or the page does not exist, respectively.
  530. """
  531. if self._exists == self.PAGE_UNKNOWN:
  532. self._load()
  533. self._assert_existence()
  534. if not self._creator:
  535. self._load()
  536. self._assert_existence()
  537. return self.site.get_user(self._creator)
  538. def parse(self):
  539. """Parse the page content for templates, links, etc.
  540. Actual parsing is handled by :py:mod:`mwparserfromhell`. Raises
  541. :py:exc:`~earwigbot.exceptions.InvalidPageError` or
  542. :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
  543. invalid or the page does not exist, respectively.
  544. """
  545. return mwparserfromhell.parse(self.get())
  546. def edit(self, text, summary, minor=False, bot=True, force=False):
  547. """Replace the page's content or creates a new page.
  548. *text* is the new page content, with *summary* as the edit summary.
  549. If *minor* is ``True``, the edit will be marked as minor. If *bot* is
  550. ``True``, the edit will be marked as a bot edit, but only if we
  551. actually have a bot flag.
  552. Use *force* to push the new content even if there's an edit conflict or
  553. the page was deleted/recreated between getting our edit token and
  554. editing our page. Be careful with this!
  555. """
  556. self._edit(text=text, summary=summary, minor=minor, bot=bot,
  557. force=force)
  558. def add_section(self, text, title, minor=False, bot=True, force=False):
  559. """Add a new section to the bottom of the page.
  560. The arguments for this are the same as those for :py:meth:`edit`, but
  561. instead of providing a summary, you provide a section title. Likewise,
  562. raised exceptions are the same as :py:meth:`edit`'s.
  563. This should create the page if it does not already exist, with just the
  564. new section as content.
  565. """
  566. self._edit(text=text, summary=title, minor=minor, bot=bot, force=force,
  567. section="new")
  568. def check_exclusion(self, username=None, optouts=None):
  569. """Check whether or not we are allowed to edit the page.
  570. Return ``True`` if we *are* allowed to edit this page, and ``False`` if
  571. we aren't.
  572. *username* is used to determine whether we are part of a specific list
  573. of allowed or disallowed bots (e.g. ``{{bots|allow=EarwigBot}}`` or
  574. ``{{bots|deny=FooBot,EarwigBot}}``). It's ``None`` by default, which
  575. will swipe our username from :py:meth:`site.get_user()
  576. <earwigbot.wiki.site.Site.get_user>`.\
  577. :py:attr:`~earwigbot.wiki.user.User.name`.
  578. *optouts* is a list of messages to consider this check as part of for
  579. the purpose of opt-out; it defaults to ``None``, which ignores the
  580. parameter completely. For example, if *optouts* is ``["nolicense"]``,
  581. we'll return ``False`` on ``{{bots|optout=nolicense}}`` or
  582. ``{{bots|optout=all}}``, but `True` on
  583. ``{{bots|optout=orfud,norationale,replaceable}}``.
  584. """
  585. def parse_param(template, param):
  586. value = template.get(param).value
  587. return [item.strip().lower() for item in value.split(",")]
  588. if not username:
  589. username = self.site.get_user().name
  590. # Lowercase everything:
  591. username = username.lower()
  592. optouts = [optout.lower() for optout in optouts] if optouts else []
  593. r_bots = "\{\{\s*(no)?bots\s*(\||\}\})"
  594. filter = self.parse().ifilter_templates(recursive=True, matches=r_bots)
  595. for template in filter:
  596. if template.has_param("deny"):
  597. denies = parse_param(template, "deny")
  598. if "all" in denies or username in denies:
  599. return False
  600. if template.has_param("allow"):
  601. allows = parse_param(template, "allow")
  602. if "all" in allows or username in allows:
  603. continue
  604. if optouts and template.has_param("optout"):
  605. tasks = parse_param(template, "optout")
  606. matches = [optout in tasks for optout in optouts]
  607. if "all" in tasks or any(matches):
  608. return False
  609. if template.name.strip().lower() == "nobots":
  610. return False
  611. return True