# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2019 Ben Kurtovic
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from hashlib import md5
from logging import getLogger, NullHandler
import re
from time import gmtime, strftime
from urllib.parse import quote

import mwparserfromhell

from earwigbot import exceptions
from earwigbot.wiki.copyvios import CopyvioMixIn

__all__ = ["Page"]


class Page(CopyvioMixIn):
    """
    **EarwigBot: Wiki Toolset: Page**

    Represents a page on a given :py:class:`~earwigbot.wiki.site.Site`. Has
    methods for getting information about the page, getting page content, and
    so on. :py:class:`~earwigbot.wiki.category.Category` is a subclass of
    :py:class:`Page` with additional methods.

    *Attributes:*

    - :py:attr:`site`:        the page's corresponding Site object
    - :py:attr:`title`:       the page's title, or pagename
    - :py:attr:`exists`:      whether or not the page exists
    - :py:attr:`pageid`:      an integer ID representing the page
    - :py:attr:`url`:         the page's URL
    - :py:attr:`namespace`:   the page's namespace as an integer
    - :py:attr:`lastrevid`:   the ID of the page's most recent revision
    - :py:attr:`protection`:  the page's current protection status
    - :py:attr:`is_talkpage`: ``True`` if this is a talkpage, else ``False``
    - :py:attr:`is_redirect`: ``True`` if this is a redirect, else ``False``

    *Public methods:*

    - :py:meth:`reload`:              forcibly reloads the page's attributes
    - :py:meth:`toggle_talk`:         returns a content page's talk page, or
      vice versa
    - :py:meth:`get`:                 returns the page's content
    - :py:meth:`get_redirect_target`: returns the page's destination if it is a
      redirect
    - :py:meth:`get_creator`:         returns a User object representing the
      first person to edit the page
    - :py:meth:`parse`:               parses the page content for templates,
      links, etc
    - :py:meth:`edit`:                replaces the page's content or creates a
      new page
    - :py:meth:`add_section`:         adds a new section at the bottom of the
      page
    - :py:meth:`check_exclusion`:     checks whether or not we are allowed to
      edit the page, per ``{{bots}}``/``{{nobots}}``

    - :py:meth:`~earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check`:
      checks the page for copyright violations
    - :py:meth:`~earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare`:
      checks the page like :py:meth:`copyvio_check`, but against a specific URL
    """

    # Possible values of the internal existence state (see ``exists``):
    PAGE_UNKNOWN = 0  # the API has not been queried yet
    PAGE_INVALID = 1  # the title can never name a page
    PAGE_MISSING = 2  # the title is valid, but no such page exists
    PAGE_EXISTS = 3

    def __init__(self, site, title, follow_redirects=False, pageid=None,
                 logger=None):
        """Constructor for new Page instances.

        Takes five arguments: a Site object, the Page's title (or pagename),
        whether or not to follow redirects (optional, defaults to False), a
        page ID to supplement the title (optional, defaults to None - i.e., we
        will have to query the API to get it), and a logger to use (optional;
        when omitted, messages go to a silenced "earwigbot.wiki" logger).

        As with User, site.get_page() is preferred.

        __init__() will not do any API queries, but it will use basic namespace
        logic to determine our namespace ID and if we are a talkpage.
        """
        super().__init__(site)
        self._site = site
        self._title = title.strip()
        self._follow_redirects = self._keep_following = follow_redirects
        self._pageid = pageid

        # Set up our internal logger:
        if logger:
            self._logger = logger
        else:  # Just set up a null logger to eat up our messages:
            self._logger = getLogger("earwigbot.wiki")
            # The logger is shared by every Page, so only attach a NullHandler
            # once instead of growing its handler list with each instance:
            if not any(isinstance(handler, NullHandler)
                       for handler in self._logger.handlers):
                self._logger.addHandler(NullHandler())

        # Attributes to be loaded through the API:
        self._exists = self.PAGE_UNKNOWN
        self._is_redirect = None
        self._lastrevid = None
        self._protection = None
        self._fullurl = None
        self._content = None
        self._creator = None

        # Attributes used for editing/deleting/protecting/etc:
        self._basetimestamp = None
        self._starttimestamp = None

        # Try to determine the page's namespace using our site's namespace
        # converter. Compare against the stripped title so that surrounding
        # whitespace alone is never mistaken for a namespace prefix:
        prefix = self._title.split(":", 1)[0]
        if prefix != self._title:
            # (a page that's just titled "Category" or "User" has no prefix)
            try:
                self._namespace = self.site.namespace_name_to_id(prefix)
            except exceptions.NamespaceNotFoundError:
                self._namespace = 0
        else:
            self._namespace = 0

        # Is this a talkpage? Talkpages have odd IDs, while content pages have
        # even IDs, excluding the "special" namespaces:
        if self._namespace < 0:
            self._is_talkpage = False
        else:
            self._is_talkpage = self._namespace % 2 == 1

    def __repr__(self):
        """Return the canonical string representation of the Page."""
        res = "Page(title={0!r}, follow_redirects={1!r}, site={2!r})"
        return res.format(self._title, self._follow_redirects, self._site)

    def __str__(self):
        """Return a nice string representation of the Page."""
        return '<Page "{0}" of {1}>'.format(self.title, str(self.site))

    def _assert_validity(self):
        """Used to ensure that our page's title is valid.

        If this method is called when our page is not valid (and after
        _load_attributes() has been called), InvalidPageError will be raised.

        Note that validity != existence. If a page's title is invalid (e.g, it
        contains "[") it will always be invalid, and cannot be edited.
        """
        if self._exists == self.PAGE_INVALID:
            e = "Page '{0}' is invalid.".format(self._title)
            raise exceptions.InvalidPageError(e)

    def _assert_existence(self):
        """Used to ensure that our page exists.

        If this method is called when our page doesn't exist (and after
        _load_attributes() has been called), PageNotFoundError will be raised.
        It will also call _assert_validity() beforehand.
        """
        self._assert_validity()
        if self._exists == self.PAGE_MISSING:
            e = "Page '{0}' does not exist.".format(self._title)
            raise exceptions.PageNotFoundError(e)

    def _load(self):
        """Call _load_attributes() and follows redirects if we're supposed to.

        This method will only follow redirects if follow_redirects=True was
        passed to __init__() (perhaps indirectly passed by site.get_page()).
        It avoids the API's &redirects param in favor of manual following,
        so we can act more realistically (we don't follow double redirects, and
        circular redirects don't break us).

        This will raise RedirectError if we have a problem following, but that
        is a bug and should NOT happen.

        If we're following a redirect, this will make a grand total of three
        API queries. It's a lot, but each one is quite small.
        """
        self._load_attributes()

        if self._keep_following and self._is_redirect:
            self._title = self.get_redirect_target()
            self._keep_following = False  # don't follow double redirects
            self._content = None  # reset the content we just loaded
            self._load_attributes()

    def _load_attributes(self, result=None):
        """Load various data from the API in a single query.

        Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl,
        ._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, and
        ._starttimestamp using the API. It will do a query of its own unless
        *result* is provided, in which case we'll pretend *result* is what the
        query returned.

        Assuming the API is sound, this should not raise any exceptions.
        """
        if not result:
            query = self.site.api_query
            result = query(action="query", prop="info|revisions",
                           inprop="protection|url", rvprop="user", rvlimit=1,
                           rvdir="newer", titles=self._title)

        if "interwiki" in result["query"]:
            # Interwiki titles are treated as invalid pages here:
            self._title = result["query"]["interwiki"][0]["title"]
            self._exists = self.PAGE_INVALID
            return

        res = list(result["query"]["pages"].values())[0]
        self._title = res["title"]  # Normalize our pagename/title
        self._is_redirect = "redirect" in res

        self._pageid = int(list(result["query"]["pages"].keys())[0])
        if self._pageid < 0:
            if "missing" in res:
                # If it has a negative ID and it's missing; we can still get
                # data like the namespace, protection, and URL:
                self._exists = self.PAGE_MISSING
            else:
                # If it has a negative ID and it's invalid, then break here,
                # because there's no other data for us to get:
                self._exists = self.PAGE_INVALID
                return
        else:
            self._exists = self.PAGE_EXISTS

        self._fullurl = res["fullurl"]
        self._protection = res["protection"]
        self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())

        # We've determined the namespace and talkpage status in __init__()
        # based on the title, but now we can be sure. Talkpages have odd IDs;
        # negative ("special") namespaces never count, as in __init__():
        self._namespace = res["ns"]
        self._is_talkpage = self._namespace >= 0 and self._namespace % 2 == 1

        # These last two fields will only be specified if the page exists:
        self._lastrevid = res.get("lastrevid")
        try:
            self._creator = res['revisions'][0]['user']
        except (KeyError, IndexError):
            # No revision data in this result; keep whatever we had before.
            pass

    def _load_content(self, result=None):
        """Load current page content from the API.

        If *result* is provided, we'll pretend that is the result of an API
        query and try to get content from that. Otherwise, we'll do an API
        query on our own.

        Don't call this directly, ever; use reload() followed by get() if you
        want to force content reloading.
        """
        if not result:
            query = self.site.api_query
            result = query(action="query", prop="revisions", rvlimit=1,
                           rvprop="content|timestamp", rvslots="main",
                           titles=self._title)

        res = list(result["query"]["pages"].values())[0]
        try:
            revision = res["revisions"][0]
            self._content = revision["slots"]["main"]["*"]
            self._basetimestamp = revision["timestamp"]
        except (KeyError, IndexError):
            # This can only happen if the page was deleted since we last called
            # self._load_attributes(). In that case, some of our attributes are
            # outdated, so force another self._load_attributes():
            self._load_attributes()
            self._assert_existence()

    def _edit(self, params=None, text=None, summary=None, minor=None, bot=None,
              force=None, section=None, captcha_id=None, captcha_word=None):
        """Edit the page!

        If *params* is given, we'll use it as our API query parameters.
        Otherwise, we'll build params using the given kwargs via
        _build_edit_params().

        We'll then try to do the API query, and catch any errors the API raises
        in _handle_edit_errors(). We'll then throw these back as subclasses of
        EditError.
        """
        # Weed out invalid pages before we get too far:
        self._assert_validity()

        # Build our API query string:
        if not params:
            params = self._build_edit_params(text, summary, minor, bot, force,
                                             section, captcha_id, captcha_word)
        else:  # Make sure we have the right token:
            params["token"] = self.site.get_token()

        # Try the API query, catching most errors with our handler:
        try:
            result = self.site.api_query(**params)
        except exceptions.APIError as error:
            if not hasattr(error, "code"):
                raise  # We can only handle errors with a code attribute
            result = self._handle_edit_errors(error, params)

        # If everything was successful, reset invalidated attributes:
        if result["edit"]["result"] == "Success":
            self._content = None
            self._basetimestamp = None
            self._exists = self.PAGE_UNKNOWN
            return

        # Otherwise, there was some kind of problem. Throw an exception:
        raise exceptions.EditError(result["edit"])

    def _build_edit_params(self, text, summary, minor, bot, force, section,
                           captcha_id, captcha_word):
        """Given some keyword arguments, build an API edit query string."""
        unitxt = text.encode("utf8") if isinstance(text, str) else text
        hashed = md5(unitxt).hexdigest()  # Checksum to ensure text is correct
        params = {
            "action": "edit", "title": self._title, "text": text,
            "token": self.site.get_token(), "summary": summary, "md5": hashed}

        if section:
            params["section"] = section
        if captcha_id and captcha_word:
            params["captchaid"] = captcha_id
            params["captchaword"] = captcha_word
        if minor:
            params["minor"] = "true"
        else:
            params["notminor"] = "true"
        if bot:
            params["bot"] = "true"

        if not force:
            # Send our timestamps along with the edit when not forcing:
            if self._starttimestamp:
                params["starttimestamp"] = self._starttimestamp
            if self._basetimestamp:
                params["basetimestamp"] = self._basetimestamp
            if self._exists == self.PAGE_MISSING:
                # Page does not exist; don't edit if it already exists:
                params["createonly"] = "true"
        else:
            params["recreate"] = "true"

        return params

    def _handle_edit_errors(self, error, params, retry=True):
        """If our edit fails due to some error, try to handle it.

        We'll either raise an appropriate exception (for example, if the page
        is protected), or we'll try to fix it (for example, if the token is
        invalid, we'll try to get a new one).
        """
        perms = ["noedit", "noedit-anon", "cantcreate", "cantcreate-anon",
                 "protectedtitle", "noimageredirect", "noimageredirect-anon",
                 "blocked"]
        if error.code in perms:
            raise exceptions.PermissionsError(error.info)
        elif error.code in ["editconflict", "pagedeleted", "articleexists"]:
            # These attributes are now invalidated:
            self._content = None
            self._basetimestamp = None
            self._exists = self.PAGE_UNKNOWN
            raise exceptions.EditConflictError(error.info)
        elif error.code == "badtoken" and retry:
            # Retry exactly once with a freshly fetched token:
            params["token"] = self.site.get_token(force=True)
            try:
                return self.site.api_query(**params)
            except exceptions.APIError as err:
                if not hasattr(err, "code"):
                    raise  # We can only handle errors with a code attribute
                return self._handle_edit_errors(err, params, retry=False)
        elif error.code in ["emptypage", "emptynewsection"]:
            raise exceptions.NoContentError(error.info)
        elif error.code == "contenttoobig":
            raise exceptions.ContentTooBigError(error.info)
        elif error.code == "spamdetected":
            raise exceptions.SpamDetectedError(error.info)
        elif error.code == "filtered":
            raise exceptions.FilteredError(error.info)
        raise exceptions.EditError(": ".join((error.code, error.info)))

    @property
    def site(self):
        """The page's corresponding Site object."""
        return self._site

    @property
    def title(self):
        """The page's title, or "pagename".

        This won't do any API queries on its own. Any other attributes or
        methods that do API queries will reload the title, however, like
        :py:attr:`exists` and :py:meth:`get`, potentially "normalizing" it or
        following redirects if :py:attr:`self._follow_redirects` is ``True``.
        """
        return self._title

    @property
    def exists(self):
        """Whether or not the page exists.

        This will be a number; its value does not matter, but it will equal
        one of :py:attr:`self.PAGE_INVALID <Page.PAGE_INVALID>`,
        :py:attr:`self.PAGE_MISSING <Page.PAGE_MISSING>`, or
        :py:attr:`self.PAGE_EXISTS <Page.PAGE_EXISTS>`.

        Makes an API query only if we haven't already made one.
        """
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        return self._exists

    @property
    def pageid(self):
        """An integer ID representing the page.

        Makes an API query only if we haven't already made one and the *pageid*
        parameter to :py:meth:`__init__` was left as ``None``, which should be
        true for all cases except when pages are returned by an SQL generator
        (like :py:meth:`category.get_members()
        <earwigbot.wiki.category.Category.get_members>`).

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._pageid:
            return self._pageid
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_existence()  # Missing pages do not have IDs
        return self._pageid

    @property
    def url(self):
        """The page's URL.

        Like :py:meth:`title`, this won't do any API queries on its own. If the
        API was never queried for this page, we will attempt to determine the
        URL ourselves based on the title.
        """
        if self._fullurl:
            return self._fullurl
        # quote() accepts str directly and percent-encodes it as UTF-8, so no
        # manual encode()/decode() round trip is needed:
        slug = quote(self._title.replace(" ", "_"), safe="/:")
        path = self.site._article_path.replace("$1", slug)
        return "".join((self.site.url, path))

    @property
    def namespace(self):
        """The page's namespace ID (an integer).

        Like :py:meth:`title`, this won't do any API queries on its own. If the
        API was never queried for this page, we will attempt to determine the
        namespace ourselves based on the title.
        """
        return self._namespace

    @property
    def lastrevid(self):
        """The ID of the page's most recent revision.

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_existence()  # Missing pages don't have revisions
        return self._lastrevid

    @property
    def protection(self):
        """The page's current protection status.

        Makes an API query only if we haven't already made one.

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` if the page
        name is invalid. Won't raise an error if the page is missing because
        those can still be create-protected.
        """
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_validity()  # Invalid pages cannot be protected
        return self._protection

    @property
    def is_talkpage(self):
        """``True`` if the page is a talkpage, otherwise ``False``.

        Like :py:meth:`title`, this won't do any API queries on its own. If the
        API was never queried for this page, we will attempt to determine
        whether it is a talkpage ourselves based on its namespace.
        """
        return self._is_talkpage

    @property
    def is_redirect(self):
        """``True`` if the page is a redirect, otherwise ``False``.

        Makes an API query only if we haven't already made one.

        We will return ``False`` even if the page does not exist or is invalid.
        """
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        return self._is_redirect

    def reload(self):
        """Forcibly reload the page's attributes.

        Emphasis on *reload*: this is only necessary if there is reason to
        believe they have changed.
        """
        self._load()
        if self._content is not None:
            # Only reload content if it has already been loaded:
            self._load_content()

    def toggle_talk(self, follow_redirects=None):
        """Return a content page's talk page, or vice versa.

        The title of the new page is determined by namespace logic, not API
        queries. We won't make any API queries on our own.

        If *follow_redirects* is anything other than ``None`` (the default), it
        will be passed to the new :py:class:`~earwigbot.wiki.page.Page`
        object's :py:meth:`__init__`. Otherwise, we'll use the value passed to
        our own :py:meth:`__init__`.

        Will raise :py:exc:`~earwigbot.exceptions.InvalidPageError` if we try
        to get the talk page of a special page (in the ``Special:`` or
        ``Media:`` namespaces), but we won't raise an exception if our page is
        otherwise missing or invalid.
        """
        if self._namespace < 0:
            ns = self.site.namespace_id_to_name(self._namespace)
            e = "Pages in the {0} namespace can't have talk pages.".format(ns)
            raise exceptions.InvalidPageError(e)

        # A talk namespace sits one above its content namespace:
        if self._is_talkpage:
            new_ns = self._namespace - 1
        else:
            new_ns = self._namespace + 1

        if self._namespace != 0:
            body = self._title.split(":", 1)[1]
        else:
            body = self._title

        new_prefix = self.site.namespace_id_to_name(new_ns)

        # If the new page is in namespace 0, don't do ":Title" (it's correct,
        # but unnecessary), just do "Title":
        if new_prefix:
            new_title = ":".join((new_prefix, body))
        else:
            new_title = body

        if follow_redirects is None:
            follow_redirects = self._follow_redirects
        return Page(self.site, new_title, follow_redirects)

    def get(self):
        """Return page content, which is cached if you try to call get again.

        Raises InvalidPageError or PageNotFoundError if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == self.PAGE_UNKNOWN:
            # Kill two birds with one stone by doing an API query for both our
            # attributes and our page content:
            query = self.site.api_query
            result = query(action="query", rvlimit=1, titles=self._title,
                           prop="info|revisions", inprop="protection|url",
                           rvprop="content|timestamp", rvslots="main")
            self._load_attributes(result=result)
            self._assert_existence()
            self._load_content(result=result)

            # Follow redirects if we're told to:
            if self._keep_following and self._is_redirect:
                self._title = self.get_redirect_target()
                self._keep_following = False  # Don't follow double redirects
                self._exists = self.PAGE_UNKNOWN  # Force another API query
                self.get()

            return self._content

        # Make sure we're dealing with a real page here. This may be outdated
        # if the page was deleted since we last called self._load_attributes(),
        # but self._load_content() can handle that:
        self._assert_existence()

        if self._content is None:
            self._load_content()

        return self._content

    def get_redirect_target(self):
        """If the page is a redirect, return its destination.

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively. Raises
        :py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a
        redirect.
        """
        re_redirect = r"^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
        content = self.get()
        try:
            return re.findall(re_redirect, content, flags=re.I)[0]
        except IndexError:
            e = "The page does not appear to have a redirect target."
            raise exceptions.RedirectError(e)

    def get_creator(self):
        """Return the User object for the first person to edit the page.

        Makes an API query only if we haven't already made one. Normally, we
        can get the creator along with everything else (except content) in
        :py:meth:`_load_attributes`. However, due to a limitation in the API
        (can't get the editor of one revision and the content of another at
        both ends of the history), if our other attributes were only loaded
        through :py:meth:`get`, we'll have to do another API query.

        Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        if self._exists == self.PAGE_UNKNOWN:
            self._load()
        self._assert_existence()
        if not self._creator:
            # Attributes were loaded without the creator, so query again:
            self._load()
            self._assert_existence()
        return self.site.get_user(self._creator)

    def parse(self):
        """Parse the page content for templates, links, etc.

        Actual parsing is handled by :py:mod:`mwparserfromhell`. Raises
        :py:exc:`~earwigbot.exceptions.InvalidPageError` or
        :py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is
        invalid or the page does not exist, respectively.
        """
        return mwparserfromhell.parse(self.get())

    def edit(self, text, summary, minor=False, bot=True, force=False):
        """Replace the page's content or creates a new page.

        *text* is the new page content, with *summary* as the edit summary.
        If *minor* is ``True``, the edit will be marked as minor. If *bot* is
        ``True``, the edit will be marked as a bot edit, but only if we
        actually have a bot flag.

        Use *force* to push the new content even if there's an edit conflict or
        the page was deleted/recreated between getting our edit token and
        editing our page. Be careful with this!
        """
        self._edit(text=text, summary=summary, minor=minor, bot=bot,
                   force=force)

    def add_section(self, text, title, minor=False, bot=True, force=False):
        """Add a new section to the bottom of the page.

        The arguments for this are the same as those for :py:meth:`edit`, but
        instead of providing a summary, you provide a section title. Likewise,
        raised exceptions are the same as :py:meth:`edit`'s.

        This should create the page if it does not already exist, with just the
        new section as content.
        """
        self._edit(text=text, summary=title, minor=minor, bot=bot, force=force,
                   section="new")

    def check_exclusion(self, username=None, optouts=None):
        """Check whether or not we are allowed to edit the page.

        Return ``True`` if we *are* allowed to edit this page, and ``False`` if
        we aren't.

        *username* is used to determine whether we are part of a specific list
        of allowed or disallowed bots (e.g. ``{{bots|allow=EarwigBot}}`` or
        ``{{bots|deny=FooBot,EarwigBot}}``). It's ``None`` by default, which
        will swipe our username from the ``name`` of
        :py:meth:`site.get_user() <earwigbot.wiki.site.Site.get_user>`.

        *optouts* is a list of messages to consider this check as part of for
        the purpose of opt-out; it defaults to ``None``, which ignores the
        parameter completely. For example, if *optouts* is ``["nolicense"]``,
        we'll return ``False`` on ``{{bots|optout=nolicense}}`` or
        ``{{bots|optout=all}}``, but ``True`` on
        ``{{bots|optout=orfud,norationale,replaceable}}``.
        """
        def parse_param(template, param):
            """Return the comma-separated parameter as lowercase items."""
            value = template.get(param).value
            return [item.strip().lower() for item in value.split(",")]

        if not username:
            username = self.site.get_user().name

        # Lowercase everything:
        username = username.lower()
        optouts = [optout.lower() for optout in optouts] if optouts else []

        r_bots = r"\{\{\s*(no)?bots\s*(\||\}\})"
        templates = self.parse().ifilter_templates(recursive=True,
                                                   matches=r_bots)
        for template in templates:
            if template.has_param("deny"):
                denies = parse_param(template, "deny")
                if "all" in denies or username in denies:
                    return False
            if template.has_param("allow"):
                allows = parse_param(template, "allow")
                if "all" in allows or username in allows:
                    # Explicitly allowed; skip this template's other checks:
                    continue
            if optouts and template.has_param("optout"):
                tasks = parse_param(template, "optout")
                matches = [optout in tasks for optout in optouts]
                if "all" in tasks or any(matches):
                    return False
            if template.name.strip().lower() == "nobots":
                return False

        return True