Преглед изворни кода

Major additions to Page in wikitools.

* Page: added about 400 lines of rambling nonsense.
* Exceptions: added RedirectError.
tags/v0.1^2
Ben Kurtovic пре 12 година
родитељ
комит
e4f8fb2e21
2 измењених фајлова са 401 додато и 16 уклоњено
  1. +4
    -0
      wiki/tools/exceptions.py
  2. +397
    -16
      wiki/tools/page.py

+ 4
- 0
wiki/tools/exceptions.py Прегледај датотеку

@@ -38,6 +38,10 @@ class InvalidPageError(WikiToolsetError):
"""Attempting to get certain information about a page whose title is
invalid."""

class RedirectError(WikiToolsetError):
"""Page's get_redirect_target() method failed because the page is either
not a redirect, or it is malformed."""

class UserNotFoundError(WikiToolsetError):
"""Attempting to get certain information about a user that does not
exist."""

+ 397
- 16
wiki/tools/page.py Прегледај датотеку

@@ -1,33 +1,414 @@
# -*- coding: utf-8 -*-

import re
from urllib import quote

from wiki.tools.exceptions import *

class Page(object):
"""
EarwigBot's Wiki Toolset: Page Class

Represents a Page on a given Site. Has methods for getting information
about the page, getting page content, and so on. Category is a subclass of
Page with additional methods.

Public methods:
title -- returns the page's title, or pagename
exists -- returns whether the page exists
pageid -- returns an integer ID representing the page
url -- returns the page's URL
namespace -- returns the page's namespace as an integer
protection -- returns the page's current protection status
is_talkpage -- returns True if the page is a talkpage, else False
is_redirect -- returns True if the page is a redirect, else False
toggle_talk -- returns a content page's talk page, or vice versa
get -- returns page content
get_redirect_target -- if the page is a redirect, returns its destination
"""

def __init__(self, site, title):
"""
Docstring needed
def __init__(self, site, title, follow_redirects=False):
"""Constructor for new Page instances.

Takes three arguments: a Site object, the Page's title (or pagename),
and whether or not to follow redirects (optional, defaults to False).

As with User, site.get_page() is preferred. Site's method has support
for a default `follow_redirects` value in our config, while __init__
always defaults to False.

__init__ will not do any API queries, but it will use basic namespace
logic to determine our namespace ID and if we are a talkpage.
"""
self.site = site
self.title = title
self._site = site
self._title = title.strip()
self._follow_redirects = self._keep_following = follow_redirects

self._exists = 0
self._pageid = None
self._is_redirect = None
self._lastrevid = None
self._protection = None
self._fullurl = None
self._content = None

def exists(self):
# Try to determine the page's namespace using our site's namespace
# converter:
prefix = self._title.split(":", 1)[0]
if prefix != title: # ignore a page that's titled "Category" or "User"
try:
self._namespace = self._site.namespace_name_to_id(prefix)
except NamespaceNotFoundError:
self._namespace = 0
else:
self._namespace = 0

# Is this a talkpage? Talkpages have odd IDs, while content pages have
# even IDs, excluding the "special" namespaces:
if self._namespace < 0:
self._is_talkpage = False
else:
self._is_talkpage = self._namespace % 2 == 1

def _force_validity(self):
"""Used to ensure that our page's title is valid.

If this method is called when our page is not valid (and after
_load_attributes() has been called), InvalidPageError will be raised.

Note that validity != existence. If a page's title is invalid (e.g, it
contains "[") it will always be invalid, and cannot be edited.
"""
Docstring needed
if self._exists == 1:
e = "Page '{0}' is invalid.".format(self._title)
raise InvalidPageError(e)

def _force_existence(self):
"""Used to ensure that our page exists.

If this method is called when our page doesn't exist (and after
_load_attributes() has been called), PageNotFoundError will be raised.
It will also call _force_validity() beforehand.
"""
pass
self._force_validity()
if self._exists == 2:
e = "Page '{0}' does not exist.".format(self._title)
raise PageNotFoundError(e)

def _load_wrapper(self):
"""Calls _load_attributes() and follows redirects if we're supposed to.

This method will only follow redirects if follow_redirects=True was
passed to __init__() (perhaps indirectly passed by site.get_page()).
It avoids the API's &redirects param in favor of manual following,
so we can act more realistically (we don't follow double redirects, and
circular redirects don't break us).

def get(self, force_reload=False):
This will raise RedirectError if we have a problem following, but that
is a bug and should NOT happen.

If we're following a redirect, this will make a grand total of three
API queries. It's a lot, but each one is quite small.
"""
Docstring needed
self._load_attributes()

if self._keep_following and self._is_redirect:
self._title = self.get_redirect_target()
self._keep_following = False # don't follow double redirects
self._content = None # reset the content we just loaded
self._load_attributes()

def _load_attributes(self, result=None):
"""Loads various data from the API in a single query.

Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl,
._protection, ._namespace, ._is_talkpage, and ._lastrevid using the
API. It will do a query of its own unless `result` is provided, in
which case we'll pretend `result` is what the query returned.

Assuming the API is sound, this should not raise any exceptions.
"""
if self._content is None or force_reload:
params = {"action": "query", "prop": "revisions",
"rvprop": "content", "rvlimit": 1, "titles": self.title}
result = self.site.api_query(params)
content = result["query"]["pages"].values()[0]["revisions"][0]["*"]
if result is None:
params = {"action": "query", "prop": "info", "titles": self._title,
"inprop": "protection|url"}
result = self._site._api_query(params)

res = result["query"]["pages"].values()[0]

# Normalize our pagename/title thing:
self._title = res["title"]

try:
res["redirect"]
except KeyError:
self._is_redirect = False
else:
self._is_redirect = True

self._pageid = result["query"]["pages"].keys()[0]
if int(self._pageid) < 0:
try:
res["missing"]
except KeyError:
# If it has a negative ID and it's invalid, then break here,
# because there's no other data for us to get:
self._exists = 1
return
else:
# If it has a negative ID and it's missing; we can still get
# data like the namespace, protection, and URL:
self._exists = 2
else:
self._exists = 3

self._fullurl = res["fullurl"]
self._protection = res["protection"]

# We've determined the namespace and talkpage status in __init__()
# based on the title, but now we can be sure:
self._namespace = res["ns"]
self._is_talkpage = self._namespace % 2 == 1 # talkpages have odd IDs

# This last field will only be specified if the page exists:
try:
self._lastrevid = res["lastrevid"]
except KeyError:
pass

def _load_content(self, result=None):
"""Loads current page content from the API.

If `result` is provided, we'll pretend that is the result of an API
query and try to get content from that. Otherwise, we'll do an API
query on our own.

Don't call this directly, ever - use .get(force=True) if you want to
force content reloading.
"""
if result is None:
params = {"action": "query", "prop": "revisions", "rvlimit": 1,
"rvprop": "content", "titles": self._title}
result = self._site._api_query(params)

res = result["query"]["pages"].values()[0]
try:
content = res["revisions"][0]["*"]
self._content = content
return content
except KeyError:
# This can only happen if the page was deleted since we last called
# self._load_attributes(). In that case, some of our attributes are
# outdated, so force another self._load_attributes():
self._load_attributes()
self._force_existence()

def title(self, force=False):
"""Returns the Page's title, or pagename.

This won't do any API queries on its own unless force is True, in which
case the title will be forcibly reloaded from the API (normalizing it,
and following redirects if follow_redirects=True was passed to
__init__()). Any other methods that do API queries will reload title on
their own, however, like exists() and get().
"""
if force:
self._load_wrapper()
return self._title

def exists(self, force=False):
"""Returns information about whether the Page exists or not.

The returned "information" is a tuple with two items. The first is a
bool, either True if the page exists or False if it does not. The
second is a string giving more information, either "invalid", (title
is invalid, e.g. it contains "["), "missing", or "exists".

Makes an API query if force is True or if we haven't already made one.
"""
cases = {
0: (None, "unknown"),
1: (False, "invalid"),
2: (False, "missing"),
3: (True, "exists"),
}
if self._exists == 0 or force:
self._load_wrapper()
return cases[self._exists]

def pageid(self, force=False):
"""Returns an integer ID representing the Page.

Makes an API query if force is True or if we haven't already made one.

Raises InvalidPageError or PageNotFoundError if the page name is
invalid or the page does not exist, respectively.
"""
if self._exists == 0 or force:
self._load_wrapper()
self._force_existence() # missing pages do not have IDs
return self._pageid

def url(self, force=False):
"""Returns the page's URL.

Like title(), this won't do any API queries on its own unless force is
True. If the API was never queried for this page, we will attempt to
determine the URL ourselves based on the title.
"""
if force:
self._load_wrapper()
if self._fullurl is not None:
return self._fullurl
else:
slug = quote(self._title.replace(" ", "_"), safe="/:")
path = self._site._article_path.replace("$1", slug)
return ''.join((self._site._base_url, path))

def namespace(self, force=False):
"""Returns the page's namespace ID (an integer).

Like title(), this won't do any API queries on its own unless force is
True. If the API was never queried for this page, we will attempt to
determine the namespace ourselves based on the title.
"""
if force:
self._load_wrapper()
return self._namespace

def protection(self, force=False):
"""Returns the page's current protection status.

Makes an API query if force is True or if we haven't already made one.

Raises InvalidPageError if the page name is invalid. Will not raise an
error if the page is missing because those can still be protected.
"""
if self._exists == 0 or force:
self._load_wrapper()
self._force_validity() # invalid pages cannot be protected
return self._protection

def is_talkpage(self, force=False):
"""Returns True if the page is a talkpage, else False.

Like title(), this won't do any API queries on its own unless force is
True. If the API was never queried for this page, we will attempt to
determine the talkpage status ourselves based on its namespace ID.
"""
if force:
self._load_wrapper()
return self._is_talkpage

def is_redirect(self, force=False):
"""Returns True if the page is a redirect, else False.

Makes an API query if force is True or if we haven't already made one.

We will return False even if the page does not exist or is invalid.
"""
if self._exists == 0 or force:
self._load_wrapper()
return self._is_redirect

def toggle_talk(self, force=False, follow_redirects=None):
"""Returns a content page's talk page, or vice versa.

The title of the new page is determined by namespace logic, not API
queries. We won't make any API queries on our own unless force is True,
and the only reason then would be to forcibly update the title or
follow redirects if we haven't already made an API query.

If `follow_redirects` is anything other than None (the default), it
will be passed to the new Page's __init__(). Otherwise, we'll use the
value passed to our own __init__().

Will raise InvalidPageError if we try to get the talk page of a special
page (in the Special: or Media: namespaces), but we won't raise an
exception if our page is otherwise missing or invalid.
"""
if force:
self._load_wrapper()
if self._namespace < 0:
ns = self._site.namespace_id_to_name(self._namespace)
e = "Pages in the {0} namespace can't have talk pages.".format(ns)
raise InvalidPageError(e)

if self._is_talkpage:
new_ns = self._namespace - 1
else:
new_ns = self._namespace + 1

try:
body = self._title.split(":", 1)[1]
except IndexError:
body = self._title

new_prefix = self._site.namespace_id_to_name(new_ns)

# If the new page is in namespace 0, don't do ":Title" (it's correct,
# but unnecessary), just do "Title":
if new_prefix:
new_title = ':'.join((new_prefix, body))
else:
new_title = body

if follow_redirects is None:
follow_redirects = self._follow_redirects
return Page(self._site, new_title, follow_redirects)

def get(self, force=False):
"""Returns page content, which is cached if you try to call get again.

Use `force` to forcibly reload page content even if we've already
loaded some. This is good if you want to edit a page multiple times,
and you want to get updated content before you make your second edit.

Raises InvalidPageError or PageNotFoundError if the page name is
invalid or the page does not exist, respectively.
"""
if force or self._exists == 0:
# Kill two birds with one stone by doing an API query for both our
# attributes and our page content:
params = {"action": "query", "rvprop": "content", "rvlimit": 1,
"prop": "info|revisions", "inprop": "protection|url",
"titles": self._title}
result = self._site._api_query(params)
self._load_attributes(result=result)
self._force_existence()
self._load_content(result=result)

# Follow redirects if we're told to:
if self._keep_following and self._is_redirect:
self._title = self.get_redirect_target()
self._keep_following = False # don't follow double redirects
self._content = None # reset the content we just loaded
self.get(force=True)

return self._content

# Make sure we're dealing with a real page here. This may be outdated
# if the page was deleted since we last called self._load_attributes(),
# but self._load_content() can handle that:
self._force_existence()

if self._content is None:
self._load_content()

return self._content

def get_redirect_target(self, force=False):
"""If the page is a redirect, returns its destination.

Use `force` to forcibly reload content even if we've already loaded
some before. Note that this method calls get() for page content.

Raises InvalidPageError or PageNotFoundError if the page name is
invalid or the page does not exist, respectively. Raises RedirectError
if the page is not a redirect.
"""
content = self.get(force)
regexp = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
try:
return re.findall(regexp, content, flags=re.IGNORECASE)[0]
except IndexError:
e = "The page does not appear to have a redirect target."
raise RedirectError(e)

Loading…
Откажи
Сачувај