A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

209 lines
8.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from earwigbot.wiki.page import Page
  23. __all__ = ["Category"]
  24. class Category(Page):
  25. """
  26. **EarwigBot: Wiki Toolset: Category**
  27. Represents a category on a given :py:class:`~earwigbot.wiki.site.Site`, a
  28. subclass of :py:class:`~earwigbot.wiki.page.Page`. Provides additional
  29. methods, but :py:class:`~earwigbot.wiki.page.Page`'s own methods should
  30. work fine on :py:class:`Category` objects. :py:meth:`site.get_page()
  31. <earwigbot.wiki.site.Site.get_page>` will return a :py:class:`Category`
  32. instead of a :py:class:`~earwigbot.wiki.page.Page` if the given title is in
  33. the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category`
  34. is shorthand, accepting category names without the namespace prefix.
  35. *Attributes:*
  36. - :py:attr:`size`: the total number of members in the category
  37. - :py:attr:`pages`: the number of pages in the category
  38. - :py:attr:`files`: the number of files in the category
  39. - :py:attr:`subcats`: the number of subcategories in the category
  40. *Public methods:*
  41. - :py:meth:`get_members`: iterates over Pages in the category
  42. """
  43. def __repr__(self):
  44. """Return the canonical string representation of the Category."""
  45. res = "Category(title={0!r}, follow_redirects={1!r}, site={2!r})"
  46. return res.format(self._title, self._follow_redirects, self._site)
  47. def __str__(self):
  48. """Return a nice string representation of the Category."""
  49. return '<Category "{0}" of {1}>'.format(self.title, str(self.site))
  50. def __iter__(self):
  51. """Iterate over all members of the category."""
  52. return self.get_members()
  53. def _get_members_via_api(self, limit, follow):
  54. """Iterate over Pages in the category using the API."""
  55. params = {"action": "query", "list": "categorymembers",
  56. "cmtitle": self.title, "continue": ""}
  57. while 1:
  58. params["cmlimit"] = limit if limit else "max"
  59. result = self.site.api_query(**params)
  60. for member in result["query"]["categorymembers"]:
  61. title = member["title"]
  62. yield self.site.get_page(title, follow_redirects=follow)
  63. if "continue" in result:
  64. params.update(result["continue"])
  65. if limit:
  66. limit -= len(result["query"]["categorymembers"])
  67. else:
  68. break
  69. def _get_members_via_sql(self, limit, follow):
  70. """Iterate over Pages in the category using SQL."""
  71. query = """SELECT page_title, page_namespace, page_id FROM page
  72. JOIN categorylinks ON page_id = cl_from
  73. WHERE cl_to = ?"""
  74. title = self.title.replace(" ", "_").split(":", 1)[1]
  75. if limit:
  76. query += " LIMIT ?"
  77. result = self.site.sql_query(query, (title, limit), buffsize=0)
  78. else:
  79. result = self.site.sql_query(query, (title,), buffsize=0)
  80. members = list(result)
  81. for row in members:
  82. base = row[0].replace("_", " ").decode("utf8")
  83. namespace = self.site.namespace_id_to_name(row[1])
  84. if namespace:
  85. title = ":".join((namespace, base))
  86. else: # Avoid doing a silly (albeit valid) ":Pagename" thing
  87. title = base
  88. yield self.site.get_page(title, follow_redirects=follow,
  89. pageid=row[2])
  90. def _get_size_via_api(self, member_type):
  91. """Return the size of the category using the API."""
  92. result = self.site.api_query(action="query", prop="categoryinfo",
  93. titles=self.title)
  94. info = list(result["query"]["pages"].values())[0]["categoryinfo"]
  95. return info[member_type]
  96. def _get_size_via_sql(self, member_type):
  97. """Return the size of the category using SQL."""
  98. query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
  99. title = self.title.replace(" ", "_").split(":", 1)[1]
  100. if member_type == "size":
  101. result = self.site.sql_query(query, (title,))
  102. else:
  103. query += " AND cl_type = ?"
  104. result = self.site.sql_query(query, (title, member_type[:-1]))
  105. return list(result)[0][0]
  106. def _get_size(self, member_type):
  107. """Return the size of the category."""
  108. services = {
  109. self.site.SERVICE_API: self._get_size_via_api,
  110. self.site.SERVICE_SQL: self._get_size_via_sql
  111. }
  112. return self.site.delegate(services, (member_type,))
  113. @property
  114. def size(self):
  115. """The total number of members in the category.
  116. Includes pages, files, and subcats. Equal to :py:attr:`pages` +
  117. :py:attr:`files` + :py:attr:`subcats`. This will use either the API or
  118. SQL depending on which are enabled and the amount of lag on each. This
  119. is handled by :py:meth:`site.delegate()
  120. <earwigbot.wiki.site.Site.delegate>`.
  121. """
  122. return self._get_size("size")
  123. @property
  124. def pages(self):
  125. """The number of pages in the category.
  126. This will use either the API or SQL depending on which are enabled and
  127. the amount of lag on each. This is handled by :py:meth:`site.delegate()
  128. <earwigbot.wiki.site.Site.delegate>`.
  129. """
  130. return self._get_size("pages")
  131. @property
  132. def files(self):
  133. """The number of files in the category.
  134. This will use either the API or SQL depending on which are enabled and
  135. the amount of lag on each. This is handled by :py:meth:`site.delegate()
  136. <earwigbot.wiki.site.Site.delegate>`.
  137. """
  138. return self._get_size("files")
  139. @property
  140. def subcats(self):
  141. """The number of subcategories in the category.
  142. This will use either the API or SQL depending on which are enabled and
  143. the amount of lag on each. This is handled by :py:meth:`site.delegate()
  144. <earwigbot.wiki.site.Site.delegate>`.
  145. """
  146. return self._get_size("subcats")
  147. def get_members(self, limit=None, follow_redirects=None):
  148. """Iterate over Pages in the category.
  149. If *limit* is given, we will provide this many pages, or less if the
  150. category is smaller. By default, *limit* is ``None``, meaning we will
  151. keep iterating over members until the category is exhausted.
  152. *follow_redirects* is passed directly to :py:meth:`site.get_page()
  153. <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
  154. will use the value passed to our :py:meth:`__init__`.
  155. This will use either the API or SQL depending on which are enabled and
  156. the amount of lag on each. This is handled by :py:meth:`site.delegate()
  157. <earwigbot.wiki.site.Site.delegate>`.
  158. .. note::
  159. Be careful when iterating over very large categories with no limit.
  160. If using the API, at best, you will make one query per 5000 pages,
  161. which can add up significantly for categories with hundreds of
  162. thousands of members. As for SQL, note that *all page titles are
  163. stored internally* as soon as the query is made, so the site-wide
  164. SQL lock can be freed and unrelated queries can be made without
  165. requiring a separate connection to be opened. This is generally not
  166. an issue unless your category's size approaches several hundred
  167. thousand, in which case the sheer number of titles in memory becomes
  168. problematic.
  169. """
  170. services = {
  171. self.site.SERVICE_API: self._get_members_via_api,
  172. self.site.SERVICE_SQL: self._get_members_via_sql
  173. }
  174. if follow_redirects is None:
  175. follow_redirects = self._follow_redirects
  176. return self.site.delegate(services, (limit, follow_redirects))