A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

206 řádky
8.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from earwigbot.wiki.page import Page
  23. __all__ = ["Category"]
  class Category(Page):
      """
      **EarwigBot: Wiki Toolset: Category**

      Represents a category on a given :py:class:`~earwigbot.wiki.site.Site`, a
      subclass of :py:class:`~earwigbot.wiki.page.Page`. Provides additional
      methods, but :py:class:`~earwigbot.wiki.page.Page`'s own methods should
      work fine on :py:class:`Category` objects. :py:meth:`site.get_page()
      <earwigbot.wiki.site.Site.get_page>` will return a :py:class:`Category`
      instead of a :py:class:`~earwigbot.wiki.page.Page` if the given title is
      in the category namespace;
      :py:meth:`~earwigbot.wiki.site.Site.get_category` is shorthand, accepting
      category names without the namespace prefix.

      *Attributes:*

      - :py:attr:`size`:    the total number of members in the category
      - :py:attr:`pages`:   the number of pages in the category
      - :py:attr:`files`:   the number of files in the category
      - :py:attr:`subcats`: the number of subcategories in the category

      *Public methods:*

      - :py:meth:`get_members`: iterates over Pages in the category
      """

      def __repr__(self):
          """Return the canonical string representation of the Category."""
          res = "Category(title={0!r}, follow_redirects={1!r}, site={2!r})"
          return res.format(self._title, self._follow_redirects, self._site)

      def __str__(self):
          """Return a nice string representation of the Category."""
          return '<Category "{0}" of {1}>'.format(self.title, str(self.site))

      def _get_members_via_api(self, limit, follow):
          """Iterate over Pages in the category using the API.

          *limit* is the maximum number of pages to yield; a false value
          (``None`` or ``0``) means "no limit". *follow* is handed to
          :py:meth:`site.get_page() <earwigbot.wiki.site.Site.get_page>` as
          *follow_redirects* for every member.

          Results are fetched one batch at a time; another query is only made
          when the previous batch has been consumed, the API reported a
          continuation point, and the limit has not been reached yet.
          """
          params = {"action": "query", "list": "categorymembers",
                    "cmtitle": self.title}
          # Number of pages we are still allowed to yield; None == unlimited.
          remaining = limit if limit else None

          while True:
              params["cmlimit"] = remaining if remaining else "max"
              result = self.site.api_query(**params)
              members = result["query"]["categorymembers"]
              for member in members:
                  title = member["title"]
                  yield self.site.get_page(title, follow_redirects=follow)

              if "query-continue" not in result:
                  break  # The category has been exhausted.
              if remaining is not None:
                  remaining -= len(members)
                  if remaining <= 0:
                      # Stop here. Letting the counter fall to zero and
                      # looping again would make it look like "no limit" and
                      # silently fetch the rest of the category.
                      break
              qcontinue = result["query-continue"]["categorymembers"]
              params["cmcontinue"] = qcontinue["cmcontinue"]

      def _get_members_via_sql(self, limit, follow):
          """Iterate over Pages in the category using SQL.

          *limit* is the maximum number of pages to yield; a false value
          (``None`` or ``0``) means "no limit". *follow* is handed to
          :py:meth:`site.get_page() <earwigbot.wiki.site.Site.get_page>` as
          *follow_redirects* for every member.

          All rows are read into a list before the first page is yielded.
          """
          query = """SELECT page_title, page_namespace, page_id FROM page
                     JOIN categorylinks ON page_id = cl_from
                     WHERE cl_to = ?"""
          # The query matches on the title with underscores instead of spaces
          # and without the namespace prefix (everything up to the first ":").
          title = self.title.replace(" ", "_").split(":", 1)[1]

          if limit:
              query += " LIMIT ?"
              result = self.site.sql_query(query, (title, limit))
          else:
              result = self.site.sql_query(query, (title,))

          # Materialize the rows up front; see the note in get_members().
          members = list(result)
          for row in members:
              base = row[0].replace("_", " ").decode("utf8")
              namespace = self.site.namespace_id_to_name(row[1])
              if namespace:
                  title = u":".join((namespace, base))
              else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
                  title = base
              yield self.site.get_page(title, follow_redirects=follow,
                                       pageid=row[2])

      def _get_size_via_api(self, member_type):
          """Return the size of the category using the API.

          *member_type* is the ``categoryinfo`` field to return: ``"size"``,
          ``"pages"``, ``"files"``, or ``"subcats"``. If the API response has
          no ``categoryinfo`` entry for this category, ``0`` is returned, which
          matches what the SQL implementation returns when no rows match.
          """
          result = self.site.api_query(action="query", prop="categoryinfo",
                                       titles=self.title)
          # Only one title was requested, so take the first page entry.
          # next(iter(...)) works on both Python 2 and Python 3 dict views.
          page = next(iter(result["query"]["pages"].values()))
          # NOTE(review): the API appears to leave out "categoryinfo" for
          # empty or nonexistent categories - confirm against the API docs.
          info = page.get("categoryinfo")
          if info is None:
              return 0
          return info[member_type]

      def _get_size_via_sql(self, member_type):
          """Return the size of the category using SQL.

          *member_type* is ``"size"`` to count every member, or ``"pages"``,
          ``"files"``, or ``"subcats"`` to count only members of that type.
          """
          query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
          # Same title form as in _get_members_via_sql(): underscores, and no
          # namespace prefix.
          title = self.title.replace(" ", "_").split(":", 1)[1]

          if member_type == "size":
              result = self.site.sql_query(query, (title,))
          else:
              query += " AND cl_type = ?"
              # Drop the trailing "s" ("pages" -> "page", "subcats" ->
              # "subcat") to get the value compared against cl_type.
              result = self.site.sql_query(query, (title, member_type[:-1]))
          return list(result)[0][0]

      def _get_size(self, member_type):
          """Return the size of the category.

          Hands the work to either the API or the SQL implementation through
          :py:meth:`site.delegate() <earwigbot.wiki.site.Site.delegate>`.
          """
          services = {
              self.site.SERVICE_API: self._get_size_via_api,
              self.site.SERVICE_SQL: self._get_size_via_sql
          }
          return self.site.delegate(services, (member_type,))

      @property
      def size(self):
          """The total number of members in the category.

          Includes pages, files, and subcats. Equal to :py:attr:`pages` +
          :py:attr:`files` + :py:attr:`subcats`. This will use either the API
          or SQL depending on which are enabled and the amount of lag on each.
          This is handled by :py:meth:`site.delegate()
          <earwigbot.wiki.site.Site.delegate>`.
          """
          return self._get_size("size")

      @property
      def pages(self):
          """The number of pages in the category.

          This will use either the API or SQL depending on which are enabled
          and the amount of lag on each. This is handled by
          :py:meth:`site.delegate() <earwigbot.wiki.site.Site.delegate>`.
          """
          return self._get_size("pages")

      @property
      def files(self):
          """The number of files in the category.

          This will use either the API or SQL depending on which are enabled
          and the amount of lag on each. This is handled by
          :py:meth:`site.delegate() <earwigbot.wiki.site.Site.delegate>`.
          """
          return self._get_size("files")

      @property
      def subcats(self):
          """The number of subcategories in the category.

          This will use either the API or SQL depending on which are enabled
          and the amount of lag on each. This is handled by
          :py:meth:`site.delegate() <earwigbot.wiki.site.Site.delegate>`.
          """
          return self._get_size("subcats")

      def get_members(self, limit=None, follow_redirects=None):
          """Iterate over Pages in the category.

          If *limit* is given, we will provide this many pages, or less if the
          category is smaller. By default, *limit* is ``None``, meaning we will
          keep iterating over members until the category is exhausted.
          *follow_redirects* is passed directly to :py:meth:`site.get_page()
          <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
          will use the value passed to our :py:meth:`__init__`.

          This will use either the API or SQL depending on which are enabled
          and the amount of lag on each. This is handled by
          :py:meth:`site.delegate() <earwigbot.wiki.site.Site.delegate>`.

          .. note::
             Be careful when iterating over very large categories with no
             limit. If using the API, at best, you will make one query per
             5000 pages, which can add up significantly for categories with
             hundreds of thousands of members. As for SQL, note that *all page
             titles are stored internally* as soon as the query is made, so
             the site-wide SQL lock can be freed and unrelated queries can be
             made without requiring a separate connection to be opened. This
             is generally not an issue unless your category's size approaches
             several hundred thousand, in which case the sheer number of
             titles in memory becomes problematic.
          """
          services = {
              self.site.SERVICE_API: self._get_members_via_api,
              self.site.SERVICE_SQL: self._get_members_via_sql
          }
          if follow_redirects is None:
              follow_redirects = self._follow_redirects
          return self.site.delegate(services, (limit, follow_redirects))