A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

169 rindas
7.2 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from earwigbot.wiki.page import Page
  23. __all__ = ["Category"]
  24. class Category(Page):
  25. """
  26. **EarwigBot: Wiki Toolset: Category**
  27. Represents a category on a given :py:class:`~earwigbot.wiki.site.Site`, a
  28. subclass of :py:class:`~earwigbot.wiki.page.Page`. Provides additional
  29. methods, but :py:class:`~earwigbot.wiki.page.Page`'s own methods should
  30. work fine on :py:class:`Category` objects. :py:meth:`site.get_page()
  31. <earwigbot.wiki.site.Site.get_page>` will return a :py:class:`Category`
  32. instead of a :py:class:`~earwigbot.wiki.page.Page` if the given title is in
  33. the category namespace; :py:meth:`~earwigbot.wiki.site.Site.get_category`
  34. is shorthand, accepting category names without the namespace prefix.
  35. *Public methods:*
  36. - :py:meth:`get_members`: iterates over Pages in the category
  37. """
  38. def __repr__(self):
  39. """Return the canonical string representation of the Category."""
  40. res = "Category(title={0!r}, follow_redirects={1!r}, site={2!r})"
  41. return res.format(self._title, self._follow_redirects, self._site)
  42. def __str__(self):
  43. """Return a nice string representation of the Category."""
  44. return '<Category "{0}" of {1}>'.format(self.title, str(self.site))
  45. def _get_members_via_sql(self, limit, follow):
  46. """Iterate over Pages in the category using SQL."""
  47. query = """SELECT page_title, page_namespace, page_id FROM page
  48. JOIN categorylinks ON page_id = cl_from
  49. WHERE cl_to = ?"""
  50. title = self.title.replace(" ", "_").split(":", 1)[1]
  51. if limit:
  52. query += " LIMIT ?"
  53. result = self.site.sql_query(query, (title, limit))
  54. else:
  55. result = self.site.sql_query(query, (title,))
  56. members = list(result)
  57. for row in members:
  58. base = row[0].replace("_", " ").decode("utf8")
  59. namespace = self.site.namespace_id_to_name(row[1])
  60. if namespace:
  61. title = u":".join((namespace, base))
  62. else: # Avoid doing a silly (albeit valid) ":Pagename" thing
  63. title = base
  64. yield self.site.get_page(title, follow_redirects=follow,
  65. pageid=row[2])
  66. def _get_members_via_api(self, limit, follow):
  67. """Iterate over Pages in the category using the API."""
  68. params = {"action": "query", "list": "categorymembers",
  69. "cmtitle": self.title}
  70. while 1:
  71. params["cmlimit"] = limit if limit else "max"
  72. result = self.site.api_query(**params)
  73. for member in result["query"]["categorymembers"]:
  74. title = member["title"]
  75. yield self.site.get_page(title, follow_redirects=follow)
  76. if "query-continue" in result:
  77. qcontinue = result["query-continue"]["categorymembers"]
  78. params["cmcontinue"] = qcontinue["cmcontinue"]
  79. if limit:
  80. limit -= len(result["query"]["categorymembers"])
  81. else:
  82. break
  83. def _get_size_via_sql(self, member_type):
  84. query = "SELECT COUNT(*) FROM categorylinks WHERE cl_to = ?"
  85. title = self.title.replace(" ", "_").split(":", 1)[1]
  86. if member_type == "size":
  87. result = self.site.sql_query(query, (title,))
  88. else:
  89. query += " AND cl_type = ?"
  90. result = self.site.sql_query(query, (title, member_type[:-1]))
  91. return list(result)[0]
  92. def _get_size_via_sql(self, member_type):
  93. result = self.site.api_query(action="query", prop="categoryinfo",
  94. cmtitle=self.title)
  95. info = result["query"]["pages"].values()[0]["categoryinfo"]
  96. return info[member_type]
  97. def _get_size(self, member_type):
  98. services = {
  99. self.site.SERVICE_API: self._size_via_api,
  100. self.site.SERVICE_SQL: self._size_via_sql
  101. }
  102. return self.site.delegate(services, (member_type,))
  103. @property
  104. def size(self):
  105. return self._get_size("size")
  106. @property
  107. def pages(self):
  108. return self._get_size("pages")
  109. @property
  110. def files(self):
  111. return self._get_size("files")
  112. @property
  113. def subcats(self):
  114. return self._get_size("subcats")
  115. def get_members(self, use_sql=False, limit=None, follow_redirects=None):
  116. """Iterate over Pages in the category.
  117. If *use_sql* is ``True``, we will use a SQL query instead of the API.
  118. Note that pages are retrieved from the API in chunks (by default, in
  119. 500-page chunks for normal users and 5000-page chunks for bots and
  120. admins), so queries may be made as we go along. If *limit* is given, we
  121. will provide this many pages, or less if the category is smaller. By
  122. default, *limit* is ``None``, meaning we will keep iterating over
  123. members until the category is exhausted. *follow_redirects* is passed
  124. directly to :py:meth:`site.get_page()
  125. <earwigbot.wiki.site.Site.get_page>`; it defaults to ``None``, which
  126. will use the value passed to our :py:meth:`__init__`.
  127. .. note::
  128. Be careful when iterating over very large categories with no limit.
  129. If using the API, at best, you will make one query per 5000 pages,
  130. which can add up significantly for categories with hundreds of
  131. thousands of members. As for SQL, note that *all page titles are
  132. stored internally* as soon as the query is made, so the site-wide
  133. SQL lock can be freed and unrelated queries can be made without
  134. requiring a separate connection to be opened. This is generally not
  135. an issue unless your category's size approaches several hundred
  136. thousand, in which case the sheer number of titles in memory becomes
  137. problematic.
  138. """
  139. if follow_redirects is None:
  140. follow_redirects = self._follow_redirects
  141. if use_sql:
  142. return self._get_members_via_sql(limit, follow_redirects)
  143. else:
  144. return self._get_members_via_api(limit, follow_redirects)