A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

330 lines
13 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.tasks import Task
  25. from earwigbot.wiki import constants
  26. class WikiProjectTagger(Task):
  27. """A task to tag talk pages with WikiProject banners.
  28. Usage: :command:`earwigbot -t wikiproject_tagger PATH
  29. --banner BANNER (--category CAT | --file FILE) [--summary SUM]
  30. [--append TEXT] [--autoassess] [--nocreate] [--recursive NUM]
  31. [--site SITE]`
  32. .. glossary::
  33. ``--banner BANNER``
  34. the page name of the banner to add, without a namespace (unless the
  35. namespace is something other than ``Template``) so
  36. ``--banner WikiProject Biography`` for ``{{WikiProject Biography}}``
  37. ``--category CAT`` or ``--file FILE``
  38. determines which pages to tag; either all pages in a category (to
  39. include subcategories as well, see ``--recursive``) or all
  40. pages/categories in a file (utf-8 encoded and path relative to the
  41. current directory)
  42. ``--summary SUM``
  43. an optional edit summary to use; defaults to
  44. ``"Adding WikiProject banner {{BANNER}}."``
  45. ``--append TEXT``
  46. optional text to append to the banner (after an autoassessment, if
  47. any), like ``|importance=low``
  48. ``--autoassess``
  49. try to assess each article's class automatically based on the class of
  50. other banners on the same page
  51. ``--nocreate``
  52. don't create new talk pages with just a banner if the page doesn't
  53. already exist
  54. ``--recursive NUM``
  55. recursively go through subcategories up to a maximum depth of ``NUM``,
  56. or if ``NUM`` isn't provided, go infinitely (this can be dangerous)
  57. ``--site SITE``
  58. the ID of the site to tag pages on, defaulting to the... default site
  59. """
  60. name = "wikiproject_tagger"
  61. # Regexes for template names that should always go above the banner, based
  62. # on [[Wikipedia:Talk page layout]]:
  63. TOP_TEMPS = [
  64. r"skip ?to ?(toc|talk|toctalk)$",
  65. r"ga ?nominee$",
  66. r"(user ?)?talk ?(header|page|page ?header)$",
  67. r"community ?article ?probation$",
  68. r"censor(-nudity)?$",
  69. r"blp(o| ?others?)?$",
  70. r"controvers(ial2?|y)$",
  71. r"(not ?(a ?)?)?forum$",
  72. r"tv(episode|series)talk$",
  73. r"recurring ?themes$",
  74. r"faq$",
  75. r"(round ?in ?)?circ(les|ular)$",
  76. r"ar(ti|it)cle ?(history|milestones)$",
  77. r"failed ?ga$",
  78. r"old ?prod( ?full)?$",
  79. r"(old|previous) ?afd$",
  80. r"((wikiproject|wp) ?)?bio(graph(y|ies))?$",
  81. ]
  82. def _upperfirst(self, text):
  83. """Try to uppercase the first letter of a string."""
  84. try:
  85. return text[0].upper() + text[1:]
  86. except IndexError:
  87. return text
  88. def run(self, **kwargs):
  89. """Main entry point for the bot task."""
  90. if "file" not in kwargs and "category" not in kwargs:
  91. log = "No pages to tag; I need either a 'category' or a 'file' passed as kwargs"
  92. self.logger.error(log)
  93. return
  94. if "banner" not in kwargs:
  95. log = "Needs a banner to add passed as the 'banner' kwarg"
  96. self.logger.error(log)
  97. return
  98. site = self.bot.wiki.get_site(name=kwargs.get("site"))
  99. banner = kwargs["banner"]
  100. summary = kwargs.get("summary", "Adding WikiProject banner $3.")
  101. append = kwargs.get("append")
  102. autoassess = kwargs.get("autoassess", False)
  103. nocreate = kwargs.get("nocreate", False)
  104. recursive = kwargs.get("recursive", 0)
  105. banner, names = self.get_names(site, banner)
  106. if not names:
  107. return
  108. job = _Job(banner, names, summary, append, autoassess, nocreate)
  109. try:
  110. self.run_job(kwargs, site, job, recursive)
  111. except _ShutoffEnabled:
  112. return
  113. def run_job(self, kwargs, site, job, recursive):
  114. """Run a tagging *job* on a given *site*."""
  115. if "category" in kwargs:
  116. title = kwargs["category"]
  117. title = self.guess_namespace(site, title, constants.NS_CATEGORY)
  118. self.process_category(site.get_page(title), job, recursive)
  119. if "file" in kwargs:
  120. with open(kwargs["file"], "r") as fileobj:
  121. for line in fileobj:
  122. if line.strip():
  123. line = line.decode("utf8")
  124. if line.startswith("[[") and line.endswith("]]"):
  125. line = line[2:-2]
  126. page = site.get_page(line)
  127. if page.namespace == constants.NS_CATEGORY:
  128. self.process_category(page, job, recursive)
  129. else:
  130. self.process_page(page, job)
  131. def guess_namespace(self, site, title, assumed):
  132. """If the given *title* does not have an explicit namespace, guess it.
  133. For example, when transcluding templates, the namespace is guessed to
  134. be ``NS_TEMPLATE`` unless one is explicitly declared (so ``{{foo}}`` ->
  135. ``[[Template:Foo]]``, but ``{{:foo}}`` -> ``[[Foo]]``).
  136. """
  137. prefix = title.split(":", 1)[0]
  138. if prefix == title:
  139. return u":".join((site.namespace_id_to_name(assumed), title))
  140. try:
  141. site.namespace_name_to_id(prefix)
  142. except exceptions.NamespaceNotFoundError:
  143. return u":".join((site.namespace_id_to_name(assumed), title))
  144. return title
  145. def get_names(self, site, banner):
  146. """Return all possible aliases for a given *banner* template."""
  147. title = self.guess_namespace(site, banner, constants.NS_TEMPLATE)
  148. if title == banner:
  149. banner = banner.split(":", 1)[1]
  150. page = site.get_page(title)
  151. if page.exists != page.PAGE_EXISTS:
  152. self.logger.error(u"Banner [[{0}]] does not exist".format(title))
  153. return banner, None
  154. if banner == title:
  155. names = [self._upperfirst(banner)]
  156. else:
  157. names = [self._upperfirst(banner), self._upperfirst(title)]
  158. result = site.api_query(action="query", list="backlinks", bllimit=500,
  159. blfilterredir="redirects", bltitle=title)
  160. for backlink in result["query"]["backlinks"]:
  161. names.append(backlink["title"])
  162. if backlink["ns"] == constants.NS_TEMPLATE:
  163. names.append(backlink["title"].split(":", 1)[1])
  164. log = u"Found {0} aliases for banner [[{1}]]".format(len(names), title)
  165. self.logger.debug(log)
  166. return banner, names
  167. def process_category(self, page, job, recursive):
  168. """Try to tag all pages in the given category."""
  169. self.logger.info(u"Processing category: [[{0]]".format(page.title))
  170. for member in page.get_members():
  171. if member.namespace == constants.NS_CATEGORY:
  172. if recursive is True:
  173. self.process_category(member, job, True)
  174. elif recursive:
  175. self.process_category(member, job, recursive - 1)
  176. else:
  177. self.process_page(member, job)
  178. def process_page(self, page, job):
  179. """Try to tag a specific *page* using the *job* description."""
  180. if job.counter % 10 == 0: # Do a shutoff check every ten pages
  181. if self.shutoff_enabled(page.site):
  182. raise _ShutoffEnabled()
  183. job.counter += 1
  184. if not page.is_talkpage:
  185. page = page.toggle_talk()
  186. try:
  187. code = page.parse()
  188. except exceptions.PageNotFoundError:
  189. if job.nocreate:
  190. log = u"Skipping nonexistent page: [[{0}]]".format(page.title)
  191. self.logger.info(log)
  192. else:
  193. log = u"Tagging new page: [[{0}]]".format(page.title)
  194. self.logger.info(log)
  195. banner = "{{" + job.banner + job.append + "}}"
  196. summary = job.summary.replace("$3", banner)
  197. page.edit(banner, self.make_summary(summary))
  198. return
  199. except exceptions.InvalidPageError:
  200. log = u"Skipping invalid page: [[{0}]]".format(page.title)
  201. self.logger.error(log)
  202. return
  203. for template in code.ifilter_templates(recursive=True):
  204. name = self._upperfirst(template.name.strip())
  205. if name in job.names:
  206. log = u"Skipping page: [[{0}]]; already tagged with '{1}'"
  207. self.logger.info(log.format(page.title, name))
  208. return
  209. banner = self.make_banner(job, code)
  210. shell = self.get_banner_shell(code)
  211. if shell:
  212. if shell.has_param(1):
  213. shell.get(1).value.insert(0, banner + "\n")
  214. else:
  215. shell.add(1, banner)
  216. else:
  217. self.add_banner(code, banner)
  218. self.apply_genfixes(code)
  219. self.logger.info(u"Tagging page: [[{0}]]".format(page.title))
  220. summary = job.summary.replace("$3", banner)
  221. page.edit(unicode(code), self.make_summary(summary))
  222. def make_banner(self, job, code):
  223. """Return banner text to add based on a *job* and a page's *code*."""
  224. banner = "{{" + job.banner
  225. if job.autoassess:
  226. classes = {"fa": 0, "fl": 0, "ga": 0, "a": 0, "b": 0, "start": 0,
  227. "stub": 0, "list": 0, "dab": 0, "c": 0, "redirect": 0,
  228. "book": 0, "template": 0, "category": 0}
  229. for template in code.ifilter_templates(recursive=True):
  230. if template.has_param("class"):
  231. value = unicode(template.get("class").value).lower()
  232. if value in classes:
  233. classes[value] += 1
  234. values = tuple(classes.values())
  235. best = max(values)
  236. confidence = float(best) / sum(values)
  237. if confidence > 0.75:
  238. rank = tuple(classes.keys())[values.index(best)]
  239. if rank in ("fa", "fl", "ga"):
  240. banner += "|class=" + rank.upper()
  241. else:
  242. banner += "|class=" + self._upperfirst(rank)
  243. return banner + job.append + "}}"
  244. def get_banner_shell(self, code):
  245. """Return the banner shell template within *code*, else ``None``."""
  246. regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)"
  247. shells = code.filter_templates(matches=regex)
  248. if not shells:
  249. shells = code.filter_templates(matches=regex, recursive=True)
  250. if shells:
  251. log = u"Inserting banner into shell: {0}"
  252. self.logger.debug(log.format(shells[0].name))
  253. return shells[0]
  254. def add_banner(self, code, banner):
  255. """Add *banner* to *code*, following template order conventions."""
  256. index = 0
  257. for i, template in enumerate(code.ifilter_templates()):
  258. name = template.name.lower().replace("_", " ")
  259. for regex in self.TOP_TEMPS:
  260. if re.match(regex, name):
  261. self.logger.info("Skipping top template: {0}".format(name))
  262. index = i + 1
  263. self.logger.debug(u"Inserting banner at index {0}".format(index))
  264. code.insert(index, banner)
  265. def apply_genfixes(self, code):
  266. """Apply general fixes to *code*, such as template substitution."""
  267. regex = r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|forgot to sign|without signature)"
  268. for template in code.ifilter_templates(matches=regex):
  269. self.logger.debug("Applying genfix: substitute {{unsigned}}")
  270. template.name = "subst:unsigned"
  271. class _Job(object):
  272. """Represents a single wikiproject-tagging task.
  273. Stores information on the banner to add, the edit summary to use, whether
  274. or not to autoassess and create new pages from scratch, and a counter of
  275. the number of pages edited.
  276. """
  277. def __init__(self, banner, names, summary, append, autoassess, nocreate):
  278. self.banner = banner
  279. self.names = names
  280. self.summary = summary
  281. self.append = append
  282. self.autoassess = autoassess
  283. self.nocreate = nocreate
  284. self.counter = 0
  285. class _ShutoffEnabled(Exception):
  286. """Raised by process_page() if shutoff is enabled. Caught by run(), which
  287. will then stop the task."""
  288. pass