A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

wikiproject_tagger.py 13 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.tasks import Task
  25. from earwigbot.wiki import constants
  26. class WikiProjectTagger(Task):
  27. """A task to tag talk pages with WikiProject banners.
  28. Usage: :command:`earwigbot -t wikiproject_tagger PATH
  29. --banner BANNER (--category CAT | --file FILE) [--summary SUM]
  30. [--append TEXT] [--autoassess] [--nocreate] [--recursive NUM]
  31. [--site SITE]`
  32. .. glossary::
  33. ``--banner BANNER``
  34. the page name of the banner to add, without a namespace (unless the
  35. namespace is something other than ``Template``) so
  36. ``--banner WikiProject Biography`` for ``{{WikiProject Biography}}``
  37. ``--category CAT`` or ``--file FILE``
  38. determines which pages to tag; either all pages in a category (to
  39. include subcategories as well, see ``--recursive``) or all
  40. pages/categories in a file (utf-8 encoded and path relative to the
  41. current directory)
  42. ``--summary SUM``
  43. an optional edit summary to use; defaults to
  44. ``"Adding WikiProject banner {{BANNER}}."``
  45. ``--append TEXT``
  46. optional text to append to the banner (after an autoassessment, if
  47. any), like ``|importance=low``
  48. ``--autoassess``
  49. try to assess each article's class automatically based on the class of
  50. other banners on the same page
  51. ``--nocreate``
  52. don't create new talk pages with just a banner if the page doesn't
  53. already exist
  54. ``--recursive NUM``
  55. recursively go through subcategories up to a maximum depth of ``NUM``,
  56. or if ``NUM`` isn't provided, go infinitely (this can be dangerous)
  57. ``--site SITE``
  58. the ID of the site to tag pages on, defaulting to the... default site
  59. """
  60. name = "wikiproject_tagger"
  61. # Regexes for template names that should always go above the banner, based
  62. # on [[Wikipedia:Talk page layout]]:
  63. TOP_TEMPS = [
  64. r"skip ?to ?(toc|talk|toctalk)$",
  65. r"ga ?nominee$",
  66. r"(user ?)?talk ?(header|page|page ?header)$",
  67. r"community ?article ?probation$",
  68. r"censor(-nudity)?$",
  69. r"blp(o| ?others?)?$",
  70. r"controvers(ial2?|y)$",
  71. r"(not ?(a ?)?)?forum$",
  72. r"tv(episode|series)talk$",
  73. r"recurring ?themes$",
  74. r"faq$",
  75. r"(round ?in ?)?circ(les|ular)$",
  76. r"ar(ti|it)cle ?(history|milestones)$",
  77. r"failed ?ga$",
  78. r"old ?prod( ?full)?$",
  79. r"(old|previous) ?afd$",
  80. r"((wikiproject|wp) ?)?bio(graph(y|ies))?$",
  81. ]
  82. def _upperfirst(self, text):
  83. """Try to uppercase the first letter of a string."""
  84. try:
  85. return text[0].upper() + text[1:]
  86. except IndexError:
  87. return text
  88. def run(self, **kwargs):
  89. """Main entry point for the bot task."""
  90. if "file" not in kwargs and "category" not in kwargs:
  91. log = "No pages to tag; I need either a 'category' or a 'file' passed as kwargs"
  92. self.logger.error(log)
  93. return
  94. if "banner" not in kwargs:
  95. log = "Needs a banner to add passed as the 'banner' kwarg"
  96. self.logger.error(log)
  97. return
  98. site = self.bot.wiki.get_site(name=kwargs.get("site"))
  99. banner = kwargs["banner"]
  100. summary = kwargs.get("summary", "Adding WikiProject banner $3.")
  101. append = kwargs.get("append")
  102. autoassess = kwargs.get("autoassess", False)
  103. nocreate = kwargs.get("nocreate", False)
  104. recursive = kwargs.get("recursive", 0)
  105. banner, names = self.get_names(site, banner)
  106. if not names:
  107. return
  108. job = _Job(banner, names, summary, append, autoassess, nocreate)
  109. try:
  110. self.run_job(kwargs, site, job, recursive)
  111. except _ShutoffEnabled:
  112. return
  113. def run_job(self, kwargs, site, job, recursive):
  114. """Run a tagging *job* on a given *site*."""
  115. if "category" in kwargs:
  116. title = kwargs["category"]
  117. title = self.guess_namespace(site, title, constants.NS_CATEGORY)
  118. self.process_category(site.get_page(title), job, recursive)
  119. if "file" in kwargs:
  120. with open(kwargs["file"], "r") as fileobj:
  121. for line in fileobj:
  122. if line.strip():
  123. line = line.decode("utf8")
  124. if line.startswith("[[") and line.endswith("]]"):
  125. line = line[2:-2]
  126. page = site.get_page(line)
  127. if page.namespace == constants.NS_CATEGORY:
  128. self.process_category(page, job, recursive)
  129. else:
  130. self.process_page(page, job)
  131. def guess_namespace(self, site, title, assumed):
  132. """If the given *title* does not have an explicit namespace, guess it.
  133. For example, when transcluding templates, the namespace is guessed to
  134. be ``NS_TEMPLATE`` unless one is explicitly declared (so ``{{foo}}`` ->
  135. ``[[Template:Foo]]``, but ``{{:foo}}`` -> ``[[Foo]]``).
  136. """
  137. prefix = title.split(":", 1)[0]
  138. if prefix == title:
  139. return u":".join((site.namespace_id_to_name(assumed), title))
  140. try:
  141. site.namespace_name_to_id(prefix)
  142. except exceptions.NamespaceNotFoundError:
  143. return u":".join((site.namespace_id_to_name(assumed), title))
  144. return title
  145. def get_names(self, site, banner):
  146. """Return all possible aliases for a given *banner* template."""
  147. title = self.guess_namespace(site, banner, constants.NS_TEMPLATE)
  148. if title == banner:
  149. banner = banner.split(":", 1)[1]
  150. page = site.get_page(title)
  151. if page.exists != page.PAGE_EXISTS:
  152. self.logger.error(u"Banner [[{0}]] does not exist".format(title))
  153. return banner, None
  154. if banner == title:
  155. names = [self._upperfirst(banner)]
  156. else:
  157. names = [self._upperfirst(banner), self._upperfirst(title)]
  158. result = site.api_query(action="query", list="backlinks", bllimit=500,
  159. blfilterredir="redirects", bltitle=title)
  160. for backlink in result["query"]["backlinks"]:
  161. names.append(backlink["title"])
  162. if backlink["ns"] == constants.NS_TEMPLATE:
  163. names.append(backlink["title"].split(":", 1)[1])
  164. log = u"Found {0} aliases for banner [[{1}]]".format(len(names), title)
  165. self.logger.debug(log)
  166. return banner, names
  167. def process_category(self, page, job, recursive):
  168. """Try to tag all pages in the given category."""
  169. self.logger.info(u"Processing category: [[{0]]".format(page.title))
  170. for member in page.get_members():
  171. if member.namespace == constants.NS_CATEGORY:
  172. if recursive is True:
  173. self.process_category(member, job, True)
  174. elif recursive:
  175. self.process_category(member, job, recursive - 1)
  176. else:
  177. self.process_page(member, job)
  178. def process_page(self, page, job):
  179. """Try to tag a specific *page* using the *job* description."""
  180. if job.counter % 10 == 0: # Do a shutoff check every ten pages
  181. if self.shutoff_enabled(page.site):
  182. raise _ShutoffEnabled()
  183. job.counter += 1
  184. if not page.is_talkpage:
  185. page = page.toggle_talk()
  186. try:
  187. code = page.parse()
  188. except exceptions.PageNotFoundError:
  189. if job.nocreate:
  190. log = u"Skipping nonexistent page: [[{0}]]".format(page.title)
  191. self.logger.info(log)
  192. else:
  193. log = u"Tagging new page: [[{0}]]".format(page.title)
  194. self.logger.info(log)
  195. banner = "{{" + job.banner + job.append + "}}"
  196. summary = job.summary.replace("$3", banner)
  197. page.edit(banner, self.make_summary(summary))
  198. return
  199. except exceptions.InvalidPageError:
  200. log = u"Skipping invalid page: [[{0}]]".format(page.title)
  201. self.logger.error(log)
  202. return
  203. for template in code.ifilter_templates(recursive=True):
  204. name = self._upperfirst(template.name.strip())
  205. if name in job.names:
  206. log = u"Skipping page: [[{0}]]; already tagged with '{1}'"
  207. self.logger.info(log.format(page.title, name))
  208. return
  209. banner = self.make_banner(job, code)
  210. shell = self.get_banner_shell(code)
  211. if shell:
  212. if shell.has_param(1):
  213. shell.get(1).value.insert(0, banner + "\n")
  214. else:
  215. shell.add(1, banner)
  216. else:
  217. self.add_banner(code, banner)
  218. self.apply_genfixes(code)
  219. self.logger.info(u"Tagging page: [[{0}]]".format(page.title))
  220. summary = job.summary.replace("$3", banner)
  221. page.edit(unicode(code), self.make_summary(summary))
  222. def make_banner(self, job, code):
  223. """Return banner text to add based on a *job* and a page's *code*."""
  224. banner = "{{" + job.banner
  225. if job.autoassess:
  226. classes = {"fa": 0, "fl": 0, "ga": 0, "a": 0, "b": 0, "start": 0,
  227. "stub": 0, "list": 0, "dab": 0, "c": 0, "redirect": 0,
  228. "book": 0, "template": 0, "category": 0}
  229. for template in code.ifilter_templates(recursive=True):
  230. if template.has_param("class"):
  231. value = unicode(template.get("class").value).lower()
  232. if value in classes:
  233. classes[value] += 1
  234. values = tuple(classes.values())
  235. best = max(values)
  236. confidence = float(best) / sum(values)
  237. if confidence > 0.75:
  238. rank = tuple(classes.keys())[values.index(best)]
  239. if rank in ("fa", "fl", "ga"):
  240. banner += "|class=" + rank.upper()
  241. else:
  242. banner += "|class=" + self._upperfirst(rank)
  243. return banner + job.append + "}}"
  244. def get_banner_shell(self, code):
  245. """Return the banner shell template within *code*, else ``None``."""
  246. regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)"
  247. shells = code.filter_templates(matches=regex)
  248. if not shells:
  249. shells = code.filter_templates(matches=regex, recursive=True)
  250. if shells:
  251. log = u"Inserting banner into shell: {0}"
  252. self.logger.debug(log.format(shells[0].name))
  253. return shells[0]
  254. def add_banner(self, code, banner):
  255. """Add *banner* to *code*, following template order conventions."""
  256. index = 0
  257. for i, template in enumerate(code.ifilter_templates()):
  258. name = template.name.lower().replace("_", " ")
  259. for regex in self.TOP_TEMPS:
  260. if re.match(regex, name):
  261. self.logger.info("Skipping top template: {0}".format(name))
  262. index = i + 1
  263. self.logger.debug(u"Inserting banner at index {0}".format(index))
  264. code.insert(index, banner)
  265. def apply_genfixes(self, code):
  266. """Apply general fixes to *code*, such as template substitution."""
  267. regex = r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|forgot to sign|without signature)"
  268. for template in code.ifilter_templates(matches=regex):
  269. self.logger.debug("Applying genfix: substitute {{unsigned}}")
  270. template.name = "subst:unsigned"
  271. class _Job(object):
  272. """Represents a single wikiproject-tagging task.
  273. Stores information on the banner to add, the edit summary to use, whether
  274. or not to autoassess and create new pages from scratch, and a counter of
  275. the number of pages edited.
  276. """
  277. def __init__(self, banner, names, summary, append, autoassess, nocreate):
  278. self.banner = banner
  279. self.names = names
  280. self.summary = summary
  281. self.append = append
  282. self.autoassess = autoassess
  283. self.nocreate = nocreate
  284. self.counter = 0
  285. class _ShutoffEnabled(Exception):
  286. """Raised by process_page() if shutoff is enabled. Caught by run(), which
  287. will then stop the task."""
  288. pass