A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

464 rindas
19 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.tasks import Task
  25. from earwigbot.wiki import constants
  26. class WikiProjectTagger(Task):
  27. """A task to tag talk pages with WikiProject banners.
  28. Usage: :command:`earwigbot -t wikiproject_tagger PATH
  29. --banner BANNER (--category CAT | --file FILE) [--summary SUM] [--update]
  30. [--append PARAMS] [--autoassess [CLASSES]] [--only-with BANNER]
  31. [--nocreate] [--recursive [NUM]] [--site SITE] [--dry-run]`
  32. .. glossary::
  33. ``--banner BANNER``
  34. the page name of the banner to add, without a namespace (unless the
  35. namespace is something other than ``Template``) so
  36. ``--banner "WikiProject Biography"`` for ``{{WikiProject Biography}}``
  37. ``--category CAT`` or ``--file FILE``
  38. determines which pages to tag; either all pages in a category (to
  39. include subcategories as well, see ``--recursive``) or all
  40. pages/categories in a file (utf-8 encoded and path relative to the
  41. current directory)
  42. ``--summary SUM``
  43. an optional edit summary to use; defaults to
  44. ``"Tagging with WikiProject banner {{BANNER}}."``
  45. ``--update``
  46. updates existing banners with new fields; should include at least one
  47. of ``--append`` or ``--autoassess`` to be useful
  48. ``--append PARAMS``
  49. optional comma-separated parameters to append to the banner (after an
  50. auto-assessment, if any); use syntax ``importance=low,taskforce=yes``
  51. to add ``|importance=low|taskforce=yes``
  52. ``--autoassess [CLASSES]``
  53. try to assess each article's class automatically based on the class of
  54. other banners on the same page; if CLASSES is given as a
  55. comma-separated list, only those classes will be auto-assessed
  56. ``--only-with BANNER``
  57. only tag pages that already have the given banner
  58. ``--nocreate``
  59. don't create new talk pages with just a banner if the page doesn't
  60. already exist
  61. ``--recursive NUM``
  62. recursively go through subcategories up to a maximum depth of ``NUM``,
  63. or if ``NUM`` isn't provided, go infinitely (this can be dangerous)
  64. ``--tag-categories``
  65. also tag category pages; will autoassess with ``|class=category`` if
  66. ``--autoassess`` is given
  67. ``--site SITE``
  68. the ID of the site to tag pages on, defaulting to the default site
  69. ``--dry-run``
  70. don't actually make any edits, just log the pages that would have been
  71. edited
  72. """
  73. name = "wikiproject_tagger"
  74. # Regexes for template names that should always go above the banner, based
  75. # on [[Wikipedia:Talk page layout]]:
  76. TOP_TEMPS = [
  77. r"skip ?to ?(toc|talk|toctalk)$",
  78. r"ga ?nominee$",
  79. r"(user ?)?talk ?(header|page|page ?header)$",
  80. r"community ?article ?probation$",
  81. r"censor(-nudity)?$",
  82. r"blp(o| ?others?)?$",
  83. r"controvers(ial2?|y)$",
  84. r"(not ?(a ?)?)?forum$",
  85. r"tv(episode|series)talk$",
  86. r"recurring ?themes$",
  87. r"faq$",
  88. r"(round ?in ?)?circ(les|ular)$",
  89. r"ar(ti|it)cle ?(history|milestones)$",
  90. r"failed ?ga$",
  91. r"old ?prod( ?full)?$",
  92. r"(old|previous) ?afd$",
  93. ]
  94. @staticmethod
  95. def _upperfirst(text):
  96. """Try to uppercase the first letter of a string."""
  97. try:
  98. return text[0].upper() + text[1:]
  99. except IndexError:
  100. return text
  101. def run(self, **kwargs):
  102. """Main entry point for the bot task."""
  103. if "file" not in kwargs and "category" not in kwargs:
  104. log = "No pages to tag; I need either a 'category' or a 'file' passed as kwargs"
  105. self.logger.error(log)
  106. return
  107. if "banner" not in kwargs:
  108. log = "Needs a banner to add passed as the 'banner' kwarg"
  109. self.logger.error(log)
  110. return
  111. site = self.bot.wiki.get_site(name=kwargs.get("site"))
  112. banner = kwargs["banner"]
  113. summary = kwargs.get("summary", "Tagging with WikiProject banner $3.")
  114. update = kwargs.get("update", False)
  115. append = kwargs.get("append")
  116. autoassess = kwargs.get("autoassess", False)
  117. ow_banner = kwargs.get("only-with")
  118. nocreate = kwargs.get("nocreate", False)
  119. recursive = kwargs.get("recursive", 0)
  120. tag_categories = kwargs.get("tag-categories", False)
  121. dry_run = kwargs.get("dry-run", False)
  122. banner, names = self.get_names(site, banner)
  123. if not names:
  124. return
  125. if ow_banner:
  126. _, only_with = self.get_names(site, ow_banner)
  127. if not only_with:
  128. return
  129. else:
  130. only_with = None
  131. job = _Job(banner=banner, names=names, summary=summary, update=update,
  132. append=append, autoassess=autoassess, only_with=only_with,
  133. nocreate=nocreate, tag_categories=tag_categories,
  134. dry_run=dry_run)
  135. try:
  136. self.run_job(kwargs, site, job, recursive)
  137. except _ShutoffEnabled:
  138. return
  139. def run_job(self, kwargs, site, job, recursive):
  140. """Run a tagging *job* on a given *site*."""
  141. if "category" in kwargs:
  142. title = kwargs["category"]
  143. title = self.guess_namespace(site, title, constants.NS_CATEGORY)
  144. self.process_category(site.get_page(title), job, recursive)
  145. if "file" in kwargs:
  146. with open(kwargs["file"], "r") as fileobj:
  147. for line in fileobj:
  148. if line.strip():
  149. line = line.decode("utf8")
  150. if line.startswith("[[") and line.endswith("]]"):
  151. line = line[2:-2]
  152. page = site.get_page(line)
  153. if page.namespace == constants.NS_CATEGORY:
  154. self.process_category(page, job, recursive)
  155. else:
  156. self.process_page(page, job)
  157. def guess_namespace(self, site, title, assumed):
  158. """If the given *title* does not have an explicit namespace, guess it.
  159. For example, when transcluding templates, the namespace is guessed to
  160. be ``NS_TEMPLATE`` unless one is explicitly declared (so ``{{foo}}`` ->
  161. ``[[Template:Foo]]``, but ``{{:foo}}`` -> ``[[Foo]]``).
  162. """
  163. prefix = title.split(":", 1)[0]
  164. if prefix == title:
  165. return u":".join((site.namespace_id_to_name(assumed), title))
  166. try:
  167. site.namespace_name_to_id(prefix)
  168. except exceptions.NamespaceNotFoundError:
  169. return u":".join((site.namespace_id_to_name(assumed), title))
  170. return title
  171. def get_names(self, site, banner):
  172. """Return all possible aliases for a given *banner* template."""
  173. title = self.guess_namespace(site, banner, constants.NS_TEMPLATE)
  174. if title == banner:
  175. banner = banner.split(":", 1)[1]
  176. page = site.get_page(title)
  177. if page.exists != page.PAGE_EXISTS:
  178. self.logger.error(u"Banner [[%s]] does not exist", title)
  179. return banner, None
  180. names = {banner, title}
  181. result = site.api_query(action="query", list="backlinks", bllimit=500,
  182. blfilterredir="redirects", bltitle=title)
  183. for backlink in result["query"]["backlinks"]:
  184. names.add(backlink["title"])
  185. if backlink["ns"] == constants.NS_TEMPLATE:
  186. names.add(backlink["title"].split(":", 1)[1])
  187. log = u"Found %s aliases for banner [[%s]]"
  188. self.logger.debug(log, len(names), title)
  189. return banner, names
  190. def process_category(self, page, job, recursive):
  191. """Try to tag all pages in the given category."""
  192. if page.title in job.processed_cats:
  193. self.logger.debug(u"Skipping category, already processed: [[%s]]",
  194. page.title)
  195. return
  196. self.logger.info(u"Processing category: [[%s]]", page.title)
  197. job.processed_cats.add(page.title)
  198. if job.tag_categories:
  199. self.process_page(page, job, is_category=True)
  200. for member in page.get_members():
  201. if member.namespace == constants.NS_CATEGORY:
  202. if recursive is True:
  203. self.process_category(member, job, True)
  204. elif recursive > 0:
  205. self.process_category(member, job, recursive - 1)
  206. elif job.tag_categories:
  207. self.process_page(member, job, is_category=True)
  208. else:
  209. self.process_page(member, job)
  210. def process_page(self, page, job, is_category=False):
  211. """Try to tag a specific *page* using the *job* description."""
  212. if not page.is_talkpage:
  213. page = page.toggle_talk()
  214. if page.title in job.processed_pages:
  215. self.logger.debug(u"Skipping page, already processed: [[%s]]",
  216. page.title)
  217. return
  218. job.processed_pages.add(page.title)
  219. if job.counter % 10 == 0: # Do a shutoff check every ten pages
  220. if self.shutoff_enabled(page.site):
  221. raise _ShutoffEnabled()
  222. job.counter += 1
  223. try:
  224. code = page.parse()
  225. except exceptions.PageNotFoundError:
  226. self.process_new_page(page, job)
  227. return
  228. except exceptions.InvalidPageError:
  229. self.logger.error(u"Skipping invalid page: [[%s]]", page.title)
  230. return
  231. is_update = False
  232. for template in code.ifilter_templates(recursive=True):
  233. if template.name.matches(job.names):
  234. if job.update:
  235. banner = template
  236. is_update = True
  237. break
  238. else:
  239. log = u"Skipping page: [[%s]]; already tagged with '%s'"
  240. self.logger.info(log, page.title, template.name)
  241. return
  242. if job.only_with:
  243. if not any(template.name.matches(job.only_with)
  244. for template in code.ifilter_templates(recursive=True)):
  245. log = u"Skipping page: [[%s]]; fails only-with condition"
  246. self.logger.info(log, page.title)
  247. return
  248. if is_update:
  249. old_banner = unicode(banner)
  250. self.update_banner(banner, job, code, is_category=is_category)
  251. if banner == old_banner:
  252. log = u"Skipping page: [[%s]]; already tagged and no updates"
  253. self.logger.info(log, page.title)
  254. return
  255. self.logger.info(u"Updating banner on page: [[%s]]", page.title)
  256. banner = banner.encode("utf8")
  257. else:
  258. self.logger.info(u"Tagging page: [[%s]]", page.title)
  259. banner = self.make_banner(job, code, is_category=is_category)
  260. shell = self.get_banner_shell(code)
  261. if shell:
  262. self.add_banner_to_shell(shell, banner)
  263. else:
  264. self.add_banner(code, banner)
  265. self.save_page(page, job, unicode(code), banner)
  266. def process_new_page(self, page, job):
  267. """Try to tag a *page* that doesn't exist yet using the *job*."""
  268. if job.nocreate or job.only_with:
  269. log = u"Skipping nonexistent page: [[%s]]"
  270. self.logger.info(log, page.title)
  271. else:
  272. self.logger.info(u"Tagging new page: [[%s]]", page.title)
  273. banner = self.make_banner(job)
  274. self.save_page(page, job, banner, banner)
  275. def save_page(self, page, job, text, banner):
  276. """Save a page with an updated banner."""
  277. if job.dry_run:
  278. self.logger.debug(u"[DRY RUN] Banner: %s", banner)
  279. else:
  280. summary = job.summary.replace("$3", banner)
  281. page.edit(text, self.make_summary(summary), minor=True)
  282. def make_banner(self, job, code=None, is_category=False):
  283. """Return banner text to add based on a *job* and a page's *code*."""
  284. banner = job.banner
  285. if code is not None and job.autoassess is not False:
  286. assess, reason = self.get_autoassessment(
  287. code, job.autoassess, is_category=is_category)
  288. if assess:
  289. banner += "|class=" + assess
  290. if reason:
  291. banner += "|auto=" + reason
  292. if job.append:
  293. banner += "|" + "|".join(job.append.split(","))
  294. return "{{" + banner + "}}"
  295. def update_banner(self, banner, job, code, is_category=False):
  296. """Update an existing *banner* based on a *job* and a page's *code*."""
  297. has = lambda key: (banner.has(key) and
  298. banner.get(key).value.strip() not in ("", "?"))
  299. if job.autoassess is not False:
  300. if not has("class"):
  301. assess, reason = self.get_autoassessment(
  302. code, job.autoassess, is_category=is_category)
  303. if assess:
  304. banner.add("class", assess)
  305. if reason:
  306. banner.add("auto", reason)
  307. if job.append:
  308. for param in job.append.split(","):
  309. key, value = param.split("=", 1)
  310. if not has(key):
  311. banner.add(key, value)
  312. def get_autoassessment(self, code, only_classes=None, is_category=False):
  313. """Get an autoassessment for a page.
  314. Return (assessed class as a string or None, assessment reason or None).
  315. """
  316. if only_classes is None:
  317. classnames = ["a", "b", "book", "c", "category", "dab", "fa",
  318. "fl", "ga", "list", "redirect", "start", "stub",
  319. "template"]
  320. else:
  321. classnames = [klass.strip().lower()
  322. for klass in only_classes.split(",")]
  323. if is_category:
  324. return ("category" if "category" in classnames else None), None
  325. classes = {klass: 0 for klass in classnames}
  326. for template in code.ifilter_templates(recursive=True):
  327. if template.has("class"):
  328. value = unicode(template.get("class").value).lower()
  329. if value in classes:
  330. classes[value] += 1
  331. values = tuple(classes.values())
  332. best = max(values)
  333. if best:
  334. confidence = float(best) / sum(values)
  335. if confidence > 0.75:
  336. rank = tuple(classes.keys())[values.index(best)]
  337. if rank in ("fa", "fl", "ga"):
  338. return rank.upper(), "inherit"
  339. else:
  340. return self._upperfirst(rank), "inherit"
  341. return None, None
  342. def get_banner_shell(self, code):
  343. """Return the banner shell template within *code*, else ``None``."""
  344. regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)"
  345. shells = code.filter_templates(matches=regex)
  346. if not shells:
  347. shells = code.filter_templates(matches=regex, recursive=True)
  348. if shells:
  349. log = u"Inserting banner into shell: %s"
  350. self.logger.debug(log, shells[0].name)
  351. return shells[0]
  352. def add_banner_to_shell(self, shell, banner):
  353. """Add *banner* to *shell*."""
  354. if shell.has_param(1):
  355. if unicode(shell.get(1).value).endswith("\n"):
  356. banner += "\n"
  357. else:
  358. banner = "\n" + banner
  359. shell.get(1).value.append(banner)
  360. else:
  361. shell.add(1, banner)
  362. def add_banner(self, code, banner):
  363. """Add *banner* to *code*, following template order conventions."""
  364. predecessor = None
  365. for template in code.ifilter_templates(recursive=False):
  366. name = template.name.lower().replace("_", " ")
  367. for regex in self.TOP_TEMPS:
  368. if re.match(regex, name):
  369. self.logger.debug(u"Skipping past top template: %s", name)
  370. predecessor = template
  371. break
  372. if "wikiproject" in name or name.startswith("wp"):
  373. self.logger.debug(u"Skipping past banner template: %s", name)
  374. predecessor = template
  375. if predecessor:
  376. self.logger.debug("Inserting banner after template")
  377. if not unicode(predecessor).endswith("\n"):
  378. banner = "\n" + banner
  379. post = code.index(predecessor) + 1
  380. if len(code.nodes) > post and not code.get(post).startswith("\n"):
  381. banner += "\n"
  382. code.insert_after(predecessor, banner)
  383. else:
  384. self.logger.debug("Inserting banner at beginning")
  385. code.insert(0, banner + "\n")
  386. class _Job(object):
  387. """Represents a single wikiproject-tagging task.
  388. Stores information on the banner to add, the edit summary to use, whether
  389. or not to autoassess and create new pages from scratch, and a counter of
  390. the number of pages edited.
  391. """
  392. def __init__(self, **kwargs):
  393. self.banner = kwargs["banner"]
  394. self.names = kwargs["names"]
  395. self.summary = kwargs["summary"]
  396. self.update = kwargs["update"]
  397. self.append = kwargs["append"]
  398. self.autoassess = kwargs["autoassess"]
  399. self.only_with = kwargs["only_with"]
  400. self.nocreate = kwargs["nocreate"]
  401. self.tag_categories = kwargs["tag_categories"]
  402. self.dry_run = kwargs["dry_run"]
  403. self.counter = 0
  404. self.processed_cats = set()
  405. self.processed_pages = set()
  406. class _ShutoffEnabled(Exception):
  407. """Raised by process_page() if shutoff is enabled. Caught by run(), which
  408. will then stop the task."""
  409. pass