A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 7 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 3 години
преди 3 години
преди 11 години
преди 11 години
преди 3 години
преди 3 години
преди 11 години
преди 3 години
преди 3 години
преди 7 години
преди 7 години
преди 7 години
преди 7 години
преди 7 години
преди 11 години
преди 3 години
преди 11 години
преди 3 години
преди 11 години
преди 3 години
преди 3 години
преди 11 години
преди 3 години
преди 7 години
преди 3 години
преди 3 години
преди 11 години
преди 3 години
преди 7 години
преди 7 години
преди 11 години
преди 3 години
преди 3 години
преди 3 години
преди 3 години
преди 7 години
преди 11 години
преди 7 години
преди 7 години
преди 7 години
преди 7 години
преди 6 години
преди 7 години
преди 3 години
преди 11 години
преди 11 години
преди 11 години
преди 11 години
преди 3 години
преди 11 години
преди 7 години
преди 3 години
преди 7 години
преди 11 години
преди 7 години
преди 11 години
преди 3 години
преди 7 години
преди 3 години
преди 11 години
преди 3 години
преди 7 години
преди 3 години
преди 11 години
преди 11 години
преди 11 години
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.tasks import Task
  25. from earwigbot.wiki import constants
  26. class WikiProjectTagger(Task):
  27. """A task to tag talk pages with WikiProject banners.
  28. Usage: :command:`earwigbot -t wikiproject_tagger PATH
  29. --banner BANNER (--category CAT | --file FILE) [--summary SUM] [--update]
  30. [--append PARAMS] [--autoassess [CLASSES]] [--only-with BANNER]
  31. [--nocreate] [--recursive [NUM]] [--site SITE] [--dry-run]`
  32. .. glossary::
  33. ``--banner BANNER``
  34. the page name of the banner to add, without a namespace (unless the
  35. namespace is something other than ``Template``) so
  36. ``--banner "WikiProject Biography"`` for ``{{WikiProject Biography}}``
  37. ``--category CAT`` or ``--file FILE``
  38. determines which pages to tag; either all pages in a category (to
  39. include subcategories as well, see ``--recursive``) or all
  40. pages/categories in a file (utf-8 encoded and path relative to the
  41. current directory)
  42. ``--summary SUM``
  43. an optional edit summary to use; defaults to
  44. ``"Tagging with WikiProject banner {{BANNER}}."``
  45. ``--update``
  46. updates existing banners with new fields; should include at least one
  47. of ``--append`` or ``--autoassess`` to be useful
  48. ``--append PARAMS``
  49. optional comma-separated parameters to append to the banner (after an
  50. auto-assessment, if any); use syntax ``importance=low,taskforce=yes``
  51. to add ``|importance=low|taskforce=yes``
  52. ``--autoassess [CLASSES]``
  53. try to assess each article's class automatically based on the class of
  54. other banners on the same page; if CLASSES is given as a
  55. comma-separated list, only those classes will be auto-assessed
  56. ``--only-with BANNER``
  57. only tag pages that already have the given banner
  58. ``--nocreate``
  59. don't create new talk pages with just a banner if the page doesn't
  60. already exist
  61. ``--recursive NUM``
  62. recursively go through subcategories up to a maximum depth of ``NUM``,
  63. or if ``NUM`` isn't provided, go infinitely (this can be dangerous)
  64. ``--tag-categories``
  65. also tag category pages
  66. ``--site SITE``
  67. the ID of the site to tag pages on, defaulting to the default site
  68. ``--dry-run``
  69. don't actually make any edits, just log the pages that would have been
  70. edited
  71. """
  72. name = "wikiproject_tagger"
  73. # Regexes for template names that should always go above the banner, based
  74. # on [[Wikipedia:Talk page layout]]:
  75. TOP_TEMPS = [
  76. r"skip ?to ?(toc|talk|toctalk)$",
  77. r"ga ?nominee$",
  78. r"(user ?)?talk ?(header|page|page ?header)$",
  79. r"community ?article ?probation$",
  80. r"censor(-nudity)?$",
  81. r"blp(o| ?others?)?$",
  82. r"controvers(ial2?|y)$",
  83. r"(not ?(a ?)?)?forum$",
  84. r"tv(episode|series)talk$",
  85. r"recurring ?themes$",
  86. r"faq$",
  87. r"(round ?in ?)?circ(les|ular)$",
  88. r"ar(ti|it)cle ?(history|milestones)$",
  89. r"failed ?ga$",
  90. r"old ?prod( ?full)?$",
  91. r"(old|previous) ?afd$",
  92. ]
  93. @staticmethod
  94. def _upperfirst(text):
  95. """Try to uppercase the first letter of a string."""
  96. try:
  97. return text[0].upper() + text[1:]
  98. except IndexError:
  99. return text
  100. def run(self, **kwargs):
  101. """Main entry point for the bot task."""
  102. if "file" not in kwargs and "category" not in kwargs:
  103. log = "No pages to tag; I need either a 'category' or a 'file' passed as kwargs"
  104. self.logger.error(log)
  105. return
  106. if "banner" not in kwargs:
  107. log = "Needs a banner to add passed as the 'banner' kwarg"
  108. self.logger.error(log)
  109. return
  110. site = self.bot.wiki.get_site(name=kwargs.get("site"))
  111. banner = kwargs["banner"]
  112. summary = kwargs.get("summary", "Tagging with WikiProject banner $3.")
  113. update = kwargs.get("update", False)
  114. append = kwargs.get("append")
  115. autoassess = kwargs.get("autoassess", False)
  116. ow_banner = kwargs.get("only-with")
  117. nocreate = kwargs.get("nocreate", False)
  118. recursive = kwargs.get("recursive", 0)
  119. tag_categories = kwargs.get("tag-categories", False)
  120. dry_run = kwargs.get("dry-run", False)
  121. banner, names = self.get_names(site, banner)
  122. if not names:
  123. return
  124. if ow_banner:
  125. _, only_with = self.get_names(site, ow_banner)
  126. if not only_with:
  127. return
  128. else:
  129. only_with = None
  130. job = _Job(banner=banner, names=names, summary=summary, update=update,
  131. append=append, autoassess=autoassess, only_with=only_with,
  132. nocreate=nocreate, tag_categories=tag_categories,
  133. dry_run=dry_run)
  134. try:
  135. self.run_job(kwargs, site, job, recursive)
  136. except _ShutoffEnabled:
  137. return
  138. def run_job(self, kwargs, site, job, recursive):
  139. """Run a tagging *job* on a given *site*."""
  140. if "category" in kwargs:
  141. title = kwargs["category"]
  142. title = self.guess_namespace(site, title, constants.NS_CATEGORY)
  143. self.process_category(site.get_page(title), job, recursive)
  144. if "file" in kwargs:
  145. with open(kwargs["file"], "r") as fileobj:
  146. for line in fileobj:
  147. if line.strip():
  148. line = line.decode("utf8")
  149. if line.startswith("[[") and line.endswith("]]"):
  150. line = line[2:-2]
  151. page = site.get_page(line)
  152. if page.namespace == constants.NS_CATEGORY:
  153. self.process_category(page, job, recursive)
  154. else:
  155. self.process_page(page, job)
  156. def guess_namespace(self, site, title, assumed):
  157. """If the given *title* does not have an explicit namespace, guess it.
  158. For example, when transcluding templates, the namespace is guessed to
  159. be ``NS_TEMPLATE`` unless one is explicitly declared (so ``{{foo}}`` ->
  160. ``[[Template:Foo]]``, but ``{{:foo}}`` -> ``[[Foo]]``).
  161. """
  162. prefix = title.split(":", 1)[0]
  163. if prefix == title:
  164. return ":".join((site.namespace_id_to_name(assumed), title))
  165. try:
  166. site.namespace_name_to_id(prefix)
  167. except exceptions.NamespaceNotFoundError:
  168. return ":".join((site.namespace_id_to_name(assumed), title))
  169. return title
  170. def get_names(self, site, banner):
  171. """Return all possible aliases for a given *banner* template."""
  172. title = self.guess_namespace(site, banner, constants.NS_TEMPLATE)
  173. if title == banner:
  174. banner = banner.split(":", 1)[1]
  175. page = site.get_page(title)
  176. if page.exists != page.PAGE_EXISTS:
  177. self.logger.error("Banner [[%s]] does not exist", title)
  178. return banner, None
  179. names = {banner, title}
  180. result = site.api_query(action="query", list="backlinks", bllimit=500,
  181. blfilterredir="redirects", bltitle=title)
  182. for backlink in result["query"]["backlinks"]:
  183. names.add(backlink["title"])
  184. if backlink["ns"] == constants.NS_TEMPLATE:
  185. names.add(backlink["title"].split(":", 1)[1])
  186. log = "Found %s aliases for banner [[%s]]"
  187. self.logger.debug(log, len(names), title)
  188. return banner, names
  189. def process_category(self, page, job, recursive):
  190. """Try to tag all pages in the given category."""
  191. if page.title in job.processed_cats:
  192. self.logger.debug("Skipping category, already processed: [[%s]]",
  193. page.title)
  194. return
  195. self.logger.info("Processing category: [[%s]]", page.title)
  196. job.processed_cats.add(page.title)
  197. if job.tag_categories:
  198. self.process_page(page, job)
  199. for member in page.get_members():
  200. nspace = member.namespace
  201. if nspace == constants.NS_CATEGORY:
  202. if recursive is True:
  203. self.process_category(member, job, True)
  204. elif recursive > 0:
  205. self.process_category(member, job, recursive - 1)
  206. elif job.tag_categories:
  207. self.process_page(member, job)
  208. elif nspace in (constants.NS_USER, constants.NS_USER_TALK):
  209. continue
  210. else:
  211. self.process_page(member, job)
  212. def process_page(self, page, job):
  213. """Try to tag a specific *page* using the *job* description."""
  214. if not page.is_talkpage:
  215. page = page.toggle_talk()
  216. if page.title in job.processed_pages:
  217. self.logger.debug("Skipping page, already processed: [[%s]]",
  218. page.title)
  219. return
  220. job.processed_pages.add(page.title)
  221. if job.counter % 10 == 0: # Do a shutoff check every ten pages
  222. if self.shutoff_enabled(page.site):
  223. raise _ShutoffEnabled()
  224. job.counter += 1
  225. try:
  226. code = page.parse()
  227. except exceptions.PageNotFoundError:
  228. self.process_new_page(page, job)
  229. return
  230. except exceptions.InvalidPageError:
  231. self.logger.error("Skipping invalid page: [[%s]]", page.title)
  232. return
  233. is_update = False
  234. for template in code.ifilter_templates(recursive=True):
  235. if template.name.matches(job.names):
  236. if job.update:
  237. banner = template
  238. is_update = True
  239. break
  240. else:
  241. log = "Skipping page: [[%s]]; already tagged with '%s'"
  242. self.logger.info(log, page.title, template.name)
  243. return
  244. if job.only_with:
  245. if not any(template.name.matches(job.only_with)
  246. for template in code.ifilter_templates(recursive=True)):
  247. log = "Skipping page: [[%s]]; fails only-with condition"
  248. self.logger.info(log, page.title)
  249. return
  250. if is_update:
  251. old_banner = str(banner)
  252. self.update_banner(banner, job, code)
  253. if banner == old_banner:
  254. log = "Skipping page: [[%s]]; already tagged and no updates"
  255. self.logger.info(log, page.title)
  256. return
  257. self.logger.info("Updating banner on page: [[%s]]", page.title)
  258. banner = banner.encode("utf8")
  259. else:
  260. self.logger.info("Tagging page: [[%s]]", page.title)
  261. banner = self.make_banner(job, code)
  262. shell = self.get_banner_shell(code)
  263. if shell:
  264. self.add_banner_to_shell(shell, banner)
  265. else:
  266. self.add_banner(code, banner)
  267. self.save_page(page, job, str(code), banner)
  268. def process_new_page(self, page, job):
  269. """Try to tag a *page* that doesn't exist yet using the *job*."""
  270. if job.nocreate or job.only_with:
  271. log = "Skipping nonexistent page: [[%s]]"
  272. self.logger.info(log, page.title)
  273. else:
  274. self.logger.info("Tagging new page: [[%s]]", page.title)
  275. banner = self.make_banner(job)
  276. self.save_page(page, job, banner, banner)
  277. def save_page(self, page, job, text, banner):
  278. """Save a page with an updated banner."""
  279. if job.dry_run:
  280. self.logger.debug("[DRY RUN] Banner: %s", banner)
  281. else:
  282. summary = job.summary.replace("$3", banner)
  283. page.edit(text, self.make_summary(summary), minor=True)
  284. def make_banner(self, job, code=None):
  285. """Return banner text to add based on a *job* and a page's *code*."""
  286. banner = job.banner
  287. if code is not None and job.autoassess is not False:
  288. assess, reason = self.get_autoassessment(code, job.autoassess)
  289. if assess:
  290. banner += "|class=" + assess
  291. if reason:
  292. banner += "|auto=" + reason
  293. if job.append:
  294. banner += "|" + "|".join(job.append.split(","))
  295. return "{{" + banner + "}}"
  296. def update_banner(self, banner, job, code):
  297. """Update an existing *banner* based on a *job* and a page's *code*."""
  298. has = lambda key: (banner.has(key) and
  299. banner.get(key).value.strip() not in ("", "?"))
  300. if job.autoassess is not False:
  301. if not has("class"):
  302. assess, reason = self.get_autoassessment(code, job.autoassess)
  303. if assess:
  304. banner.add("class", assess)
  305. if reason:
  306. banner.add("auto", reason)
  307. if job.append:
  308. for param in job.append.split(","):
  309. key, value = param.split("=", 1)
  310. if not has(key):
  311. banner.add(key, value)
  312. def get_autoassessment(self, code, only_classes=None):
  313. """Get an autoassessment for a page.
  314. Return (assessed class as a string or None, assessment reason or None).
  315. """
  316. if only_classes is None or only_classes is True:
  317. classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga",
  318. "list", "redirect", "start", "stub"]
  319. else:
  320. classnames = [klass.strip().lower()
  321. for klass in only_classes.split(",")]
  322. classes = {klass: 0 for klass in classnames}
  323. for template in code.ifilter_templates(recursive=True):
  324. if template.has("class"):
  325. value = str(template.get("class").value).lower()
  326. if value in classes:
  327. classes[value] += 1
  328. values = tuple(classes.values())
  329. best = max(values)
  330. if best:
  331. confidence = float(best) / sum(values)
  332. if confidence > 0.75:
  333. rank = tuple(classes.keys())[values.index(best)]
  334. if rank in ("fa", "fl", "ga"):
  335. return rank.upper(), "inherit"
  336. else:
  337. return self._upperfirst(rank), "inherit"
  338. return None, None
  339. def get_banner_shell(self, code):
  340. """Return the banner shell template within *code*, else ``None``."""
  341. regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)\s*(\||\}\})"
  342. shells = code.filter_templates(matches=regex)
  343. if not shells:
  344. shells = code.filter_templates(matches=regex, recursive=True)
  345. if shells:
  346. log = "Inserting banner into shell: %s"
  347. self.logger.debug(log, shells[0].name)
  348. return shells[0]
  349. def add_banner_to_shell(self, shell, banner):
  350. """Add *banner* to *shell*."""
  351. if shell.has_param(1):
  352. if str(shell.get(1).value).endswith("\n"):
  353. banner += "\n"
  354. else:
  355. banner = "\n" + banner
  356. shell.get(1).value.append(banner)
  357. else:
  358. shell.add(1, banner)
  359. def add_banner(self, code, banner):
  360. """Add *banner* to *code*, following template order conventions."""
  361. predecessor = None
  362. for template in code.ifilter_templates(recursive=False):
  363. name = template.name.lower().replace("_", " ")
  364. for regex in self.TOP_TEMPS:
  365. if re.match(regex, name):
  366. self.logger.debug("Skipping past top template: %s", name)
  367. predecessor = template
  368. break
  369. if "wikiproject" in name or name.startswith("wp"):
  370. self.logger.debug("Skipping past banner template: %s", name)
  371. predecessor = template
  372. if predecessor:
  373. self.logger.debug("Inserting banner after template")
  374. if not str(predecessor).endswith("\n"):
  375. banner = "\n" + banner
  376. post = code.index(predecessor) + 1
  377. if len(code.nodes) > post and not code.get(post).startswith("\n"):
  378. banner += "\n"
  379. code.insert_after(predecessor, banner)
  380. else:
  381. self.logger.debug("Inserting banner at beginning")
  382. code.insert(0, banner + "\n")
  383. class _Job:
  384. """Represents a single wikiproject-tagging task.
  385. Stores information on the banner to add, the edit summary to use, whether
  386. or not to autoassess and create new pages from scratch, and a counter of
  387. the number of pages edited.
  388. """
  389. def __init__(self, **kwargs):
  390. self.banner = kwargs["banner"]
  391. self.names = kwargs["names"]
  392. self.summary = kwargs["summary"]
  393. self.update = kwargs["update"]
  394. self.append = kwargs["append"]
  395. self.autoassess = kwargs["autoassess"]
  396. self.only_with = kwargs["only_with"]
  397. self.nocreate = kwargs["nocreate"]
  398. self.tag_categories = kwargs["tag_categories"]
  399. self.dry_run = kwargs["dry_run"]
  400. self.counter = 0
  401. self.processed_cats = set()
  402. self.processed_pages = set()
  403. class _ShutoffEnabled(Exception):
  404. """Raised by process_page() if shutoff is enabled. Caught by run(), which
  405. will then stop the task."""
  406. pass