Additional IRC commands and bot tasks for EarwigBot https://en.wikipedia.org/wiki/User:EarwigBot

synonym_authorities.py

# Copyright (C) 2021 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import difflib
import json
import re
import sqlite3
import subprocess
import time

import more_itertools
import mwparserfromhell
import unidecode

from earwigbot.tasks import Task
class SynonymAuthorities(Task):
    """
    Correct mismatched synonym authorities in taxon articles created by Qbugbot.
    """

    name = "synonym_authorities"
    number = 21
    base_summary = (
        "Fix {changes} mismatched synonym authorities per ITIS "
        "([[Wikipedia:Bots/Requests for approval/EarwigBot 21|more info]])"
    )

    def setup(self):
        self.site = self.bot.wiki.get_site()
        self.creator = "Qbugbot"
        self.pages_path = "qbugbot_pages.json"
        self.synonyms_path = "qbugbot_synonyms.json"
        self.edits_path = "qbugbot_edits.json"
        self.itis_path = "itis.db"
        self.summary = self.make_summary(self.base_summary)

    def run(self, action=None):
        if action == "fetch_pages":
            self.fetch_pages()
        elif action == "fetch_synonyms":
            self.fetch_synonyms()
        elif action == "prepare_edits":
            self.prepare_edits()
        elif action == "view_edits":
            self.view_edits()
        elif action == "save_edits":
            self.save_edits()
        elif action is None:
            raise RuntimeError("This task requires an action")
        else:
            raise RuntimeError(f"No such action: {action}")
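
    # The actions above form a pipeline and are meant to be run in order:
    # fetch_pages -> fetch_synonyms -> prepare_edits -> view_edits (manual
    # review of the diffs) -> save_edits. Each stage writes its output to a
    # JSON file on disk, so the stages can be run as separate invocations.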
    def fetch_pages(self):
        """
        Fetch pages edited by Qbugbot.
        """
        pages = {}
        for chunk in more_itertools.chunked(self._iter_creations(), 500):
            pages.update(self._fetch_chunk(chunk))
        self.logger.info(f"Fetched {len(pages)} pages")
        with open(self.pages_path, "w") as fp:
            json.dump(pages, fp)

    def _iter_creations(self):
        # TODO: include converted redirects ([[Category:Articles created by Qbugbot]])
        params = {
            "action": "query",
            "list": "usercontribs",
            "ucuser": self.creator,
            "uclimit": 5000,
            "ucnamespace": 0,
            "ucprop": "ids",
            "ucshow": "new",
            "formatversion": 2,
        }
        results = self.site.api_query(**params)
        while contribs := results["query"]["usercontribs"]:
            yield from contribs
            if "continue" not in results:
                break
            params.update(results["continue"])
            results = self.site.api_query(**params)
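
    # NOTE: 500 page IDs per request is the MediaWiki API's maximum for
    # accounts with the apihighlimits right (50 without it), which is why
    # fetch_pages chunks its requests by 500.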
    def _fetch_chunk(self, chunk):
        result = self.site.api_query(
            action="query",
            prop="revisions",
            rvprop="ids|content",
            rvslots="main",
            pageids="|".join(str(page["pageid"]) for page in chunk),
            formatversion=2,
        )
        pages = result["query"]["pages"]
        assert len(pages) == len(chunk)
        return {
            page["pageid"]: {
                "title": page["title"],
                "content": page["revisions"][0]["slots"]["main"]["content"],
                "revid": page["revisions"][0]["revid"],
            }
            for page in pages
        }
    def fetch_synonyms(self):
        """
        Fetch correct synonym lists for pages generated by fetch_pages.
        """
        with open(self.pages_path) as fp:
            pages = json.load(fp)
        wikidata = self.bot.wiki.get_site("wikidatawiki")
        itis_property = "P815"
        conn = sqlite3.connect(self.itis_path)
        cur = conn.cursor()
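
        # Map enwiki titles to Wikidata items in batches of 50 (the
        # wbgetentities cap on titles per request) to read each article's
        # ITIS TSN from property P815.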
        synonyms = {}
        for chunk in more_itertools.chunked(pages.items(), 50):
            titles = {page["title"]: pageid for pageid, page in chunk}
            result = wikidata.api_query(
                action="wbgetentities",
                sites="enwiki",
                titles="|".join(titles),
                props="claims|sitelinks",
                languages="en",
                sitefilter="enwiki",
            )
            for item in result["entities"].values():
                if "sitelinks" not in item:
                    self.logger.warning(f"No sitelinks for item: {item}")
                    continue
                title = item["sitelinks"]["enwiki"]["title"]
                pageid = titles[title]
                if itis_property not in item["claims"]:
                    self.logger.warning(f"No ITIS ID for [[{title}]]")
                    continue
                claims = item["claims"][itis_property]
                assert len(claims) == 1, (title, claims)
                itis_id = claims[0]["mainsnak"]["datavalue"]["value"]
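                # The first SELECT pulls every synonym of the accepted TSN
                # via synonym_links, with authors attached; the UNION ALL adds
                # the accepted name itself, since it can also appear in an
                # article's synonym list.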
                cur.execute(
                    """
                    SELECT synonym.complete_name, authors.taxon_author
                    FROM synonym_links sl
                    INNER JOIN taxonomic_units accepted ON sl.tsn_accepted = accepted.tsn
                    INNER JOIN taxonomic_units synonym ON sl.tsn = synonym.tsn
                    LEFT JOIN taxon_authors_lkp authors ON synonym.taxon_author_id = authors.taxon_author_id
                    WHERE sl.tsn_accepted = ?
                    UNION ALL
                    SELECT complete_name, taxon_author
                    FROM taxonomic_units accepted
                    LEFT JOIN taxon_authors_lkp authors USING (taxon_author_id)
                    WHERE accepted.tsn = ?;
                    """,
                    (itis_id, itis_id),
                )
                synonyms[pageid] = cur.fetchall()

        self.logger.info(f"Fetched {len(synonyms)} synonym lists")
        with open(self.synonyms_path, "w") as fp:
            json.dump(synonyms, fp)
    def prepare_edits(self):
        """
        Prepare edits based on the output of fetch_pages and fetch_synonyms.
        """
        with open(self.pages_path) as fp:
            pages = json.load(fp)
        with open(self.synonyms_path) as fp:
            synonyms = json.load(fp)

        edits = {}
        for pageid, pageinfo in pages.items():
            if pageid not in synonyms:
                continue
            wikitext = mwparserfromhell.parse(pageinfo["content"])
            try:
                changes = self._update_synonyms(
                    pageinfo["title"], wikitext, synonyms[pageid]
                )
                if not changes:
                    continue
            except Exception:
                self.logger.error(
                    f'Failed to update synonyms for [[{pageinfo["title"]}]]'
                )
                raise
            edits[pageid] = {
                "title": pageinfo["title"],
                "revid": pageinfo["revid"],
                "original": pageinfo["content"],
                "content": str(wikitext),
                "changes": changes,
            }

        with open(self.edits_path, "w") as fp:
            json.dump(edits, fp)
    def _update_synonyms(self, title, wikitext, synonyms):
        if len(synonyms) <= 1:
            return False
        if wikitext.split("\n", 1)[0].upper().startswith("#REDIRECT"):
            self.logger.debug(f"[[{title}]]: Skipping redirect")
            return False

        taxoboxes = wikitext.filter_templates(
            matches=lambda tmpl: tmpl.name.matches(("Speciesbox", "Automatic taxobox"))
        )
        if not taxoboxes:
            self.logger.warning(f"[[{title}]]: No taxoboxes found")
            return False
        if len(taxoboxes) > 1:
            self.logger.warning(f"[[{title}]]: Multiple taxoboxes found")
            return False

        try:
            syn_param = taxoboxes[0].get("synonyms")
        except ValueError:
            self.logger.debug(f"[[{title}]]: No synonyms parameter in taxobox")
            return False

        tmpls = syn_param.value.filter_templates(
            matches=lambda tmpl: tmpl.name.matches(("Species list", "Taxon list"))
        )
        if not tmpls:
            # This means the bot's original work is no longer there. In most
            # cases, this is an unrelated synonym list added by another editor
            # and there is nothing to check, but it's possible someone
            # converted the bot's list into a different format without
            # checking the authorities. Those cases need to be manually
            # checked.
            self.logger.warning(f"[[{title}]]: Could not find a taxa list in taxobox")
            return False
        if len(tmpls) > 1:
            self.logger.warning(f"[[{title}]]: Multiple taxa lists found in taxobox")
            return False

        expected = {}
        for taxon, author in synonyms:
            # Normalize before the duplicate check so that variant spellings
            # of the same name are compared consistently.
            taxon, author = self._normalize(taxon), self._normalize(author)
            if taxon in expected and expected[taxon] != author:
                # These need to be manually reviewed
                self.logger.warning(
                    f"[[{title}]]: Expected synonym list has duplicates"
                )
                return False
            expected[taxon] = author

        actual = {}
        formatted_authors = {}
        splist = tmpls[0]
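        # {{Species list}} and {{Taxon list}} take alternating positional
        # parameters (taxon, author, taxon, author, ...), so the params are
        # walked in consecutive pairs.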
        for i in range(len(splist.params) // 2):
            taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
            taxon = self._normalize(taxon_param.value)
            author = self._normalize(author_param.value)
            if taxon not in expected:
                self.logger.warning(f"[[{title}]]: Unknown synonym {taxon!r}")
                return False
            actual[taxon] = author
            formatted_authors.setdefault(author, []).append(author_param.value.strip())

        expected = {
            taxon: author for taxon, author in expected.items() if taxon in actual
        }
        assert set(expected.keys()) == set(actual.keys())
        if expected == actual:
            self.logger.debug(f"[[{title}]]: Nothing to update")
            return None
        if list(expected.values()) != list(actual.values()):
            if set(expected.values()) == set(actual.values()):
                self.logger.warning(
                    f"[[{title}]]: Actual authors are not in expected order"
                )
            else:
                self.logger.warning(
                    f"[[{title}]]: Actual authors do not match expected"
                )
            return False

        changes = []
        for i in range(len(splist.params) // 2):
            taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
            taxon = self._normalize(taxon_param.value)
            if expected[taxon] != actual[taxon]:
                author = formatted_authors[expected[taxon]].pop(0)
                match = re.match(r"^(\s*).*?(\s*)$", str(author_param.value))
                ws_before, ws_after = match.group(1), match.group(2)
                author_param.value = f"{ws_before}{author}{ws_after}"
                changes.append((taxon, actual[taxon], expected[taxon]))

        if changes:
            self.logger.info(f"Will update {len(changes)} synonyms in [[{title}]]")
        else:
            self.logger.debug(f"Nothing to update in [[{title}]]")
        return changes
    @staticmethod
    def _normalize(value):
        """
        Normalize a taxon or author name.
        """
        if isinstance(value, mwparserfromhell.wikicode.Wikicode):
            value = value.strip_code()
        if not value or not value.strip():
            return None
        return unidecode.unidecode(
            value.strip().casefold().replace("&", "and").replace(",", "")
        )
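
    # For example, _normalize("Müller, 1776") yields "muller 1776", and
    # _normalize(" Smith & Jones ") yields "smith and jones".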
    def view_edits(self):
        """
        Examine edits prepared by prepare_edits.
        """
        with open(self.edits_path) as fp:
            edits = json.load(fp)
        self.logger.info(f"{len(edits)} pages to edit")

        for pageid, edit in edits.items():
            print(f'\n{pageid}: {edit["title"]}:')
            old, new = edit["original"], edit["content"]
            udiff = difflib.unified_diff(
                old.splitlines(), new.splitlines(), "old", "new"
            )
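            # Pretty-print the diff with delta
            # (https://github.com/dandavison/delta), which must be installed
            # and on the PATH; "-s" renders it side-by-side.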
            subprocess.run(
                ["delta", "-s", "--paging", "never"], input="\n".join(udiff), text=True
            )
    def save_edits(self):
        """
        Save edits prepared by prepare_edits.
        """
        with open(self.edits_path) as fp:
            edits = json.load(fp)
        self.logger.info(f"{len(edits)} pages to edit")

        for pageid, edit in edits.items():
            page = self.site.get_page(edit["title"])
            self.logger.info(f"{pageid}: [[{page.title}]]")
            if self.shutoff_enabled():
                raise RuntimeError("Shutoff enabled")
            if not page.check_exclusion():
                self.logger.warning(f"[[{page.title}]]: Bot excluded from editing")
                continue
            page.edit(
                edit["content"],
                summary=self.summary.format(changes=len(edit["changes"])),
                baserevid=edit["revid"],
                basetimestamp=None,
                starttimestamp=None,
            )
            time.sleep(10)
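
A rough sketch of driving the pipeline from Python, assuming a standard EarwigBot setup in which Bot is pointed at the bot's project directory and tasks are dispatched through bot.tasks.start() ("path/to/project" is a placeholder); each stage is run on its own, in order, since every stage reads the JSON written by the previous one:

from earwigbot.bot import Bot

bot = Bot("path/to/project")  # placeholder: directory containing the bot's config
bot.tasks.start("synonym_authorities", action="fetch_pages")
# ...then, once each stage finishes, in subsequent runs:
# bot.tasks.start("synonym_authorities", action="fetch_synonyms")
# bot.tasks.start("synonym_authorities", action="prepare_edits")
# bot.tasks.start("synonym_authorities", action="view_edits")
# bot.tasks.start("synonym_authorities", action="save_edits")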