Additional IRC commands and bot tasks for EarwigBot https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

354 lines
14 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2021 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import difflib
  23. import json
  24. import re
  25. import sqlite3
  26. import subprocess
  27. import time
  28. import more_itertools
  29. import mwparserfromhell
  30. import unidecode
  31. from earwigbot.tasks import Task
  32. class SynonymAuthorities(Task):
  33. """
  34. Correct mismatched synonym authorities in taxon articles created by Qbugbot.
  35. """
  36. name = 'synonym_authorities'
  37. number = 21
  38. base_summary = (
  39. 'Fix {changes} mismatched synonym authorities per ITIS '
  40. '([[Wikipedia:Bots/Requests for approval/EarwigBot 21|more info]])'
  41. )
  42. def setup(self):
  43. self.site = self.bot.wiki.get_site()
  44. self.creator = 'Qbugbot'
  45. self.pages_path = 'qbugbot_pages.json'
  46. self.synonyms_path = 'qbugbot_synonyms.json'
  47. self.edits_path = 'qbugbot_edits.json'
  48. self.itis_path = 'itis.db'
  49. self.summary = self.make_summary(self.base_summary)
  50. def run(self, action=None):
  51. if action == 'fetch_pages':
  52. self.fetch_pages()
  53. elif action == 'fetch_synonyms':
  54. self.fetch_synonyms()
  55. elif action == 'prepare_edits':
  56. self.prepare_edits()
  57. elif action == 'view_edits':
  58. self.view_edits()
  59. elif action == 'save_edits':
  60. self.save_edits()
  61. elif action is None:
  62. raise RuntimeError(f'This task requires an action')
  63. else:
  64. raise RuntimeError(f'No such action: {action}')
  65. def fetch_pages(self):
  66. """
  67. Fetch pages edited by Qbugbot.
  68. """
  69. pages = {}
  70. for chunk in more_itertools.chunked(self._iter_creations(), 500):
  71. pages.update(self._fetch_chunk(chunk))
  72. self.logger.info(f'Fetched {len(pages)} pages')
  73. with open(self.pages_path, 'w') as fp:
  74. json.dump(pages, fp)
  75. def _iter_creations(self):
  76. # TODO: include converted redirects ([[Category:Articles created by Qbugbot]])
  77. params = {
  78. 'action': 'query',
  79. 'list': 'usercontribs',
  80. 'ucuser': self.creator,
  81. 'uclimit': 5000,
  82. 'ucnamespace': 0,
  83. 'ucprop': 'ids',
  84. 'ucshow': 'new',
  85. 'formatversion': 2,
  86. }
  87. results = self.site.api_query(**params)
  88. while contribs := results['query']['usercontribs']:
  89. yield from contribs
  90. if 'continue' not in results:
  91. break
  92. params.update(results['continue'])
  93. results = self.site.api_query(**params)
  94. def _fetch_chunk(self, chunk):
  95. result = self.site.api_query(
  96. action='query',
  97. prop='revisions',
  98. rvprop='ids|content',
  99. rvslots='main',
  100. pageids='|'.join(str(page['pageid']) for page in chunk),
  101. formatversion=2,
  102. )
  103. pages = result['query']['pages']
  104. assert len(pages) == len(chunk)
  105. return {
  106. page['pageid']: {
  107. 'title': page['title'],
  108. 'content': page['revisions'][0]['slots']['main']['content'],
  109. 'revid': page['revisions'][0]['revid'],
  110. }
  111. for page in pages
  112. }
  113. def fetch_synonyms(self):
  114. """
  115. Fetch correct synonym lists for pages generated by fetch_pages.
  116. """
  117. with open(self.pages_path) as fp:
  118. pages = json.load(fp)
  119. wikidata = self.bot.wiki.get_site('wikidatawiki')
  120. itis_property = 'P815'
  121. conn = sqlite3.connect(self.itis_path)
  122. cur = conn.cursor()
  123. synonyms = {}
  124. for chunk in more_itertools.chunked(pages.items(), 50):
  125. titles = {page['title']: pageid for pageid, page in chunk}
  126. result = wikidata.api_query(
  127. action='wbgetentities',
  128. sites='enwiki',
  129. titles='|'.join(titles),
  130. props='claims|sitelinks',
  131. languages='en',
  132. sitefilter='enwiki',
  133. )
  134. for item in result['entities'].values():
  135. if 'sitelinks' not in item:
  136. self.logger.warning(f'No sitelinks for item: {item}')
  137. continue
  138. title = item['sitelinks']['enwiki']['title']
  139. pageid = titles[title]
  140. if itis_property not in item['claims']:
  141. self.logger.warning(f'No ITIS ID for [[{title}]]')
  142. continue
  143. claims = item['claims'][itis_property]
  144. assert len(claims) == 1, (title, claims)
  145. itis_id = claims[0]['mainsnak']['datavalue']['value']
  146. cur.execute("""
  147. SELECT synonym.complete_name, authors.taxon_author
  148. FROM synonym_links sl
  149. INNER JOIN taxonomic_units accepted ON sl.tsn_accepted = accepted.tsn
  150. INNER JOIN taxonomic_units synonym ON sl.tsn = synonym.tsn
  151. LEFT JOIN taxon_authors_lkp authors ON synonym.taxon_author_id = authors.taxon_author_id
  152. WHERE sl.tsn_accepted = ?
  153. UNION ALL
  154. SELECT complete_name, taxon_author
  155. FROM taxonomic_units accepted
  156. LEFT JOIN taxon_authors_lkp authors USING (taxon_author_id)
  157. WHERE accepted.tsn = ?;
  158. """, (itis_id, itis_id))
  159. synonyms[pageid] = cur.fetchall()
  160. self.logger.info(f'Fetched {len(synonyms)} synonym lists')
  161. with open(self.synonyms_path, 'w') as fp:
  162. json.dump(synonyms, fp)
  163. def prepare_edits(self):
  164. """
  165. Prepare edits based on the output of fetch_pages and fetch_synonyms.
  166. """
  167. with open(self.pages_path) as fp:
  168. pages = json.load(fp)
  169. with open(self.synonyms_path) as fp:
  170. synonyms = json.load(fp)
  171. edits = {}
  172. for pageid, pageinfo in pages.items():
  173. if pageid not in synonyms:
  174. continue
  175. wikitext = mwparserfromhell.parse(pageinfo['content'])
  176. try:
  177. changes = self._update_synonyms(pageinfo['title'], wikitext, synonyms[pageid])
  178. if not changes:
  179. continue
  180. except Exception:
  181. self.logger.error(f'Failed to update synonyms for [[{pageinfo["title"]}]]')
  182. raise
  183. edits[pageid] = {
  184. 'title': pageinfo['title'],
  185. 'revid': pageinfo['revid'],
  186. 'original': pageinfo['content'],
  187. 'content': str(wikitext),
  188. 'changes': changes,
  189. }
  190. with open(self.edits_path, 'w') as fp:
  191. json.dump(edits, fp)
  192. def _update_synonyms(self, title, wikitext, synonyms):
  193. if len(synonyms) <= 1:
  194. return False
  195. if wikitext.split('\n', 1)[0].upper().startswith('#REDIRECT'):
  196. self.logger.debug(f'[[{title}]]: Skipping redirect')
  197. return False
  198. taxoboxes = wikitext.filter_templates(
  199. matches=lambda tmpl: tmpl.name.matches(('Speciesbox', 'Automatic taxobox')))
  200. if not taxoboxes:
  201. self.logger.warning(f'[[{title}]]: No taxoboxes found')
  202. return False
  203. if len(taxoboxes) > 1:
  204. self.logger.warning(f'[[{title}]]: Multiple taxoboxes found')
  205. return False
  206. try:
  207. syn_param = taxoboxes[0].get('synonyms')
  208. except ValueError:
  209. self.logger.debug(f'[[{title}]]: No synonyms parameter in taxobox')
  210. return False
  211. tmpls = syn_param.value.filter_templates(
  212. matches=lambda tmpl: tmpl.name.matches(('Species list', 'Taxon list')))
  213. if not tmpls:
  214. # This means the bot's original work is no longer there. In most cases, this is
  215. # an unrelated synonym list added by another editor and there is nothing to check,
  216. # but it's possible someone converted the bot's list into a different format without
  217. # checking the authorities. Those cases need to be manually checked.
  218. self.logger.warning(f'[[{title}]]: Could not find a taxa list in taxobox')
  219. return False
  220. if len(tmpls) > 1:
  221. self.logger.warning(f'[[{title}]]: Multiple taxa lists found in taxobox')
  222. return False
  223. expected = {}
  224. for taxon, author in synonyms:
  225. if taxon in expected and expected[taxon] != author:
  226. # These need to be manually reviewed
  227. self.logger.warning(f'[[{title}]]: Expected synonym list has duplicates')
  228. return False
  229. expected[self._normalize(taxon)] = self._normalize(author)
  230. actual = {}
  231. formatted_authors = {}
  232. splist = tmpls[0]
  233. for i in range(len(splist.params) // 2):
  234. taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
  235. taxon = self._normalize(taxon_param.value)
  236. author = self._normalize(author_param.value)
  237. if taxon not in expected:
  238. self.logger.warning(f'[[{title}]]: Unknown synonym {taxon!r}')
  239. return False
  240. actual[taxon] = author
  241. formatted_authors.setdefault(author, []).append(author_param.value.strip())
  242. expected = {taxon: author for taxon, author in expected.items() if taxon in actual}
  243. assert set(expected.keys()) == set(actual.keys())
  244. if expected == actual:
  245. self.logger.debug(f'[[{title}]]: Nothing to update')
  246. return None
  247. if list(expected.values()) != list(actual.values()):
  248. if set(expected.values()) == set(actual.values()):
  249. self.logger.warning(f'[[{title}]]: Actual authors are not in expected order')
  250. else:
  251. self.logger.warning(f'[[{title}]]: Actual authors do not match expected')
  252. return False
  253. changes = []
  254. for i in range(len(splist.params) // 2):
  255. taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
  256. taxon = self._normalize(taxon_param.value)
  257. if expected[taxon] != actual[taxon]:
  258. author = formatted_authors[expected[taxon]].pop(0)
  259. match = re.match(r'^(\s*).*?(\s*)$', str(author_param.value))
  260. ws_before, ws_after = match.group(1), match.group(2)
  261. author_param.value = f'{ws_before}{author}{ws_after}'
  262. changes.append((taxon, actual[taxon], expected[taxon]))
  263. if changes:
  264. self.logger.info(f'Will update {len(changes)} synonyms in [[{title}]]')
  265. else:
  266. self.logger.debug(f'Nothing to update in [[{title}]]')
  267. return changes
  268. @staticmethod
  269. def _normalize(value):
  270. """
  271. Normalize a taxon or author name.
  272. """
  273. if isinstance(value, mwparserfromhell.wikicode.Wikicode):
  274. value = value.strip_code()
  275. if not value or not value.strip():
  276. return None
  277. return unidecode.unidecode(value.strip().casefold().replace('&', 'and').replace(',', ''))
  278. def view_edits(self):
  279. """
  280. Examine edits prepared by prepare_edits.
  281. """
  282. with open(self.edits_path) as fp:
  283. edits = json.load(fp)
  284. self.logger.info(f'{len(edits)} pages to edit')
  285. for pageid, edit in edits.items():
  286. print(f'\n{pageid}: {edit["title"]}:')
  287. old, new = edit['original'], edit['content']
  288. udiff = difflib.unified_diff(old.splitlines(), new.splitlines(), 'old', 'new')
  289. subprocess.run(
  290. ['delta', '-s', '--paging', 'never'],
  291. input='\n'.join(udiff), text=True
  292. )
  293. def save_edits(self):
  294. """
  295. Save edits prepared by prepare_edits.
  296. """
  297. with open(self.edits_path) as fp:
  298. edits = json.load(fp)
  299. self.logger.info(f'{len(edits)} pages to edit')
  300. for pageid, edit in edits.items():
  301. page = self.site.get_page(edit['title'])
  302. self.logger.info(f'{pageid}: [[{page.title}]]')
  303. if self.shutoff_enabled():
  304. raise RuntimeError('Shutoff enabled')
  305. if not page.check_exclusion():
  306. self.logger.warning(f'[[{page.title}]]: Bot excluded from editing')
  307. continue
  308. page.edit(
  309. edit['content'],
  310. summary=self.summary.format(changes=len(edit['changes'])),
  311. baserevid=edit['revid'],
  312. basetimestamp=None,
  313. starttimestamp=None,
  314. )
  315. time.sleep(10)