Additional IRC commands and bot tasks for EarwigBot https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

349 lines
13 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2021 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import difflib
  23. import json
  24. import re
  25. import sqlite3
  26. import subprocess
  27. import time
  28. import more_itertools
  29. import mwparserfromhell
  30. import unidecode
  31. from earwigbot.tasks import Task
  32. class SynonymAuthorities(Task):
  33. """
  34. Correct mismatched synonym authorities in taxon articles created by Qbugbot.
  35. """
  36. name = 'synonym_authorities'
  37. summary = (
  38. 'Fix {changes} mismatched synonym authorities per ITIS '
  39. '([[Wikipedia:Bots/Requests for approval/EarwigBot 21|more info]])'
  40. )
  41. def setup(self):
  42. self.site = self.bot.wiki.get_site()
  43. self.creator = 'Qbugbot'
  44. self.pages_path = 'qbugbot_pages.json'
  45. self.synonyms_path = 'qbugbot_synonyms.json'
  46. self.edits_path = 'qbugbot_edits.json'
  47. self.itis_path = 'itis.db'
  48. def run(self, action=None):
  49. if action == 'fetch_pages':
  50. self.fetch_pages()
  51. elif action == 'fetch_synonyms':
  52. self.fetch_synonyms()
  53. elif action == 'prepare_edits':
  54. self.prepare_edits()
  55. elif action == 'view_edits':
  56. self.view_edits()
  57. elif action == 'save_edits':
  58. self.save_edits()
  59. elif action is None:
  60. raise RuntimeError(f'This task requires an action')
  61. else:
  62. raise RuntimeError(f'No such action: {action}')
  63. def fetch_pages(self):
  64. """
  65. Fetch pages edited by Qbugbot.
  66. """
  67. pages = {}
  68. for chunk in more_itertools.chunked(self._iter_creations(), 500):
  69. pages.update(self._fetch_chunk(chunk))
  70. self.logger.info(f'Fetched {len(pages)} pages')
  71. with open(self.pages_path, 'w') as fp:
  72. json.dump(pages, fp)
  73. def _iter_creations(self):
  74. params = {
  75. 'action': 'query',
  76. 'list': 'usercontribs',
  77. 'ucuser': self.creator,
  78. 'uclimit': 5000,
  79. 'ucnamespace': 0,
  80. 'ucprop': 'ids',
  81. 'ucshow': 'new',
  82. 'formatversion': 2,
  83. }
  84. results = self.site.api_query(**params)
  85. while contribs := results['query']['usercontribs']:
  86. yield from contribs
  87. if 'continue' not in results:
  88. break
  89. params.update(results['continue'])
  90. results = self.site.api_query(**params)
  91. def _fetch_chunk(self, chunk):
  92. result = self.site.api_query(
  93. action='query',
  94. prop='revisions',
  95. rvprop='ids|content',
  96. rvslots='main',
  97. pageids='|'.join(str(page['pageid']) for page in chunk),
  98. formatversion=2,
  99. )
  100. pages = result['query']['pages']
  101. assert len(pages) == len(chunk)
  102. return {
  103. page['pageid']: {
  104. 'title': page['title'],
  105. 'content': page['revisions'][0]['slots']['main']['content'],
  106. 'revid': page['revisions'][0]['revid'],
  107. }
  108. for page in pages
  109. }
  110. def fetch_synonyms(self):
  111. """
  112. Fetch correct synonym lists for pages generated by fetch_pages.
  113. """
  114. with open(self.pages_path) as fp:
  115. pages = json.load(fp)
  116. wikidata = self.bot.wiki.get_site('wikidatawiki')
  117. itis_property = 'P815'
  118. conn = sqlite3.connect(self.itis_path)
  119. cur = conn.cursor()
  120. synonyms = {}
  121. for chunk in more_itertools.chunked(pages.items(), 50):
  122. titles = {page['title']: pageid for pageid, page in chunk}
  123. result = wikidata.api_query(
  124. action='wbgetentities',
  125. sites='enwiki',
  126. titles='|'.join(titles),
  127. props='claims|sitelinks',
  128. languages='en',
  129. sitefilter='enwiki',
  130. )
  131. for item in result['entities'].values():
  132. if 'sitelinks' not in item:
  133. self.logger.warning(f'No sitelinks for item: {item}')
  134. continue
  135. title = item['sitelinks']['enwiki']['title']
  136. pageid = titles[title]
  137. if itis_property not in item['claims']:
  138. self.logger.warning(f'No ITIS ID for [[{title}]]')
  139. continue
  140. claims = item['claims'][itis_property]
  141. assert len(claims) == 1, (title, claims)
  142. itis_id = claims[0]['mainsnak']['datavalue']['value']
  143. cur.execute("""
  144. SELECT synonym.complete_name, authors.taxon_author
  145. FROM synonym_links sl
  146. INNER JOIN taxonomic_units accepted ON sl.tsn_accepted = accepted.tsn
  147. INNER JOIN taxonomic_units synonym ON sl.tsn = synonym.tsn
  148. LEFT JOIN taxon_authors_lkp authors ON synonym.taxon_author_id = authors.taxon_author_id
  149. WHERE sl.tsn_accepted = ?
  150. UNION ALL
  151. SELECT complete_name, taxon_author
  152. FROM taxonomic_units accepted
  153. LEFT JOIN taxon_authors_lkp authors USING (taxon_author_id)
  154. WHERE accepted.tsn = ?;
  155. """, (itis_id, itis_id))
  156. synonyms[pageid] = cur.fetchall()
  157. self.logger.info(f'Fetched {len(synonyms)} synonym lists')
  158. with open(self.synonyms_path, 'w') as fp:
  159. json.dump(synonyms, fp)
  160. def prepare_edits(self):
  161. """
  162. Prepare edits based on the output of fetch_pages and fetch_synonyms.
  163. """
  164. with open(self.pages_path) as fp:
  165. pages = json.load(fp)
  166. with open(self.synonyms_path) as fp:
  167. synonyms = json.load(fp)
  168. edits = {}
  169. for pageid, pageinfo in pages.items():
  170. if pageid not in synonyms:
  171. continue
  172. wikitext = mwparserfromhell.parse(pageinfo['content'])
  173. try:
  174. changes = self._update_synonyms(pageinfo['title'], wikitext, synonyms[pageid])
  175. if not changes:
  176. continue
  177. except Exception:
  178. self.logger.error(f'Failed to update synonyms for [[{pageinfo["title"]}]]')
  179. raise
  180. edits[pageid] = {
  181. 'title': pageinfo['title'],
  182. 'revid': pageinfo['revid'],
  183. 'original': pageinfo['content'],
  184. 'content': str(wikitext),
  185. 'changes': changes,
  186. }
  187. with open(self.edits_path, 'w') as fp:
  188. json.dump(edits, fp)
  189. def _update_synonyms(self, title, wikitext, synonyms):
  190. if len(synonyms) <= 1:
  191. return False
  192. if wikitext.split('\n', 1)[0].upper().startswith('#REDIRECT'):
  193. self.logger.debug(f'[[{title}]]: Skipping redirect')
  194. return False
  195. taxoboxes = wikitext.filter_templates(
  196. matches=lambda tmpl: tmpl.name.matches(('Speciesbox', 'Automatic taxobox')))
  197. if not taxoboxes:
  198. self.logger.warning(f'[[{title}]]: No taxoboxes found')
  199. return False
  200. if len(taxoboxes) > 1:
  201. self.logger.warning(f'[[{title}]]: Multiple taxoboxes found')
  202. return False
  203. try:
  204. syn_param = taxoboxes[0].get('synonyms')
  205. except ValueError:
  206. self.logger.debug(f'[[{title}]]: No synonyms parameter in taxobox')
  207. return False
  208. tmpls = syn_param.value.filter_templates(
  209. matches=lambda tmpl: tmpl.name.matches(('Species list', 'Taxon list')))
  210. if not tmpls:
  211. # This means the bot's original work is no longer there. In most cases, this is
  212. # an unrelated synonym list added by another editor and there is nothing to check,
  213. # but it's possible someone converted the bot's list into a different format without
  214. # checking the authorities. Those cases need to be manually checked.
  215. self.logger.warning(f'[[{title}]]: Could not find a taxa list in taxobox')
  216. return False
  217. if len(tmpls) > 1:
  218. self.logger.warning(f'[[{title}]]: Multiple taxa lists found in taxobox')
  219. return False
  220. expected = {}
  221. for taxon, author in synonyms:
  222. if taxon in expected and expected[taxon] != author:
  223. # These need to be manually reviewed
  224. self.logger.warning(f'[[{title}]]: Expected synonym list has duplicates')
  225. return False
  226. expected[self._normalize(taxon)] = self._normalize(author)
  227. actual = {}
  228. formatted_authors = {}
  229. splist = tmpls[0]
  230. for i in range(len(splist.params) // 2):
  231. taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
  232. taxon = self._normalize(taxon_param.value)
  233. author = self._normalize(author_param.value)
  234. if taxon not in expected:
  235. self.logger.warning(f'[[{title}]]: Unknown synonym {taxon!r}')
  236. return False
  237. actual[taxon] = author
  238. formatted_authors.setdefault(author, []).append(author_param.value.strip())
  239. expected = {taxon: author for taxon, author in expected.items() if taxon in actual}
  240. assert set(expected.keys()) == set(actual.keys())
  241. if expected == actual:
  242. self.logger.debug(f'[[{title}]]: Nothing to update')
  243. return None
  244. if list(expected.values()) != list(actual.values()):
  245. if set(expected.values()) == set(actual.values()):
  246. self.logger.warning(f'[[{title}]]: Actual authors are not in expected order')
  247. else:
  248. self.logger.warning(f'[[{title}]]: Actual authors do not match expected')
  249. return False
  250. changes = []
  251. for i in range(len(splist.params) // 2):
  252. taxon_param, author_param = splist.params[2 * i], splist.params[2 * i + 1]
  253. taxon = self._normalize(taxon_param.value)
  254. if expected[taxon] != actual[taxon]:
  255. author = formatted_authors[expected[taxon]].pop(0)
  256. match = re.match(r'^(\s*).*?(\s*)$', str(author_param.value))
  257. ws_before, ws_after = match.group(1), match.group(2)
  258. author_param.value = f'{ws_before}{author}{ws_after}'
  259. changes.append((taxon, actual[taxon], expected[taxon]))
  260. if changes:
  261. self.logger.info(f'Will update {len(changes)} synonyms in [[{title}]]')
  262. else:
  263. self.logger.debug(f'Nothing to update in [[{title}]]')
  264. return changes
  265. @staticmethod
  266. def _normalize(value):
  267. """
  268. Normalize a taxon or author name.
  269. """
  270. if isinstance(value, mwparserfromhell.wikicode.Wikicode):
  271. value = value.strip_code()
  272. if not value or not value.strip():
  273. return None
  274. return unidecode.unidecode(value.strip().casefold().replace('&', 'and').replace(',', ''))
  275. def view_edits(self):
  276. """
  277. Examine edits prepared by prepare_edits.
  278. """
  279. with open(self.edits_path) as fp:
  280. edits = json.load(fp)
  281. self.logger.info(f'{len(edits)} pages to edit')
  282. for pageid, edit in edits.items():
  283. print(f'\n{pageid}: {edit["title"]}:')
  284. old, new = edit['original'], edit['content']
  285. udiff = difflib.unified_diff(old.splitlines(), new.splitlines(), 'old', 'new')
  286. subprocess.run(
  287. ['delta', '-s', '--paging', 'never'],
  288. input='\n'.join(udiff), text=True
  289. )
  290. def save_edits(self):
  291. """
  292. Save edits prepared by prepare_edits.
  293. """
  294. with open(self.edits_path) as fp:
  295. edits = json.load(fp)
  296. self.logger.info(f'{len(edits)} pages to edit')
  297. for pageid, edit in edits.items():
  298. page = self.site.get_page(edit['title'])
  299. self.logger.info(f'\n{pageid}: [[{page.title}]]')
  300. if self.shutoff_enabled(page):
  301. raise RuntimeError('Shutoff enabled')
  302. if not page.check_exclusion():
  303. self.logger.warning(f'[[{page.title}]]: Bot excluded from editing')
  304. continue
  305. page.edit(
  306. edit['content'],
  307. summary=self.summary.format(changes=len(edit['changes'])),
  308. baserevid=edit['revid'],
  309. )
  310. time.sleep(10)