A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

192 lines
7.2 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.commands import Command
  25. class Dictionary(Command):
  26. """Define words and stuff."""
  27. name = "dictionary"
  28. commands = ["dict", "dictionary", "define", "def"]
  29. def process(self, data):
  30. if not data.args:
  31. self.reply(data, "What do you want me to define?")
  32. return
  33. term = " ".join(data.args)
  34. lang = self.bot.wiki.get_site().lang
  35. try:
  36. defined = self.define(term, lang)
  37. except exceptions.APIError:
  38. msg = "Cannot find a {0}-language Wiktionary."
  39. self.reply(data, msg.format(lang))
  40. else:
  41. self.reply(data, defined.encode("utf8"))
  42. def define(self, term, lang, tries=2):
  43. try:
  44. site = self.bot.wiki.get_site(project="wiktionary", lang=lang)
  45. except exceptions.SiteNotFoundError:
  46. site = self.bot.wiki.add_site(project="wiktionary", lang=lang)
  47. page = site.get_page(term, follow_redirects=True)
  48. try:
  49. entry = page.get()
  50. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  51. if term.lower() != term and tries:
  52. return self.define(term.lower(), lang, tries - 1)
  53. if term.capitalize() != term and tries:
  54. return self.define(term.capitalize(), lang, tries - 1)
  55. return "No definition found."
  56. level, languages = self.get_languages(entry)
  57. if not languages:
  58. return "Couldn't parse {0}!".format(page.url)
  59. if "#" in term: # Requesting a specific language
  60. lcase_langs = {lang.lower(): lang for lang in languages}
  61. request = term.rsplit("#", 1)[1]
  62. lang = lcase_langs.get(request.lower())
  63. if not lang:
  64. resp = "Language {0} not found in definition."
  65. return resp.format(request)
  66. definition = self.get_definition(languages[lang], level)
  67. return "({0}) {1}".format(lang, definition)
  68. result = []
  69. for lang, section in sorted(languages.items()):
  70. definition = self.get_definition(section, level)
  71. result.append("({0}) {1}".format(lang, definition))
  72. return "; ".join(result)
  73. def get_languages(self, entry, level=2):
  74. regex = r"(?:\A|\n)==\s*([a-zA-Z0-9_ ]*?)\s*==(?:\Z|\n)"
  75. split = re.split(regex, entry)
  76. if len(split) % 2 == 0:
  77. if level == 2:
  78. return self.get_languages(entry, level=3)
  79. else:
  80. return 3, None
  81. return 2, None
  82. split.pop(0)
  83. languages = {}
  84. for i in range(0, len(split), 2):
  85. languages[split[i]] = split[i + 1]
  86. return level, languages
  87. def get_definition(self, section, level):
  88. parts_of_speech = {
  89. "v.": "Verb",
  90. "n.": "Noun",
  91. "pron.": "Pronoun",
  92. "adj.": "Adjective",
  93. "adv.": "Adverb",
  94. "prep.": "Preposition",
  95. "conj.": "Conjunction",
  96. "inter.": "Interjection",
  97. "symbol": "Symbol",
  98. "suffix": "Suffix",
  99. "initialism": "Initialism",
  100. "phrase": "Phrase",
  101. "proverb": "Proverb",
  102. "prop. n.": "Proper noun",
  103. "abbr.": "Abbreviation",
  104. "punct.": "Punctuation mark",
  105. }
  106. blocks = "=" * (level + 1)
  107. defs = []
  108. for part, basename in parts_of_speech.items():
  109. fullnames = [basename, r"\{\{" + basename + r"\}\}",
  110. r"\{\{" + basename.lower() + r"\}\}"]
  111. for fullname in fullnames:
  112. regex = blocks + r"\s*" + fullname + r"\s*" + blocks
  113. if re.search(regex, section):
  114. regex = blocks + r"\s*" + fullname
  115. regex += r"\s*{0}(.*?)(?:(?:{0})|\Z)".format(blocks)
  116. bodies = re.findall(regex, section, re.DOTALL)
  117. if bodies:
  118. for body in bodies:
  119. definition = self.parse_body(body)
  120. if definition:
  121. msg = "\x02{0}\x0F {1}"
  122. defs.append(msg.format(part, definition))
  123. return "; ".join(defs)
  124. def parse_body(self, body):
  125. substitutions = [
  126. (r"<!--(.*?)-->", ""),
  127. (r"<ref>(.*?)</ref>", ""),
  128. (r"\[\[[^\]|]*?\|([^\]|]*?)\]\]", r"\1"),
  129. (r"\{\{unsupported\|(.*?)\}\}", r"\1"),
  130. (r"\{\{(.*?) of\|([^}|]*?)(\|(.*?))?\}\}", r"\1 of \2."),
  131. (r"\{\{w\|(.*?)\}\}", r"\1"),
  132. (r"\{\{surname(.*?)\}\}", r"A surname."),
  133. (r"\{\{given name\|([^}|]*?)(\|(.*?))?\}\}", r"A \1 given name."),
  134. ]
  135. senses = []
  136. for line in body.splitlines():
  137. line = line.strip()
  138. if re.match(r"#\s*[^:*#]", line):
  139. for regex, repl in substitutions:
  140. line = re.sub(regex, repl, line)
  141. line = self.strip_templates(line)
  142. line = line[1:].replace("'''", "").replace("''", "")
  143. line = line.replace("[[", "").replace("]]", "")
  144. if line.strip():
  145. senses.append(line.strip()[0].upper() + line.strip()[1:])
  146. if not senses:
  147. return None
  148. if len(senses) == 1:
  149. return senses[0]
  150. result = [] # Number the senses incrementally
  151. for i, sense in enumerate(senses):
  152. result.append("{0}. {1}".format(i + 1, sense))
  153. return " ".join(result)
  154. def strip_templates(self, line):
  155. line = list(line)
  156. stripped = ""
  157. depth = 0
  158. while line:
  159. this = line.pop(0)
  160. if line:
  161. next = line[0]
  162. else:
  163. next = ""
  164. if this == "{" and next == "{":
  165. line.pop(0)
  166. depth += 1
  167. elif this == "}" and next == "}":
  168. line.pop(0)
  169. depth -= 1
  170. elif depth == 0:
  171. stripped += this
  172. return stripped