A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

182 lines
6.7 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.commands import Command
  25. class Dictionary(Command):
  26. """Define words and stuff."""
  27. name = "dictionary"
  28. commands = ["dict", "dictionary", "define"]
  29. def process(self, data):
  30. if not data.args:
  31. self.reply(data, "What do you want me to define?")
  32. return
  33. term = " ".join(data.args)
  34. lang = self.bot.wiki.get_site().lang
  35. try:
  36. defined = self.define(term, lang)
  37. except exceptions.APIError:
  38. msg = "Cannot find a {0}-language Wiktionary."
  39. self.reply(data, msg.format(lang))
  40. else:
  41. self.reply(data, defined.encode("utf8"))
  42. def define(self, term, lang, tries=2):
  43. try:
  44. site = self.bot.wiki.get_site(project="wiktionary", lang=lang)
  45. except exceptions.SiteNotFoundError:
  46. site = self.bot.wiki.add_site(project="wiktionary", lang=lang)
  47. page = site.get_page(term, follow_redirects=True)
  48. try:
  49. entry = page.get()
  50. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  51. if term.lower() != term and tries:
  52. return self.define(term.lower(), lang, tries - 1)
  53. if term.capitalize() != term and tries:
  54. return self.define(term.capitalize(), lang, tries - 1)
  55. return "No definition found."
  56. level, languages = self.get_languages(entry)
  57. if not languages:
  58. return u"Couldn't parse {0}!".format(page.url)
  59. result = []
  60. for lang, section in sorted(languages.items()):
  61. definition = self.get_definition(section, level)
  62. result.append(u"({0}) {1}".format(lang, definition))
  63. return u"; ".join(result)
  64. def get_languages(self, entry, level=2):
  65. regex = r"(?:\A|\n)==\s*([a-zA-Z0-9_ ]*?)\s*==(?:\Z|\n)"
  66. split = re.split(regex, entry)
  67. if len(split) % 2 == 0:
  68. if level == 2:
  69. return self.get_languages(entry, level=3)
  70. else:
  71. return 3, None
  72. return 2, None
  73. split.pop(0)
  74. languages = {}
  75. for i in xrange(0, len(split), 2):
  76. languages[split[i]] = split[i + 1]
  77. return level, languages
  78. def get_definition(self, section, level):
  79. parts_of_speech = {
  80. "v.": "Verb",
  81. "n.": "Noun",
  82. "pron.": "Pronoun",
  83. "adj.": "Adjective",
  84. "adv.": "Adverb",
  85. "prep.": "Preposition",
  86. "conj.": "Conjunction",
  87. "inter.": "Interjection",
  88. "symbol": "Symbol",
  89. "suffix": "Suffix",
  90. "initialism": "Initialism",
  91. "phrase": "Phrase",
  92. "proverb": "Proverb",
  93. "prop. n.": "Proper noun",
  94. "abbr.": "Abbreviation",
  95. "punct.": "Punctuation mark",
  96. }
  97. blocks = "=" * (level + 1)
  98. defs = []
  99. for part, basename in parts_of_speech.iteritems():
  100. fullnames = [basename, "\{\{" + basename + "\}\}",
  101. "\{\{" + basename.lower() + "\}\}"]
  102. for fullname in fullnames:
  103. regex = blocks + "\s*" + fullname + "\s*" + blocks
  104. if re.search(regex, section):
  105. regex = blocks + "\s*" + fullname
  106. regex += "\s*{0}(.*?)(?:(?:{0})|\Z)".format(blocks)
  107. bodies = re.findall(regex, section, re.DOTALL)
  108. if bodies:
  109. for body in bodies:
  110. definition = self.parse_body(body)
  111. if definition:
  112. msg = u"\x02{0}\x0F {1}"
  113. defs.append(msg.format(part, definition))
  114. return "; ".join(defs)
  115. def parse_body(self, body):
  116. substitutions = [
  117. ("<!--(.*?)-->", ""),
  118. ("<ref>(.*?)</ref>", ""),
  119. ("\[\[[^\]|]*?\|([^\]|]*?)\]\]", r"\1"),
  120. ("\{\{unsupported\|(.*?)\}\}", r"\1"),
  121. ("\{\{(.*?) of\|([^}|]*?)(\|(.*?))?\}\}", r"\1 of \2."),
  122. ("\{\{w\|(.*?)\}\}", r"\1"),
  123. ("\{\{surname(.*?)\}\}", r"A surname."),
  124. ("\{\{given name\|([^}|]*?)(\|(.*?))?\}\}", r"A \1 given name."),
  125. ]
  126. senses = []
  127. for line in body.splitlines():
  128. line = line.strip()
  129. if re.match("#\s*[^:*#]", line):
  130. for regex, repl in substitutions:
  131. line = re.sub(regex, repl, line)
  132. line = self.strip_templates(line)
  133. line = line[1:].replace("'''", "").replace("''", "")
  134. line = line.replace("[[", "").replace("]]", "")
  135. if line.strip():
  136. senses.append(line.strip()[0].upper() + line.strip()[1:])
  137. if not senses:
  138. return None
  139. if len(senses) == 1:
  140. return senses[0]
  141. result = [] # Number the senses incrementally
  142. for i, sense in enumerate(senses):
  143. result.append(u"{0}. {1}".format(i + 1, sense))
  144. return " ".join(result)
  145. def strip_templates(self, line):
  146. line = list(line)
  147. stripped = ""
  148. depth = 0
  149. while line:
  150. this = line.pop(0)
  151. if line:
  152. next = line[0]
  153. else:
  154. next = ""
  155. if this == "{" and next == "{":
  156. line.pop(0)
  157. depth += 1
  158. elif this == "}" and next == "}":
  159. line.pop(0)
  160. depth -= 1
  161. elif depth == 0:
  162. stripped += this
  163. return stripped