A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

179 lines
6.5 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. import re
  23. from earwigbot import exceptions
  24. from earwigbot.commands import Command
  25. class Dictionary(Command):
  26. """Define words and stuff."""
  27. name = "dictionary"
  28. commands = ["dict", "dictionary", "define"]
  29. def process(self, data):
  30. if not data.args:
  31. self.reply(data, "what do you want me to define?")
  32. return
  33. term = " ".join(data.args)
  34. lang = self.bot.wiki.get_site().lang
  35. try:
  36. defined = self.define(term, lang)
  37. except exceptions.APIError:
  38. msg = "cannot find a {0}-language Wiktionary."
  39. self.reply(data, msg.format(lang))
  40. else:
  41. self.reply(data, defined.encode("utf8"))
  42. def define(self, term, lang, tries=2):
  43. try:
  44. site = self.bot.wiki.get_site(project="wiktionary", lang=lang)
  45. except exceptions.SiteNotFoundError:
  46. site = self.bot.wiki.add_site(project="wiktionary", lang=lang)
  47. page = site.get_page(term)
  48. try:
  49. entry = page.get()
  50. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  51. if term.lower() != term and tries:
  52. return self.define(term.lower(), lang, tries - 1)
  53. if term.capitalize() != term and tries:
  54. return self.define(term.capitalize(), lang, tries - 1)
  55. return "no definition found."
  56. level, languages = self.get_languages(entry)
  57. if not languages:
  58. return u"couldn't parse {0}!".format(page.url)
  59. result = []
  60. for lang, section in sorted(languages.items()):
  61. definition = self.get_definition(section, level)
  62. result.append(u"({0}) {1}".format(lang, definition))
  63. return u"; ".join(result)
  64. def get_languages(self, entry, level=2):
  65. regex = r"(?:\A|\n)==\s*([a-zA-Z0-9_ ]*?)\s*==(?:\Z|\n)"
  66. split = re.split(regex, entry)
  67. if len(split) % 2 == 0:
  68. if level == 2:
  69. return self.get_languages(entry, level=3)
  70. else:
  71. return 3, None
  72. return 2, None
  73. split.pop(0)
  74. languages = {}
  75. for i in xrange(0, len(split), 2):
  76. languages[split[i]] = split[i + 1]
  77. return level, languages
  78. def get_definition(self, section, level):
  79. parts_of_speech = {
  80. "v.": "Verb",
  81. "n.": "Noun",
  82. "pron.": "Pronoun",
  83. "adj.": "Adjective",
  84. "adv.": "Adverb",
  85. "prep.": "Preposition",
  86. "conj.": "Conjunction",
  87. "inter.": "Interjection",
  88. "symbol": "Symbol",
  89. "suffix": "Suffix",
  90. "initialism": "Initialism",
  91. "phrase": "Phrase",
  92. "proverb": "Proverb",
  93. "prop. n.": "Proper noun",
  94. "abbr.": "Abbreviation",
  95. }
  96. blocks = "=" * (level + 1)
  97. defs = []
  98. for part, basename in parts_of_speech.iteritems():
  99. fullnames = [basename, "\{\{" + basename + "\}\}",
  100. "\{\{" + basename.lower() + "\}\}"]
  101. for fullname in fullnames:
  102. regex = blocks + "\s*" + fullname + "\s*" + blocks
  103. if re.search(regex, section):
  104. regex = blocks + "\s*" + fullname
  105. regex += "\s*{0}(.*?)(?:(?:{0})|\Z)".format(blocks)
  106. bodies = re.findall(regex, section, re.DOTALL)
  107. if bodies:
  108. for body in bodies:
  109. definition = self.parse_body(body)
  110. if definition:
  111. msg = u"\x02{0}\x0F {1}"
  112. defs.append(msg.format(part, definition))
  113. return "; ".join(defs)
  114. def parse_body(self, body):
  115. substitutions = [
  116. ("<!--(.*?)-->", ""),
  117. ("\[\[[^\]|]*?\|([^\]|]*?)\]\]", r"\1"),
  118. ("\{\{(.*?) of\|(.*?)\}\}", r"\1 of \2."),
  119. ("\{\{w\|(.*?)\}\}", r"\1"),
  120. ("\{\{surname(.*?)\}\}", r"A surname."),
  121. ("\{\{given name\|(.*?)(\||\}\})", r"A \1 given name."),
  122. ]
  123. senses = []
  124. for line in body.splitlines():
  125. line = line.strip()
  126. if re.match("#\s*[^:*]", line):
  127. for regex, repl in substitutions:
  128. line = re.sub(regex, repl, line)
  129. line = self.strip_templates(line)
  130. line = line[1:].replace("'''", "").replace("''", "")
  131. line = line.replace("[[", "").replace("]]", "")
  132. if line.strip():
  133. senses.append(line.strip()[0].upper() + line.strip()[1:])
  134. if not senses:
  135. return None
  136. if len(senses) == 1:
  137. return senses[0]
  138. result = [] # Number the senses incrementally
  139. for i, sense in enumerate(senses):
  140. result.append(u"{0}. {1}".format(i + 1, sense))
  141. return " ".join(result)
  142. def strip_templates(self, line):
  143. line = list(line)
  144. stripped = ""
  145. depth = 0
  146. while line:
  147. this = line.pop(0)
  148. if line:
  149. next = line[0]
  150. else:
  151. next = ""
  152. if this == "{" and next == "{":
  153. line.pop(0)
  154. depth += 1
  155. elif this == "}" and next == "}":
  156. line.pop(0)
  157. depth -= 1
  158. elif depth == 0:
  159. stripped += this
  160. return stripped