ben
/
earwigbot
mirror of https://github.com/earwig/earwigbot


			
				
					
						
						
							
							# -*- coding: utf-8  -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re

from earwigbot import exceptions
from earwigbot.commands import Command

class Dictionary(Command):
    """Define words and stuff."""
    name = "dictionary"
    commands = ["dict", "dictionary", "define", "def"]

    def process(self, data):
        if not data.args:
            self.reply(data, "What do you want me to define?")
            return

        term = " ".join(data.args)
        lang = self.bot.wiki.get_site().lang
        try:
            defined = self.define(term, lang)
        except exceptions.APIError:
            msg = "Cannot find a {0}-language Wiktionary."
            self.reply(data, msg.format(lang))
        else:
            self.reply(data, defined.encode("utf8"))

    def define(self, term, lang, tries=2):
        try:
            site = self.bot.wiki.get_site(project="wiktionary", lang=lang)
        except exceptions.SiteNotFoundError:
            site = self.bot.wiki.add_site(project="wiktionary", lang=lang)

        page = site.get_page(term, follow_redirects=True)
        try:
            entry = page.get()
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            if term.lower() != term and tries:
                return self.define(term.lower(), lang, tries - 1)
            if term.capitalize() != term and tries:
                return self.define(term.capitalize(), lang, tries - 1)
            return "No definition found."

        level, languages = self.get_languages(entry)
        if not languages:
            return "Couldn't parse {0}!".format(page.url)

        if "#" in term:  # Requesting a specific language
            lcase_langs = {lang.lower(): lang for lang in languages}
            request = term.rsplit("#", 1)[1]
            lang = lcase_langs.get(request.lower())
            if not lang:
                resp = "Language {0} not found in definition."
                return resp.format(request)
            definition = self.get_definition(languages[lang], level)
            return "({0}) {1}".format(lang, definition)

        result = []
        for lang, section in sorted(languages.items()):
            definition = self.get_definition(section, level)
            result.append("({0}) {1}".format(lang, definition))
        return "; ".join(result)

    def get_languages(self, entry, level=2):
        regex = r"(?:\A|\n)==\s*([a-zA-Z0-9_ ]*?)\s*==(?:\Z|\n)"
        split = re.split(regex, entry)
        if len(split) % 2 == 0:
            if level == 2:
                return self.get_languages(entry, level=3)
            else:
                return 3, None
            return 2, None

        split.pop(0)
        languages = {}
        for i in range(0, len(split), 2):
            languages[split[i]] = split[i + 1]
        return level, languages

    def get_definition(self, section, level):
        parts_of_speech = {
            "v.": "Verb",
            "n.": "Noun",
            "pron.": "Pronoun",
            "adj.": "Adjective",
            "adv.": "Adverb",
            "prep.": "Preposition",
            "conj.": "Conjunction",
            "inter.": "Interjection",
            "symbol": "Symbol",
            "suffix": "Suffix",
            "initialism": "Initialism",
            "phrase": "Phrase",
            "proverb": "Proverb",
            "prop. n.": "Proper noun",
            "abbr.": "Abbreviation",
            "punct.": "Punctuation mark",
        }
        blocks = "=" * (level + 1)
        defs = []
        for part, basename in parts_of_speech.items():
            fullnames = [basename, r"\{\{" + basename + r"\}\}",
                         r"\{\{" + basename.lower() + r"\}\}"]
            for fullname in fullnames:
                regex = blocks + r"\s*" + fullname + r"\s*" + blocks
                if re.search(regex, section):
                    regex = blocks + r"\s*" + fullname
                    regex += r"\s*{0}(.*?)(?:(?:{0})|\Z)".format(blocks)
                    bodies = re.findall(regex, section, re.DOTALL)
                    if bodies:
                        for body in bodies:
                            definition = self.parse_body(body)
                            if definition:
                                msg = "\x02{0}\x0F {1}"
                                defs.append(msg.format(part, definition))

        return "; ".join(defs)

    def parse_body(self, body):
        substitutions = [
            (r"<!--(.*?)-->", ""),
            (r"<ref>(.*?)</ref>", ""),
            (r"\[\[[^\]|]*?\|([^\]|]*?)\]\]", r"\1"),
            (r"\{\{unsupported\|(.*?)\}\}", r"\1"),
            (r"\{\{(.*?) of\|([^}|]*?)(\|(.*?))?\}\}", r"\1 of \2."),
            (r"\{\{w\|(.*?)\}\}", r"\1"),
            (r"\{\{surname(.*?)\}\}", r"A surname."),
            (r"\{\{given name\|([^}|]*?)(\|(.*?))?\}\}", r"A \1 given name."),
        ]

        senses = []
        for line in body.splitlines():
            line = line.strip()
            if re.match(r"#\s*[^:*#]", line):
                for regex, repl in substitutions:
                    line = re.sub(regex, repl, line)
                line = self.strip_templates(line)
                line = line[1:].replace("'''", "").replace("''", "")
                line = line.replace("[[", "").replace("]]", "")
                if line.strip():
                    senses.append(line.strip()[0].upper() + line.strip()[1:])

        if not senses:
            return None
        if len(senses) == 1:
            return senses[0]

        result = []  # Number the senses incrementally
        for i, sense in enumerate(senses):
            result.append("{0}. {1}".format(i + 1, sense))
        return " ".join(result)

    def strip_templates(self, line):
        line = list(line)
        stripped = ""
        depth = 0
        while line:
            this = line.pop(0)
            if line:
                next = line[0]
            else:
                next = ""
            if this == "{" and next == "{":
                line.pop(0)
                depth += 1
            elif this == "}" and next == "}":
                line.pop(0)
                depth -= 1
            elif depth == 0:
                stripped += this
        return stripped