Merge branch 'feature/dictionary' into develop

12 years ago · f1e0a6f4de
--- a/earwigbot/commands/afc_report.py
+++ b/earwigbot/commands/afc_report.py
@@ -74,12 +74,13 @@ class AFCReport(Command):
            return page

    def report(self, page):
        url = page.url.replace("en.wikipedia.org/wiki", "enwp.org")
        url = page.url.encode("utf8")
        url = url.replace("en.wikipedia.org/wiki", "enwp.org")
        short = self.statistics.get_short_title(page.title)
        status = self.get_status(page)
        user = page.get_creator()
        user_name = user.name
        user_url = user.get_talkpage().url
        user_url = user.get_talkpage().url.encode("utf8")

        msg1 = "AfC submission report for \x0302{0}\x0F ({1}):"
        msg2 = "Status: \x0303{0}\x0F"
--- a/earwigbot/commands/afc_submissions.py
+++ b/earwigbot/commands/afc_submissions.py
@@ -54,6 +54,6 @@ class AFCSubmissions(Command):
        site = self.bot.wiki.get_site()
        category = site.get_category("Pending AfC submissions")
        members = category.get_members(limit=number + len(self.ignore_list))
        urls = [member.url for member in members if member.title not in self.ignore_list]
        urls = [member.url.encode("utf8") for member in members if member.title not in self.ignore_list]
        pages = ", ".join(urls[:number])
        self.reply(data, "{0} pending AfC subs: {1}".format(number, pages))
--- a/earwigbot/commands/dictionary.py
+++ b/earwigbot/commands/dictionary.py
@@ -0,0 +1,149 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 import re

 from earwigbot import exceptions
 from earwigbot.commands import Command

 class Dictionary(Command):
    """Define words and stuff."""
    name = "dictionary"
    commands = ["dict", "dictionary", "define"]

    def process(self, data):
        if not data.args:
            self.reply(data, "what do you want me to define?")
            return

        term = " ".join(data.args)
        lang = self.bot.wiki.get_site().lang
        try:
            defined = self.define(term, lang)
        except exceptions.APIError:
            msg = "cannot find a {0}-language Wiktionary."
            self.reply(data, msg.format(lang))
        else:
            self.reply(data, defined.encode("utf8"))

    def define(self, term, lang):
        try:
            site = self.bot.wiki.get_site(project="wiktionary", lang=lang)
        except exceptions.SiteNotFoundError:
            site = self.bot.wiki.add_site(project="wiktionary", lang=lang)

        page = site.get_page(term)
        try:
            entry = page.get()
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return "no definition found."

        languages = self.get_languages(entry)
        if not languages:
            return u"couldn't parse {0}!".format(page.url)

        result = []
        for lang, section in sorted(languages.items()):
            this = u"({0}) {1}".format(lang, self.get_definition(section))
            result.append(this)
        return u"; ".join(result)

    def get_languages(self, entry):
        """Split a Wiktionary entry into its level-two language sections.

        *entry* is the wikitext of the page. Each line of the form
        ``==Name==`` starts a new section; any text before the first such
        heading is discarded.

        Return a dict mapping each heading name to the text that follows it,
        up to the next level-two heading. The dict is empty when the entry has
        no level-two headings, which callers treat as a parse failure.
        """
        # The newline that ends a heading line is only looked at, not
        # consumed, so it can still act as the leading newline of a heading
        # on the very next line. Consuming it would merge back-to-back
        # headings into a single section.
        regex = r"(?:\A|\n)==\s*([a-zA-Z0-9_ ]*?)\s*==(?=\Z|\n)"
        split = re.split(regex, entry)

        # With one capture group, re.split() always returns
        # [preamble, name1, body1, name2, body2, ...].
        languages = {}
        for name, body in zip(split[1::2], split[2::2]):
            if body.startswith("\n"):
                body = body[1:]  # Drop the newline that ended the heading line
            languages[name] = body
        return languages

    def get_definition(self, section):
        parts_of_speech = {
            "v.": "Verb",
            "n.": "Noun",
            "pron.": "Pronoun",
            "adj.": "Adjective",
            "adv.": "Adverb",
            "prep.": "Preposition",
            "conj.": "Conjunction",
            "inter.": "Interjection",
            "symbol": "Symbol",
            "suffix": "Suffix",
            "initialism": "Initialism",
            "phrase": "Phrase",
            "proverb": "Proverb",
        }
        defs = []
        for part, fullname in parts_of_speech.iteritems():
            if re.search("===\s*" + fullname + "\s*===", section):
                regex = "===\s*" + fullname + "\s*===(.*?)(?:(?:===)|\Z)"
                body = re.findall(regex, section, re.DOTALL)
                if body:
                    definition = self.parse_body(body[0])
                    if definition:
                        defs.append("\x02{0}\x0F {1}".format(part, definition))

        return "; ".join(defs)

    def parse_body(self, body):
        senses = []
        for line in body.splitlines():
            line = line.strip()
            if re.match("#\s*[^:*]", line):
                line = re.sub("\[\[(.*?)\|(.*?)\]\]", r"\2", line)
                line = self.strip_templates(line)
                line = line[1:].replace("'''", "").replace("''", "")
                line = line.replace("[[", "").replace("]]", "")
                senses.append(line.strip())

        if not senses:
            return None
        if len(senses) == 1:
            return senses[0]

        result = []  # Number the senses incrementally
        for i, sense in enumerate(senses):
            result.append(u"{0}. {1}".format(i + 1, sense))
        return " ".join(result)

    def strip_templates(self, line):
        """Return *line* with all ``{{...}}`` templates removed.

        Nested templates are handled by tracking brace depth: ``{{`` opens a
        level, ``}}`` closes one, and characters are kept only at depth zero.
        Single braces are ordinary characters. An unmatched ``}}`` is dropped
        but does not push the depth below zero, so the text after it is kept.
        """
        kept = []
        depth = 0
        pos = 0
        length = len(line)
        while pos < length:
            pair = line[pos:pos + 2]
            if pair == "{{":
                depth += 1
                pos += 2
            elif pair == "}}":
                # Clamp at zero so a stray closer can't hide the rest of the
                # line behind a negative depth.
                depth = max(depth - 1, 0)
                pos += 2
            else:
                if depth == 0:
                    kept.append(line[pos])
                pos += 1
        return "".join(kept)
--- a/earwigbot/commands/link.py
+++ b/earwigbot/commands/link.py
@@ -35,15 +35,15 @@ class Link(Command):

        if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
            links = self.parse_line(msg)
            links = " , ".join(links)
            self.reply(data, links)
            links = u" , ".join(links)
            self.reply(data, links.encode("utf8"))

        elif data.command == "link":
            if not data.args:
                self.reply(data, "what do you want me to link to?")
                return
            pagename = " ".join(data.args)
            link = self.site.get_page(pagename).url
            link = self.site.get_page(pagename).url.encode("utf8")
            self.reply(data, link)

    def parse_line(self, line):
@@ -68,5 +68,4 @@ class Link(Command):
        return results

    def parse_template(self, pagename):
        pagename = "".join(("Template:", pagename))
        return self.site.get_page(pagename).url
        return self.site.get_page("Template:" + pagename).url
--- a/earwigbot/config.py
+++ b/earwigbot/config.py
@@ -274,7 +274,8 @@ class BotConfig(object):
                key = getpass("Enter key to decrypt bot passwords: ")
                self._decryption_cipher = Blowfish.new(sha256(key).digest())
            signature = self.metadata["signature"]
            assert bcrypt.hashpw(key, signature) == signature
            if bcrypt.hashpw(key, signature) != signature:
                raise RuntimeError("Incorrect password.")
            for node, nodes in self._decryptable_nodes:
                self._decrypt(node, nodes)

--- a/earwigbot/managers.py
+++ b/earwigbot/managers.py
@@ -78,7 +78,7 @@ class _ResourceManager(object):
        try:
            resource = klass(self.bot)  # Create instance of resource
        except Exception:
            e = "Error instantiating {0} class in {1} (from {2})"
            e = "Error instantiating {0} class in '{1}' (from {2})"
            self.logger.exception(e.format(res_type, name, path))
        else:
            self._resources[resource.name] = resource
@@ -98,7 +98,7 @@ class _ResourceManager(object):
        try:
            module = imp.load_module(name, f, path, desc)
        except Exception:
            e = "Couldn't load module {0} (from {1})"
            e = "Couldn't load module '{0}' (from {1})"
            self.logger.exception(e.format(name, path))
            return
        finally:
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -269,7 +269,8 @@ class AFCStatistics(Task):
        tracked = [i[0] for i in cursor.fetchall()]

        category = self.site.get_category(self.pending_cat)
        for title, pageid in category.get_members():
        for page in category.get_members():
            title, pageid = page.title, page.pageid
            if title in self.ignore_list:
                continue
            if pageid not in tracked:
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -513,9 +513,9 @@ class Page(CopyvioMixIn):
            return self._fullurl
        else:
            encoded = self._title.encode("utf8").replace(" ", "_")
            slug = quote(encoded, safe="/:")
            slug = quote(encoded, safe="/:").decode("utf8")
            path = self.site._article_path.replace("$1", slug)
            return ''.join((self.site.url, path))
            return u"".join((self.site.url, path))

    @property
    def namespace(self):
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -131,13 +131,19 @@ class Site(object):
        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

        # Attributes used for SQL queries:
        self._sql_data = sql
        if sql:
            self._sql_data = sql
        else:
            self._sql_data = {}
        self._sql_conn = None
        self._sql_lock = Lock()
        self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None}

        # Attribute used in copyright violation checks (see CopyrightMixIn):
        self._search_config = search_config
        if search_config:
            self._search_config = search_config
        else:
            self._search_config = {}

        # Set up cookiejar and URL opener for making API queries:
        if cookiejar:
@@ -150,9 +156,6 @@ class Site(object):
        self._opener.addheaders = [("User-Agent", user_agent),
                                   ("Accept-Encoding", "gzip")]

        # Get all of the above attributes that were not specified as arguments:
        self._load_attributes()

        # Set up our internal logger:
        if logger:
            self._logger = logger
@@ -160,6 +163,9 @@ class Site(object):
            self._logger = getLogger("earwigbot.wiki")
            self._logger.addHandler(NullHandler())

        # Get all of the above attributes that were not specified as arguments:
        self._load_attributes()

        # If we have a name/pass and the API says we're not logged in, log in:
        self._login_info = name, password = login
        if name and password:
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -278,6 +278,7 @@ class SitesDB(object):
            else:
                conn.execute("DELETE FROM sql_data WHERE sql_site = ?", (name,))
                conn.execute("DELETE FROM namespaces WHERE ns_site = ?", (name,))
                self._logger.info("Removed site '{0}'".format(name))
                return True

    def get_site(self, name=None, project=None, lang=None):
@@ -376,34 +377,20 @@ class SitesDB(object):
        assert_edit = config.wiki.get("assert")
        maxlag = config.wiki.get("maxlag")
        wait_between_queries = config.wiki.get("waitTime", 3)
        logger = self._logger.getChild(name)
        search_config = config.wiki.get("search")

        if user_agent:
            user_agent = user_agent.replace("$1", __version__)
            user_agent = user_agent.replace("$2", python_version())

        if search_config:
            nltk_dir = path.join(self.config.root_dir, ".nltk")
            search_config["nltk_dir"] = nltk_dir
            search_config["exclusions_db"] = self._exclusions_db

        if not sql:
            sql = config.wiki.get("sql", {})
            for key, value in sql.iteritems():
                if "$1" in value:
                    sql[key] = value.replace("$1", name)

        # Create a Site object to log in and load the other attributes:
        site = Site(base_url=base_url, script_path=script_path, sql=sql,
                    login=login, cookiejar=cookiejar, user_agent=user_agent,
                    use_https=use_https, assert_edit=assert_edit,
                    maxlag=maxlag, wait_between_queries=wait_between_queries,
                    logger=logger, search_config=search_config)
                    maxlag=maxlag, wait_between_queries=wait_between_queries)

        self._logger.info("Added site '{0}'".format(site.name))
        self._add_site_to_sitesdb(site)
        self._sites[site.name] = site
        return site
        return self._get_site_object(site.name)

    def remove_site(self, name=None, project=None, lang=None):
        """Remove a site from the sitesdb.