
Finish chunking algorithm, improve !link, other fixes.

tags/v0.1^2
Ben Kurtovic, 12 years ago
parent
commit c260648bdb
5 changed files with 47 additions and 27 deletions
  1. earwigbot/commands/link.py (+5, -9)
  2. earwigbot/wiki/copyvios/__init__.py (+1, -1)
  3. earwigbot/wiki/copyvios/parsers.py (+37, -13)
  4. earwigbot/wiki/site.py (+3, -3)
  5. earwigbot/wiki/sitesdb.py (+1, -1)

earwigbot/commands/link.py (+5, -9)

@@ -30,6 +30,7 @@ class Link(Command):
     name = "link"
 
     def process(self, data):
+        self.site = self.bot.wiki.get_site()
         msg = data.msg
 
         if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
         if not data.args:
             self.reply(data, "what do you want me to link to?")
             return
-        pagename = ' '.join(data.args)
-        link = self.parse_link(pagename)
+        pagename = " ".join(data.args)
+        link = self.site.get_page(pagename).url
         self.reply(data, link)
 
     def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
         if links:
             # re.findall() returns a list of tuples, but we only want the 2nd
             # item in each tuple:
-            links = [i[1] for i in links]
-            results = map(self.parse_link, links)
+            results = [self.site.get_page(name[1]).url for name in links]
 
         # Find all {{templates}}
         templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):
 
         return results
 
-    def parse_link(self, pagename):
-        link = quote(pagename.replace(" ", "_"), safe="/:")
-        return "".join(("http://enwp.org/", link))
-
     def parse_template(self, pagename):
         pagename = "".join(("Template:", pagename))
-        return self.parse_link(pagename)
+        return self.site.get_page(pagename).url
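
As a point of reference, a minimal sketch of the approach the command now takes: page URLs come from the wiki toolset's Page objects rather than a hand-built http://enwp.org/ prefix. It assumes an initialized EarwigBot instance is available as `bot`, and the page title is an arbitrary example; only get_site(), get_page(), and .url are taken from the diff above.

# Sketch only, assuming `bot` is an initialized earwigbot Bot instance.
site = bot.wiki.get_site()           # same default-site lookup the command now does in process()
page = site.get_page("Main Page")    # "Main Page" is an arbitrary example title
print page.url                       # URL reported by the Site, not a hard-coded prefix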

earwigbot/wiki/copyvios/__init__.py (+1, -1)

@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
+        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()



earwigbot/wiki/copyvios/parsers.py (+37, -13)

@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        self.clean = wikicode.strip_code(normalize=True)
         return self.clean
 
-    def chunk(self, max_chunks, nltk_dir):
+    def chunk(self, nltk_dir, max_chunks, max_query=256):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a sentence or two long at most. The idea here is to return a
-        representative sample of the article text rather than the whole, so
-        we'll probably pick and choose from its introduction, body, and
-        conclusion, especially if the article is large and *max_chunks* is low,
-        so we don't end up just searching for the first paragraph.
+        a sentence or two long at most (no more than *max_query*). The idea is
+        to return a sample of the article text rather than the whole, so we'll
+        pick and choose from parts of it, especially if the article is large
+        and *max_chunks* is low, so we don't end up just searching for just the
+        first paragraph.
 
         This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
         directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser):
         """
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
-            tokenizer = nltk.data.load(datafile)
+            tokenizer = nltk.data.load("file:" + datafile)
         except LookupError:
             nltk.download("punkt", nltk_dir)
-            tokenizer = nltk.data.load(datafile)
-
-        sentences = tokenizer.tokenize(self.clean)
-        #if max_chunks >= len(sentences):
-        #    return sentences
+            tokenizer = nltk.data.load("file:" + datafile)
+
+        sentences = []
+        for sentence in tokenizer.tokenize(self.clean):
+            if len(sentence) > max_query:
+                words = sentence.split()
+                while len(" ".join(words)) > max_query:
+                    words.pop()
+                sentence = " ".join(words)
+            sentences.append(sentence)
+
+        if max_chunks >= len(sentences):
+            return sentences
+
+        chunks = []
+        while len(chunks) < max_chunks:
+            if len(chunks) % 5 == 0:
+                chunk = sentences.pop(0) # Pop from beginning
+            elif len(chunks) % 5 == 1:
+                chunk = sentences.pop() # Pop from end
+            elif len(chunks) % 5 == 2:
+                chunk = sentences.pop(len(sentences) / 2) # Pop from Q2
+            elif len(chunks) % 5 == 3:
+                chunk = sentences.pop(len(sentences) / 4) # Pop from Q1
+            else:
+                chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3
+            chunks.append(chunk)
+
+        return chunks
 
 
 class HTMLTextParser(BaseTextParser):
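
To make the finished chunking path concrete, a short usage sketch under stated assumptions: the sample wikitext and the "/tmp/nltk_data" directory are arbitrary choices, and only ArticleTextParser, strip(), and chunk() come from the code above (chunk() downloads the punkt tokenizer into that directory on first use).

# Sketch only: exercising strip() and the reworked chunk() directly.
from earwigbot.wiki.copyvios.parsers import ArticleTextParser

text = u"'''Example''' is a [[test]] article about chunking. " * 40  # arbitrary sample wikitext
parser = ArticleTextParser(text)
clean = parser.strip()                      # plain text via mwparserfromhell's strip_code()
chunks = parser.chunk("/tmp/nltk_data", 5)  # at most 5 chunks; nltk_dir is an arbitrary path
for chunk in chunks:
    print len(chunk), chunk[:60]            # each chunk is capped at max_query characters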


earwigbot/wiki/site.py (+3, -3)

@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
         sqllag = self._sql_info_cache["replag"]
 
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                 self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
         return [self.SERVICE_API, self.SERVICE_SQL]
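
To restate the new rule in isolation, a minimal sketch with plain arguments standing in for the cached lag values; the function name and the string service labels are assumptions of mine, while the 300-second threshold and the apilag/maxlag comparison come from the hunk above.

# Sketch only: the ordering decision made inside the "sqllag > 300" branch above.
def order_when_replag_high(apilag, maxlag, API="api", SQL="sql"):
    """Replica lag is over 300s: put SQL first only if the API is lagging past our own limit."""
    if not maxlag:           # no maxlag configured: keep the default API-first order
        return [API, SQL]
    if apilag > maxlag:      # API lag exceeds the bot's maxlag setting: prefer SQL
        return [SQL, API]
    return [API, SQL]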



earwigbot/wiki/sitesdb.py (+1, -1)

@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
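
For context, a hypothetical sketch of the kind of config.wiki mapping these lookups expect; the key names are taken from the calls above, but every value here, and the dict literal itself, is an illustrative assumption rather than a sample of a real configuration.

# Sketch only: illustrative values for the keys read above.
wiki_config = {
    "useHTTPS": False,   # default used above when the key is absent
    "assert": "user",    # hypothetical value for the API assert parameter
    "maxlag": 10,        # hypothetical maxlag, in seconds
    "waitTime": 3,       # matches the new default wait between queries
    "search": None,      # search settings, site-specific and omitted here
}
wait_between_queries = wiki_config.get("waitTime", 3)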


