From c260648bdb2a45a9c0a76f6e4df53889f28f270c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 7 Jul 2012 21:40:54 -0400
Subject: [PATCH] Finish chunking algorithm, improve !link, other fixes.

---
 earwigbot/commands/link.py          | 14 ++++-------
 earwigbot/wiki/copyvios/__init__.py |  2 +-
 earwigbot/wiki/copyvios/parsers.py  | 50 +++++++++++++++++++++++++++----------
 earwigbot/wiki/site.py              |  6 +++---
 earwigbot/wiki/sitesdb.py           |  2 +-
 5 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/earwigbot/commands/link.py b/earwigbot/commands/link.py
index 0b54554..ebe3669 100644
--- a/earwigbot/commands/link.py
+++ b/earwigbot/commands/link.py
@@ -30,6 +30,7 @@ class Link(Command):
     name = "link"
 
     def process(self, data):
+        self.site = self.bot.wiki.get_site()
         msg = data.msg
 
         if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
         if not data.args:
             self.reply(data, "what do you want me to link to?")
             return
-        pagename = ' '.join(data.args)
-        link = self.parse_link(pagename)
+        pagename = " ".join(data.args)
+        link = self.site.get_page(pagename).url
         self.reply(data, link)
 
     def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
         if links:
             # re.findall() returns a list of tuples, but we only want the 2nd
             # item in each tuple:
-            links = [i[1] for i in links]
-            results = map(self.parse_link, links)
+            results = [self.site.get_page(name[1]).url for name in links]
 
         # Find all {{templates}}
         templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):
 
         return results
 
-    def parse_link(self, pagename):
-        link = quote(pagename.replace(" ", "_"), safe="/:")
-        return "".join(("http://enwp.org/", link))
-
     def parse_template(self, pagename):
         pagename = "".join(("Template:", pagename))
-        return self.parse_link(pagename)
+        return self.site.get_page(pagename).url
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 5fb7bf2..cf2ddde 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
+        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
 
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index a00369d..b258730 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        self.clean = wikicode.strip_code(normalize=True)
         return self.clean
 
-    def chunk(self, max_chunks, nltk_dir):
+    def chunk(self, nltk_dir, max_chunks, max_query=256):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a sentence or two long at most. The idea here is to return a
-        representative sample of the article text rather than the whole, so
-        we'll probably pick and choose from its introduction, body, and
-        conclusion, especially if the article is large and *max_chunks* is low,
-        so we don't end up just searching for the first paragraph.
+        a sentence or two long at most (no more than *max_query*). The idea is
+        to return a sample of the article text rather than the whole, so we'll
+        pick and choose from parts of it, especially if the article is large
+        and *max_chunks* is low, so we don't end up just searching for the
+        first paragraph.
 
         This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
         directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@
         """
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
-            tokenizer = nltk.data.load(datafile)
+            tokenizer = nltk.data.load("file:" + datafile)
         except LookupError:
             nltk.download("punkt", nltk_dir)
-            tokenizer = nltk.data.load(datafile)
-
-        sentences = tokenizer.tokenize(self.clean)
-        #if max_chunks >= len(sentences):
-        #    return sentences
+            tokenizer = nltk.data.load("file:" + datafile)
+
+        sentences = []
+        for sentence in tokenizer.tokenize(self.clean):
+            if len(sentence) > max_query:
+                words = sentence.split()
+                while len(" ".join(words)) > max_query:
+                    words.pop()
+                sentence = " ".join(words)
+            sentences.append(sentence)
+
+        if max_chunks >= len(sentences):
+            return sentences
+
+        chunks = []
+        while len(chunks) < max_chunks:
+            if len(chunks) % 5 == 0:
+                chunk = sentences.pop(0)  # Pop from beginning
+            elif len(chunks) % 5 == 1:
+                chunk = sentences.pop()  # Pop from end
+            elif len(chunks) % 5 == 2:
+                chunk = sentences.pop(len(sentences) / 2)  # Pop from Q2
+            elif len(chunks) % 5 == 3:
+                chunk = sentences.pop(len(sentences) / 4)  # Pop from Q1
+            else:
+                chunk = sentences.pop(3 * len(sentences) / 4)  # Pop from Q3
+            chunks.append(chunk)
+
+        return chunks
 
 
 class HTMLTextParser(BaseTextParser):
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index f627a02..8261703 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
 
         sqllag = self._sql_info_cache["replag"]
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                     self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
 
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
         return [self.SERVICE_API, self.SERVICE_SQL]
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 5af7e3a..fd3c521 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
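
A note on the selection order in the new chunk(), since it is the heart of
this patch: chunks are popped in a five-way cycle from the beginning, end,
middle, and two quartiles of the sentence list, so even a small *max_chunks*
samples the whole article rather than its opening. The standalone sketch
below is not part of the patch; pick_chunks is a hypothetical stand-in for
that loop, using floor division (//) where the Python 2 code above uses /:

    # Sketch of the selection loop in ArticleTextParser.chunk(); hypothetical
    # helper, not earwigbot code. // matches Python 2's integer / here.
    def pick_chunks(sentences, max_chunks):
        if max_chunks >= len(sentences):
            return sentences
        chunks = []
        while len(chunks) < max_chunks:
            turn = len(chunks) % 5
            if turn == 0:
                chunk = sentences.pop(0)                        # beginning
            elif turn == 1:
                chunk = sentences.pop()                         # end
            elif turn == 2:
                chunk = sentences.pop(len(sentences) // 2)      # Q2 (middle)
            elif turn == 3:
                chunk = sentences.pop(len(sentences) // 4)      # Q1
            else:
                chunk = sentences.pop(3 * len(sentences) // 4)  # Q3
            chunks.append(chunk)
        return chunks

    print(pick_chunks([str(n) for n in range(10)], 5))
    # ['0', '9', '5', '2', '7'] -- one pick each from the beginning, end,
    # middle, and both quartiles, instead of the first five sentences in a row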