From c260648bdb2a45a9c0a76f6e4df53889f28f270c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 7 Jul 2012 21:40:54 -0400
Subject: [PATCH] Finish chunking algorithm, improve !link, other fixes.

---
 earwigbot/commands/link.py          | 14 ++++-------
 earwigbot/wiki/copyvios/__init__.py |  2 +-
 earwigbot/wiki/copyvios/parsers.py  | 50 +++++++++++++++++++++++++++----------
 earwigbot/wiki/site.py              |  6 +++---
 earwigbot/wiki/sitesdb.py           |  2 +-
 5 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/earwigbot/commands/link.py b/earwigbot/commands/link.py
index 0b54554..ebe3669 100644
--- a/earwigbot/commands/link.py
+++ b/earwigbot/commands/link.py
@@ -30,6 +30,7 @@ class Link(Command):
     name = "link"
 
     def process(self, data):
+        self.site = self.bot.wiki.get_site()
         msg = data.msg
 
         if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
         if not data.args:
             self.reply(data, "what do you want me to link to?")
             return
-        pagename = ' '.join(data.args)
-        link = self.parse_link(pagename)
+        pagename = " ".join(data.args)
+        link = self.site.get_page(pagename).url
         self.reply(data, link)
 
     def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
         if links:
             # re.findall() returns a list of tuples, but we only want the 2nd
             # item in each tuple:
-            links = [i[1] for i in links]
-            results = map(self.parse_link, links)
+            results = [self.site.get_page(name[1]).url for name in links]
 
         # Find all {{templates}}
         templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):
 
         return results
 
-    def parse_link(self, pagename):
-        link = quote(pagename.replace(" ", "_"), safe="/:")
-        return "".join(("http://enwp.org/", link))
-
     def parse_template(self, pagename):
         pagename = "".join(("Template:", pagename))
-        return self.parse_link(pagename)
+        return self.site.get_page(pagename).url
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 5fb7bf2..cf2ddde 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
+        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
 
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index a00369d..b258730 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        self.clean = wikicode.strip_code(normalize=True)
         return self.clean
 
-    def chunk(self, max_chunks, nltk_dir):
+    def chunk(self, nltk_dir, max_chunks, max_query=256):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a sentence or two long at most. The idea here is to return a
-        representative sample of the article text rather than the whole, so
-        we'll probably pick and choose from its introduction, body, and
-        conclusion, especially if the article is large and *max_chunks* is low,
-        so we don't end up just searching for the first paragraph.
+        a sentence or two long at most (no more than *max_query*). The idea is
+        to return a sample of the article text rather than the whole, so we'll
+        pick and choose from parts of it, especially if the article is large
+        and *max_chunks* is low, so we don't end up just searching for the
+        first paragraph.
 
         This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
         directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@
         """
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
-            tokenizer = nltk.data.load(datafile)
+            tokenizer = nltk.data.load("file:" + datafile)
         except LookupError:
             nltk.download("punkt", nltk_dir)
-            tokenizer = nltk.data.load(datafile)
-
-        sentences = tokenizer.tokenize(self.clean)
-        #if max_chunks >= len(sentences):
-        #    return sentences
+            tokenizer = nltk.data.load("file:" + datafile)
+
+        sentences = []
+        for sentence in tokenizer.tokenize(self.clean):
+            if len(sentence) > max_query:
+                words = sentence.split()
+                while len(" ".join(words)) > max_query:
+                    words.pop()
+                sentence = " ".join(words)
+            sentences.append(sentence)
+
+        if max_chunks >= len(sentences):
+            return sentences
+
+        chunks = []
+        while len(chunks) < max_chunks:
+            if len(chunks) % 5 == 0:
+                chunk = sentences.pop(0)  # Pop from beginning
+            elif len(chunks) % 5 == 1:
+                chunk = sentences.pop()  # Pop from end
+            elif len(chunks) % 5 == 2:
+                chunk = sentences.pop(len(sentences) / 2)  # Pop from Q2
+            elif len(chunks) % 5 == 3:
+                chunk = sentences.pop(len(sentences) / 4)  # Pop from Q1
+            else:
+                chunk = sentences.pop(3 * len(sentences) / 4)  # Pop from Q3
+            chunks.append(chunk)
+
+        return chunks
 
 
 class HTMLTextParser(BaseTextParser):
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index f627a02..8261703 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
 
         sqllag = self._sql_info_cache["replag"]
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                     self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
 
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
         return [self.SERVICE_API, self.SERVICE_SQL]
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 5af7e3a..fd3c521 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
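
A note on the selection order in the new chunk(), since it is the heart of
this patch: chunks are popped in a five-way cycle from the beginning, end,
middle, and two quartiles of the sentence list, so even a small *max_chunks*
samples the whole article rather than its opening. The standalone sketch
below is not part of the patch; pick_chunks is a hypothetical stand-in for
that loop, using floor division (//) where the Python 2 code above uses /:

    # Sketch of the selection loop in ArticleTextParser.chunk(); hypothetical
    # helper, not earwigbot code. // matches Python 2's integer / here.
    def pick_chunks(sentences, max_chunks):
        if max_chunks >= len(sentences):
            return sentences
        chunks = []
        while len(chunks) < max_chunks:
            turn = len(chunks) % 5
            if turn == 0:
                chunk = sentences.pop(0)                        # beginning
            elif turn == 1:
                chunk = sentences.pop()                         # end
            elif turn == 2:
                chunk = sentences.pop(len(sentences) // 2)      # Q2 (middle)
            elif turn == 3:
                chunk = sentences.pop(len(sentences) // 4)      # Q1
            else:
                chunk = sentences.pop(3 * len(sentences) // 4)  # Q3
            chunks.append(chunk)
        return chunks

    print(pick_chunks([str(n) for n in range(10)], 5))
    # ['0', '9', '5', '2', '7'] -- one pick each from the beginning, end,
    # middle, and both quartiles, instead of the first five sentences in a row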