
Finish chunking algorithm, improve !link, other fixes.

tags/v0.1^2
Ben Kurtovic 12 years ago
commit c260648bdb
5 changed files with 47 additions and 27 deletions
  1. earwigbot/commands/link.py           +5   -9
  2. earwigbot/wiki/copyvios/__init__.py  +1   -1
  3. earwigbot/wiki/copyvios/parsers.py   +37  -13
  4. earwigbot/wiki/site.py               +3   -3
  5. earwigbot/wiki/sitesdb.py            +1   -1

earwigbot/commands/link.py  (+5, -9)

@@ -30,6 +30,7 @@ class Link(Command):
     name = "link"
 
     def process(self, data):
+        self.site = self.bot.wiki.get_site()
         msg = data.msg
 
         if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
             if not data.args:
                 self.reply(data, "what do you want me to link to?")
                 return
-            pagename = ' '.join(data.args)
-            link = self.parse_link(pagename)
+            pagename = " ".join(data.args)
+            link = self.site.get_page(pagename).url
             self.reply(data, link)
 
     def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
         if links:
             # re.findall() returns a list of tuples, but we only want the 2nd
             # item in each tuple:
-            links = [i[1] for i in links]
-            results = map(self.parse_link, links)
+            results = [self.site.get_page(name[1]).url for name in links]
 
         # Find all {{templates}}
         templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):
 
         return results
 
-    def parse_link(self, pagename):
-        link = quote(pagename.replace(" ", "_"), safe="/:")
-        return "".join(("http://enwp.org/", link))
-
     def parse_template(self, pagename):
         pagename = "".join(("Template:", pagename))
-        return self.parse_link(pagename)
+        return self.site.get_page(pagename).url
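
The net effect in this file: the command stops hand-building enwp.org shortcut URLs (the removed parse_link) and instead asks the wiki layer for each page's URL via self.site.get_page(...).url, with self.site fetched once in process(). A rough before/after sketch in plain Python 2; the standalone function below is illustrative only, not part of the bot:

    from urllib import quote  # Python 2, as in the original command

    def old_style_link(pagename):
        # What the removed parse_link() did: quote the title and prepend
        # the enwp.org URL shortener by hand.
        return "".join(("http://enwp.org/",
                        quote(pagename.replace(" ", "_"), safe="/:")))

    print old_style_link("Foo bar")   # -> http://enwp.org/Foo_bar

    # After this commit the command defers to the wiki layer instead:
    #     self.site = self.bot.wiki.get_site()
    #     link = self.site.get_page(pagename).url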

earwigbot/wiki/copyvios/__init__.py  (+1, -1)

@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
+        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
 
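
This is just a call-site update to match the reordered chunk() signature in parsers.py: the nltk data directory now comes first and the chunk cap second. A minimal usage sketch, where the data path is a placeholder (on first use, chunk() downloads nltk's punkt tokenizer into that directory):

    from earwigbot.wiki.copyvios.parsers import ArticleTextParser

    parser = ArticleTextParser("Some article text here. And a second sentence.")
    parser.strip()                                    # must run before chunk()
    chunks = parser.chunk("/path/to/nltk_data", 10)   # was chunk(10, "/path/to/nltk_data")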




earwigbot/wiki/copyvios/parsers.py  (+37, -13)

@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        self.clean = wikicode.strip_code(normalize=True)
         return self.clean
 
-    def chunk(self, max_chunks, nltk_dir):
+    def chunk(self, nltk_dir, max_chunks, max_query=256):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a sentence or two long at most. The idea here is to return a
-        representative sample of the article text rather than the whole, so
-        we'll probably pick and choose from its introduction, body, and
-        conclusion, especially if the article is large and *max_chunks* is low,
-        so we don't end up just searching for the first paragraph.
+        a sentence or two long at most (no more than *max_query*). The idea is
+        to return a sample of the article text rather than the whole, so we'll
+        pick and choose from parts of it, especially if the article is large
+        and *max_chunks* is low, so we don't end up just searching for the
+        first paragraph.
 
         This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
         directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser):
         """
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
-            tokenizer = nltk.data.load(datafile)
+            tokenizer = nltk.data.load("file:" + datafile)
         except LookupError:
             nltk.download("punkt", nltk_dir)
-            tokenizer = nltk.data.load(datafile)
-
-        sentences = tokenizer.tokenize(self.clean)
-        #if max_chunks >= len(sentences):
-        #    return sentences
+            tokenizer = nltk.data.load("file:" + datafile)
+
+        sentences = []
+        for sentence in tokenizer.tokenize(self.clean):
+            if len(sentence) > max_query:
+                words = sentence.split()
+                while len(" ".join(words)) > max_query:
+                    words.pop()
+                sentence = " ".join(words)
+            sentences.append(sentence)
+
+        if max_chunks >= len(sentences):
+            return sentences
+
+        chunks = []
+        while len(chunks) < max_chunks:
+            if len(chunks) % 5 == 0:
+                chunk = sentences.pop(0)  # Pop from beginning
+            elif len(chunks) % 5 == 1:
+                chunk = sentences.pop()  # Pop from end
+            elif len(chunks) % 5 == 2:
+                chunk = sentences.pop(len(sentences) / 2)  # Pop from Q2
+            elif len(chunks) % 5 == 3:
+                chunk = sentences.pop(len(sentences) / 4)  # Pop from Q1
+            else:
+                chunk = sentences.pop(3 * len(sentences) / 4)  # Pop from Q3
+            chunks.append(chunk)
+
+        return chunks
 
 
 class HTMLTextParser(BaseTextParser):
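
The selection loop added above can be exercised on its own: after each sentence is trimmed to max_query characters, the method returns everything if the article is short enough, and otherwise samples sentences round-robin from the start, end, midpoint, and quartiles of what remains, so the chunks span the whole article rather than only its opening. A standalone sketch of that sampling step (plain Python 2, as in the bot, so / is integer division; no nltk needed once the sentences are split):

    def pick_chunks(sentences, max_chunks):
        # Mirrors the loop added in chunk(): rotate through the beginning,
        # end, Q2, Q1 and Q3 of the remaining sentences.
        sentences = list(sentences)        # don't mutate the caller's list
        if max_chunks >= len(sentences):
            return sentences
        chunks = []
        while len(chunks) < max_chunks:
            turn = len(chunks) % 5
            if turn == 0:
                chunk = sentences.pop(0)                       # beginning
            elif turn == 1:
                chunk = sentences.pop()                        # end
            elif turn == 2:
                chunk = sentences.pop(len(sentences) / 2)      # Q2
            elif turn == 3:
                chunk = sentences.pop(len(sentences) / 4)      # Q1
            else:
                chunk = sentences.pop(3 * len(sentences) / 4)  # Q3
            chunks.append(chunk)
        return chunks

    print pick_chunks(["s%d" % n for n in range(10)], 5)
    # -> ['s0', 's9', 's5', 's2', 's7']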


earwigbot/wiki/site.py  (+3, -3)

@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
         sqllag = self._sql_info_cache["replag"]
 
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                 self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
             return [self.SERVICE_API, self.SERVICE_SQL]
 
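
These thresholds loosen when the bot gives up on the SQL replica: replication lag now has to exceed 300 seconds (was 180) before the API is even considered, the cached API maxlag is refreshed at most every 300 seconds (was 120), and the old sqllag / (180.0 / maxlag) comparison is replaced by a plain apilag > maxlag check. A condensed, hypothetical restatement of the decision in that branch; the replag <= 300 case falls outside this hunk and is assumed here to keep SQL first:

    def service_order(replag, apilag, maxlag):
        # Hypothetical stand-in for the service-ordering logic above; replag,
        # apilag and maxlag replace the cached values used on Site.
        SQL, API = "sql", "api"
        if replag <= 300:
            return [SQL, API]   # assumed: replication is fresh enough, keep SQL first
        if not maxlag:
            return [API, SQL]   # no maxlag configured: fall back to the API
        if apilag > maxlag:
            return [SQL, API]   # the API is lagging too; SQL remains the better bet
        return [API, SQL]

    print service_order(replag=600, apilag=2, maxlag=10)   # -> ['api', 'sql']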




earwigbot/wiki/sitesdb.py  (+1, -1)

@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 
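
For context, these lookups read the wiki section of the bot's configuration; with this commit, leaving waitTime unset means a 3-second pause between queries instead of 5. A hypothetical minimal mapping showing the keys involved (the real config.wiki comes from the bot's config file, and all values below are placeholders):

    config_wiki = {
        "useHTTPS": True,
        "assert": "user",
        "maxlag": 10,
        # "waitTime": 3,  # optional; the default is now 3 seconds
        "search": {"nltk_dir": "/path/to/nltk_data"},
    }
    wait_between_queries = config_wiki.get("waitTime", 3)   # -> 3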



