@@ -30,6 +30,7 @@ class Link(Command):
    name = "link"
    def process(self, data):
        self.site = self.bot.wiki.get_site()
        msg = data.msg
        if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
            if not data.args:
                self.reply(data, "what do you want me to link to?")
                return
            pagename = ' '.join(data.args)
            link = self.parse_link(pagename)
            pagename = " ".join(data.args)
            link = self.site.get_page(pagename).url
            self.reply(data, link)
    def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
        if links:
            # re.findall() returns a list of tuples, but we only want the 2nd
            # item in each tuple:
            links = [i[1] for i in links]
            results = map(self.parse_link, links)
            results = [self.site.get_page(name[1]).url for name in links]
        # Find all {{templates}}
        templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
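Because the rewritten line still relies on re.findall() returning one tuple of groups per match, here is a small standalone illustration of why element [1] is the interesting one (the sample line is made up):

    import re

    # With multiple groups, findall() returns a tuple of all groups per match;
    # group [1] is the name captured before the first "|" or the closing braces.
    line = "needs {{citation needed|date=May 2012}} and links to [[Foo]]"
    templates = re.findall(r"(\{\{(.*?)(\||\}\}))", line)
    print(templates)        # [('{{citation needed|', 'citation needed', '|')]
    print(templates[0][1])  # 'citation needed'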
@@ -67,10 +67,6 @@ class Link(Command):
        return results
    def parse_link(self, pagename):
        link = quote(pagename.replace(" ", "_"), safe="/:")
        return "".join(("http://enwp.org/", link))
    def parse_template(self, pagename):
        pagename = "".join(("Template:", pagename))
        return self.parse_link(pagename)
        return self.site.get_page(pagename).url
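The removed parse_link helper built http://enwp.org/ shortlinks by hand, while the new code asks the page object returned by site.get_page() for its URL; that is why parse_link disappears and parse_template shrinks to a one-liner. A minimal standalone sketch of the old behaviour for comparison (build_shortlink is an illustrative name; the quote import is assumed to be Python 2's urllib):

    # Sketch of the removed helper: replace spaces with underscores,
    # percent-encode everything except "/" and ":", and prepend the host.
    from urllib import quote  # Python 2; use urllib.parse.quote on Python 3

    def build_shortlink(pagename):
        return "http://enwp.org/" + quote(pagename.replace(" ", "_"), safe="/:")

    print(build_shortlink("Albert Einstein"))
    # http://enwp.org/Albert_Einstein

The new get_page(pagename).url presumably yields the site's own canonical page URL rather than the enwp.org shortcut, so the command no longer needs to know how to encode titles itself.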
@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
        best_chains = (empty, MarkovChainIntersection(empty, empty))
        parser = ArticleTextParser(self.get())
        clean = parser.strip()
        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
        article_chain = MarkovChain(clean)
        last_query = time()
@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
        The actual stripping is handled by :py:mod:`mwparserfromhell`.
        """
        wikicode = mwparserfromhell.parse(self.text)
        self.clean = u" ".join(wikicode.normalize().ifilter_text())
        self.clean = wikicode.strip_code(normalize=True)
        return self.clean
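The replacement leans on mwparserfromhell's strip_code(), which removes templates and markup, keeps the display text of links, and with normalize=True resolves HTML entities. A quick illustration with invented sample text (exact output may differ slightly between library versions):

    import mwparserfromhell

    text = "Foo is a [[bar|kind of bar]].{{citation needed}} &amp; more."
    wikicode = mwparserfromhell.parse(text)
    print(wikicode.strip_code(normalize=True))
    # Expected output, roughly: "Foo is a kind of bar. & more."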
    def chunk(self, max_chunks, nltk_dir):
    def chunk(self, nltk_dir, max_chunks, max_query=256):
        """Convert the clean article text into a list of web-searchable chunks.
        No greater than *max_chunks* will be returned. Each chunk will only be
        a sentence or two long at most. The idea here is to return a
        representative sample of the article text rather than the whole, so
        we'll probably pick and choose from its introduction, body, and
        conclusion, especially if the article is large and *max_chunks* is low,
        so we don't end up just searching for the first paragraph.
        a sentence or two long at most (no more than *max_query* characters).
        The idea is to return a representative sample of the article text
        rather than the whole, so we'll pick and choose from different parts
        of it, especially if the article is large and *max_chunks* is low, so
        we don't end up searching for just the first paragraph.
        This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
        directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser):
""" | |||
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") | |||
try: | |||
tokenizer = nltk.data.load(datafile) | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
except LookupError: | |||
nltk.download("punkt", nltk_dir) | |||
tokenizer = nltk.data.load(datafile) | |||
sentences = tokenizer.tokenize(self.clean) | |||
#if max_chunks >= len(sentences): | |||
# return sentences | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
sentences = [] | |||
for sentence in tokenizer.tokenize(self.clean): | |||
if len(sentence) > max_query: | |||
words = sentence.split() | |||
while len(" ".join(words)) > max_query: | |||
words.pop() | |||
sentence = " ".join(words) | |||
sentences.append(sentence) | |||
if max_chunks >= len(sentences): | |||
return sentences | |||
chunks = [] | |||
while len(chunks) < max_chunks: | |||
if len(chunks) % 5 == 0: | |||
chunk = sentences.pop(0) # Pop from beginning | |||
elif len(chunks) % 5 == 1: | |||
chunk = sentences.pop() # Pop from end | |||
elif len(chunks) % 5 == 2: | |||
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 | |||
elif len(chunks) % 5 == 3: | |||
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 | |||
else: | |||
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 | |||
chunks.append(chunk) | |||
return chunks | |||
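To make the sampling pattern easier to follow, here is a standalone stand-in for the loop above (sample and turn are illustrative names; floor division is written as // so it behaves the same on Python 2 and 3, and, like the real method, it assumes max_chunks is no larger than the number of sentences):

    # Cycles through five positions of the remaining list: beginning, end,
    # middle, first quartile, third quartile.
    def sample(sentences, max_chunks):
        sentences = list(sentences)
        chunks = []
        while len(chunks) < max_chunks:
            turn = len(chunks) % 5
            if turn == 0:
                chunk = sentences.pop(0)                        # beginning
            elif turn == 1:
                chunk = sentences.pop()                         # end
            elif turn == 2:
                chunk = sentences.pop(len(sentences) // 2)      # middle (Q2)
            elif turn == 3:
                chunk = sentences.pop(len(sentences) // 4)      # Q1
            else:
                chunk = sentences.pop(3 * len(sentences) // 4)  # Q3
            chunks.append(chunk)
        return chunks

    print(sample(["s%d" % n for n in range(20)], 5))
    # ['s0', 's19', 's10', 's5', 's15']

Run on the real sentences of a stripped article, this spreads the search chunks across the introduction, body, and conclusion rather than taking only the opening paragraph.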
class HTMLTextParser(BaseTextParser):
@@ -560,10 +560,10 @@ class Site(object):
            return [self.SERVICE_API]
        sqllag = self._sql_info_cache["replag"]
        if sqllag > 180:
        if sqllag > 300:
            if not self._maxlag:
                return [self.SERVICE_API, self.SERVICE_SQL]
            if now - self._api_info_cache["lastcheck"] > 120:
            if now - self._api_info_cache["lastcheck"] > 300:
                self._api_info_cache["lastcheck"] = now
                try:
                    self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                    self._api_info_cache["maxlag"] = apilag = 0
            else:
                apilag = self._api_info_cache["maxlag"]
            if sqllag / (180.0 / self._maxlag) < apilag:
            if apilag > self._maxlag:
                return [self.SERVICE_SQL, self.SERVICE_API]
        return [self.SERVICE_API, self.SERVICE_SQL]
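The net effect of this hunk is a simpler fallback rule: once the SQL replica is more than 300 seconds behind, the API only loses its preferred spot when its own reported lag exceeds the configured maxlag. A hedged sketch of just the branch shown above, with the cache handling dropped and the service constants replaced by stand-ins:

    SERVICE_API, SERVICE_SQL = 1, 2  # stand-ins for the Site class attributes

    def order_when_sql_lagged(apilag, maxlag):
        # Branch shown above: SQL replication lag already exceeds 300 seconds.
        if not maxlag:
            # No maxlag configured for this site: keep the API first anyway.
            return [SERVICE_API, SERVICE_SQL]
        if apilag > maxlag:
            # The API is itself lagged past the threshold, so try SQL first.
            return [SERVICE_SQL, SERVICE_API]
        return [SERVICE_API, SERVICE_SQL]

    print(order_when_sql_lagged(apilag=2, maxlag=10))   # [1, 2] -> API first
    print(order_when_sql_lagged(apilag=45, maxlag=10))  # [2, 1] -> SQL first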
@@ -363,7 +363,7 @@ class SitesDB(object):
        use_https = config.wiki.get("useHTTPS", False)
        assert_edit = config.wiki.get("assert")
        maxlag = config.wiki.get("maxlag")
        wait_between_queries = config.wiki.get("waitTime", 5)
        wait_between_queries = config.wiki.get("waitTime", 3)
        logger = self._logger.getChild(name)
        search_config = config.wiki.get("search")
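For reference, a hypothetical fragment showing the wiki config keys read above; the key names come straight from the hunk, but every value and annotation here is illustrative rather than taken from a real config:

    # Hypothetical values only; the real settings live in the bot's wiki config.
    wiki = {
        "useHTTPS": False,   # default False if omitted
        "assert": "user",    # passed through as assert_edit
        "maxlag": 10,        # seconds; a falsy value skips the maxlag check
        "waitTime": 3,       # seconds between queries; the default is now 3
        "search": None,      # optional search config used by the copyvio check
    }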