@@ -30,6 +30,7 @@ class Link(Command):
     name = "link"
 
     def process(self, data):
+        self.site = self.bot.wiki.get_site()
         msg = data.msg
         if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
         if not data.args:
             self.reply(data, "what do you want me to link to?")
             return
-        pagename = ' '.join(data.args)
-        link = self.parse_link(pagename)
+        pagename = " ".join(data.args)
+        link = self.site.get_page(pagename).url
         self.reply(data, link)
 
     def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
         if links:
             # re.findall() returns a list of tuples, but we only want the 2nd
             # item in each tuple:
-            links = [i[1] for i in links]
-            results = map(self.parse_link, links)
+            results = [self.site.get_page(name[1]).url for name in links]
 
         # Find all {{templates}}
         templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):
         return results
 
-    def parse_link(self, pagename):
-        link = quote(pagename.replace(" ", "_"), safe="/:")
-        return "".join(("http://enwp.org/", link))
-
     def parse_template(self, pagename):
         pagename = "".join(("Template:", pagename))
-        return self.parse_link(pagename)
+        return self.site.get_page(pagename).url
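
Taken together, the four hunks above retire the hand-rolled URL builder (parse_link's quote() call plus a hard-coded "http://enwp.org/" prefix) and let the bot's wiki toolset build the link itself. A rough sketch of the new lookup, using a made-up page title and assuming get_site()/get_page()/.url behave exactly as they are used above:

    site = bot.wiki.get_site()            # default Site object from the wiki toolset
    page = site.get_page("Nikola Tesla")  # hypothetical title, for illustration only
    url = page.url                        # replaces quote(...) glued onto "http://enwp.org/"
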
@@ -179,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
+        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
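
The argument swap matches the reworked chunk() signature further down (nltk_dir now comes first, followed by max_chunks). The strip() call on the line above also changes behaviour in the next hunk: it now leans on mwparserfromhell's strip_code(). A minimal sketch of that call with made-up wikitext; the exact output shown is approximate:

    import mwparserfromhell

    wikicode = mwparserfromhell.parse("'''Example''' is a [[test]] article. {{stub}}")
    clean = wikicode.strip_code(normalize=True)  # roughly: "Example is a test article."
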
@@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        self.clean = wikicode.strip_code(normalize=True)
         return self.clean
 
-    def chunk(self, max_chunks, nltk_dir):
+    def chunk(self, nltk_dir, max_chunks, max_query=256):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a sentence or two long at most. The idea here is to return a
-        representative sample of the article text rather than the whole, so
-        we'll probably pick and choose from its introduction, body, and
-        conclusion, especially if the article is large and *max_chunks* is low,
-        so we don't end up just searching for the first paragraph.
+        a sentence or two long at most (no more than *max_query* characters).
+        The idea is to return a sample of the article text rather than the
+        whole, so we'll pick and choose from parts of it, especially if the
+        article is large and *max_chunks* is low, so we don't end up searching
+        for just the first paragraph.
 
         This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
         directory (*nltk_dir*) is required to store nltk's punctuation
@@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser):
         """
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
-            tokenizer = nltk.data.load(datafile)
+            tokenizer = nltk.data.load("file:" + datafile)
         except LookupError:
             nltk.download("punkt", nltk_dir)
-            tokenizer = nltk.data.load(datafile)
-        sentences = tokenizer.tokenize(self.clean)
-        #if max_chunks >= len(sentences):
-        #    return sentences
+            tokenizer = nltk.data.load("file:" + datafile)
+
+        sentences = []
+        for sentence in tokenizer.tokenize(self.clean):
+            if len(sentence) > max_query:
+                words = sentence.split()
+                while len(" ".join(words)) > max_query:
+                    words.pop()
+                sentence = " ".join(words)
+            sentences.append(sentence)
+
+        if max_chunks >= len(sentences):
+            return sentences
+
+        chunks = []
+        while len(chunks) < max_chunks:
+            if len(chunks) % 5 == 0:
+                chunk = sentences.pop(0)  # Pop from beginning
+            elif len(chunks) % 5 == 1:
+                chunk = sentences.pop()  # Pop from end
+            elif len(chunks) % 5 == 2:
+                chunk = sentences.pop(len(sentences) / 2)  # Pop from Q2
+            elif len(chunks) % 5 == 3:
+                chunk = sentences.pop(len(sentences) / 4)  # Pop from Q1
+            else:
+                chunk = sentences.pop(3 * len(sentences) / 4)  # Pop from Q3
+            chunks.append(chunk)
+
+        return chunks
 
 
 class HTMLTextParser(BaseTextParser):
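
The selection loop above cycles through five positions (beginning, end, middle, first quartile, third quartile) so the sample spans the whole article instead of only the lead. A standalone sketch of the same pattern, written with floor division (//) purely for clarity; the index math otherwise mirrors the hunk:

    def sample_sentences(sentences, max_chunks):
        """Pick a spread-out sample: start, end, Q2, Q1, Q3, then repeat."""
        sentences = list(sentences)
        if max_chunks >= len(sentences):
            return sentences
        chunks = []
        while len(chunks) < max_chunks:
            step = len(chunks) % 5
            if step == 0:
                chunks.append(sentences.pop(0))                        # beginning
            elif step == 1:
                chunks.append(sentences.pop())                         # end
            elif step == 2:
                chunks.append(sentences.pop(len(sentences) // 2))      # Q2 (middle)
            elif step == 3:
                chunks.append(sentences.pop(len(sentences) // 4))      # Q1
            else:
                chunks.append(sentences.pop(3 * len(sentences) // 4))  # Q3
        return chunks

    sample_sentences(list("abcdefghij"), 5)  # -> ['a', 'j', 'f', 'c', 'h']
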
@@ -560,10 +560,10 @@ class Site(object):
             return [self.SERVICE_API]
 
         sqllag = self._sql_info_cache["replag"]
-        if sqllag > 180:
+        if sqllag > 300:
             if not self._maxlag:
                 return [self.SERVICE_API, self.SERVICE_SQL]
-            if now - self._api_info_cache["lastcheck"] > 120:
+            if now - self._api_info_cache["lastcheck"] > 300:
                 self._api_info_cache["lastcheck"] = now
                 try:
                     self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
                     self._api_info_cache["maxlag"] = apilag = 0
             else:
                 apilag = self._api_info_cache["maxlag"]
-            if sqllag / (180.0 / self._maxlag) < apilag:
+            if apilag > self._maxlag:
                 return [self.SERVICE_SQL, self.SERVICE_API]
 
         return [self.SERVICE_API, self.SERVICE_SQL]
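
With the scaling formula gone, the ordering decision in these two hunks reduces to two thresholds: prefer SQL only when replication lag tops 300 seconds and the API's own lag exceeds the configured maxlag. A condensed, self-contained sketch (caching and the lastcheck refresh elided; strings stand in for the SERVICE_* constants):

    def service_order(sqllag, apilag, maxlag):
        # Heavily lagged replica *and* an API lagging past maxlag: try SQL first.
        if sqllag > 300 and maxlag and apilag > maxlag:
            return ["sql", "api"]
        # Otherwise the API stays the preferred service.
        return ["api", "sql"]

    service_order(sqllag=450, apilag=12, maxlag=10)  # -> ["sql", "api"]
    service_order(sqllag=450, apilag=4, maxlag=10)   # -> ["api", "sql"]
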
@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
 
         search_config = config.wiki.get("search")