diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index e9ac415..89e1e8f 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -74,7 +74,7 @@ class ArticleTextParser(BaseTextParser): self.clean = clean.replace("\n\n", "\n").strip() return self.clean - def chunk(self, nltk_dir, max_chunks, max_query=256): + def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be @@ -99,6 +99,8 @@ class ArticleTextParser(BaseTextParser): sentences = [] for sentence in tokenizer.tokenize(self.clean): + if len(sentence) < min_query: + continue if len(sentence) > max_query: words = sentence.split() while len(" ".join(words)) > max_query: diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 2a875cc..7570294 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -77,7 +77,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine): "oauth_nonce": oauth.generate_nonce(), "oauth_timestamp": oauth.Request.make_timestamp(), "oauth_consumer_key": consumer.key, - "q": quote_plus('"' + query.encode("utf8") + '"'), + "q": quote_plus(query.encode("utf8")), "type": "html,text", "format": "json", }