Browse Source

Remove auto-quotes from queries; add min_query; halve max_query.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
3e4dac967d
2 changed files with 4 additions and 2 deletions
  1. +3
    -1
      earwigbot/wiki/copyvios/parsers.py
  2. +1
    -1
      earwigbot/wiki/copyvios/search.py

+ 3
- 1
earwigbot/wiki/copyvios/parsers.py View File

@@ -74,7 +74,7 @@ class ArticleTextParser(BaseTextParser):
self.clean = clean.replace("\n\n", "\n").strip()
return self.clean

- def chunk(self, nltk_dir, max_chunks, max_query=256):
+ def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128):
"""Convert the clean article text into a list of web-searchable chunks.

No greater than *max_chunks* will be returned. Each chunk will only be
@@ -99,6 +99,8 @@ class ArticleTextParser(BaseTextParser):

sentences = []
for sentence in tokenizer.tokenize(self.clean):
+ if len(sentence) < min_query:
+ continue
if len(sentence) > max_query:
words = sentence.split()
while len(" ".join(words)) > max_query:


+ 1
- 1
earwigbot/wiki/copyvios/search.py View File

@@ -77,7 +77,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
"oauth_nonce": oauth.generate_nonce(),
"oauth_timestamp": oauth.Request.make_timestamp(),
"oauth_consumer_key": consumer.key,
- "q": quote_plus('"' + query.encode("utf8") + '"'),
+ "q": quote_plus(query.encode("utf8")),
"type": "html,text",
"format": "json",
}


Loading…
Cancel
Save