@@ -87,6 +87,7 @@ class CopyvioMixIn(object):
     """

     def __init__(self, site):
+        self._search_config = site._search_config
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
@@ -126,7 +127,8 @@ class CopyvioMixIn(object):
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
-        engine, credentials = self._site._search_config
+        engine = self._search_config["engine"]
+        credentials = self._search_config["credentials"]
        if engine == "Yahoo! BOSS":
            if not oauth:
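
For context: this changeset migrates search_config from an (engine, credentials)
tuple to a dict. A configured value would now look roughly like the sketch below
(hypothetical values, not taken from the diff):

    # Hypothetical search_config; "engine" and "credentials" are the keys the
    # new code reads. The "nltk_dir" key is injected later by SitesDB (see the
    # sitesdb hunks further down).
    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "my-key", "secret": "my-secret"},
    }
    engine = search_config["engine"]
    credentials = search_config["credentials"]
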
@@ -177,7 +179,7 @@ class CopyvioMixIn(object):
        best_chains = (empty, MarkovChainIntersection(empty, empty))
        parser = ArticleTextParser(self.get())
        clean = parser.strip()
-        chunks = parser.chunk(max_queries)
+        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
        article_chain = MarkovChain(clean)
        last_query = time()
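
Taken together, the flow is: SitesDB injects search_config["nltk_dir"] (later
hunks), CopyvioMixIn.__init__() copies site._search_config, and the check above
hands the directory to the parser. A usage sketch, where the import path and
sample values are assumptions rather than lines from the diff:

    from earwigbot.wiki.copyvios.parsers import ArticleTextParser  # assumed path

    page_text = u"Some article text. It has several sentences."
    search_config = {"nltk_dir": "/path/to/bot/.nltk"}  # as built by SitesDB
    max_queries = 5

    parser = ArticleTextParser(page_text)
    clean = parser.strip()  # plain text used for the Markov chain
    chunks = parser.chunk(max_queries, search_config["nltk_dir"])
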
@@ -20,7 +20,7 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
import htmlentitydefs | |||||
from os import path | |||||
try: | try: | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
@@ -32,6 +32,11 @@ try:
 except ImportError:
     mwparserfromhell = None

+try:
+    import nltk
+except ImportError:
+    nltk = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]

 class BaseTextParser(object):
@@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser):
         self.clean = u" ".join(wikicode.normalize().ifilter_text())
         return self.clean

-    def chunk(self, max_chunks):
+    def chunk(self, max_chunks, nltk_dir):
         """Convert the clean article text into a list of web-searchable chunks.

         No greater than *max_chunks* will be returned. Each chunk will only be
-        a couple sentences long at most. The idea here is to return a
-        representative sample of the article text rather than the entire
-        article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and *max_chunks* is
-        low, so we don't end up just searching for the first paragraph.
+        a sentence or two long at most. The idea here is to return a
+        representative sample of the article text rather than the whole, so
+        we'll probably pick and choose from its introduction, body, and
+        conclusion, especially if the article is large and *max_chunks* is low,
+        so we don't end up just searching for the first paragraph.
+
+        This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
+        directory (*nltk_dir*) is required to store nltk's punctuation
+        database. This is typically located in the bot's working directory.
         """
-        return [self.text]  # TODO: NotImplemented
+        datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
+        try:
+            tokenizer = nltk.data.load(datafile)
+        except LookupError:
+            nltk.download("punkt", nltk_dir)
+            tokenizer = nltk.data.load(datafile)
+
+        sentences = tokenizer.tokenize(self.clean)
+        #if max_chunks >= len(sentences):
+        #    return sentences


 class HTMLTextParser(BaseTextParser):
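
The load-or-download logic in chunk() can be exercised on its own. A minimal,
self-contained sketch, assuming nltk 2.x is installed; note that
nltk.data.load() also accepts a "file:" URL, which is the unambiguous way to
pass an absolute path:

    from os import path
    import nltk

    nltk_dir = path.abspath(".nltk")  # hypothetical base directory
    datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
    try:
        tokenizer = nltk.data.load("file:" + datafile)
    except LookupError:  # punkt not downloaded yet
        nltk.download("punkt", nltk_dir)  # fetch the punkt tokenizer models
        tokenizer = nltk.data.load("file:" + datafile)

    print tokenizer.tokenize(u"First sentence. Second one! A third?")
    # [u'First sentence.', u'Second one!', u'A third?']
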
@@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        query = quote_plus(query.join('"', '"'))
-        params = {"q": query, "style": "raw", "format": "json"}
+        params = {"q": query, "type": "html,text", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))
        consumer = oauth.Consumer(key=self.cred["key"],
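
An aside on the context line above: query.join('"', '"') would raise TypeError,
since str.join() takes a single iterable; the apparent intent is to wrap the
query in literal quotes for an exact-phrase search. A sketch of that intent,
combined with the new parameters (illustrative, not a line from the diff):

    from urllib import quote_plus, urlencode  # Python 2, as in this module

    base_url = "http://yboss.yahooapis.com/ysearch/web"
    query = quote_plus('"{0}"'.format("some article sentence"))
    params = {"q": query, "type": "html,text", "format": "json"}
    url = "{0}?{1}".format(base_url, urlencode(params))
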
@@ -92,7 +92,7 @@ class Site(object):
                  namespaces=None, login=(None, None), cookiejar=None,
                  user_agent=None, use_https=False, assert_edit=None,
                  maxlag=None, wait_between_queries=3, logger=None,
-                 search_config=(None, None)):
+                 search_config=None):
         """Constructor for new Site instances.

         This probably isn't necessary to call yourself unless you're building a
@@ -192,6 +192,10 @@ class SitesDB(object):
        user_agent = user_agent.replace("$1", __version__)
        user_agent = user_agent.replace("$2", python_version())

+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
        return Site(name=name, project=project, lang=lang, base_url=base_url,
                    article_path=article_path, script_path=script_path,
                    sql=sql, namespaces=namespaces, login=login,
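
After this injection, the dict handed to Site() carries everything the copyvio
checker reads; roughly (hypothetical values):

    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "my-key", "secret": "my-secret"},
        "nltk_dir": "/path/to/bot/.nltk",  # path.join(self.config.root_dir, ".nltk")
    }
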
@@ -360,14 +364,23 @@ class SitesDB(object):
        assert_edit = config.wiki.get("assert")
        maxlag = config.wiki.get("maxlag")
        wait_between_queries = config.wiki.get("waitTime", 5)
+        logger = self._logger.getChild(name)
        search_config = config.wiki.get("search")

+        if user_agent:
+            user_agent = user_agent.replace("$1", __version__)
+            user_agent = user_agent.replace("$2", python_version())
+
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
        # Create a Site object to log in and load the other attributes:
        site = Site(base_url=base_url, script_path=script_path, sql=sql,
                    login=login, cookiejar=cookiejar, user_agent=user_agent,
                    use_https=use_https, assert_edit=assert_edit,
                    maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)

        self._add_site_to_sitesdb(site)
        self._sites[site.name] = site
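
The new logger line gives each Site a child of the SitesDB logger. With Python
2.7's logging module, getChild() simply suffixes the parent logger's name (the
names below are illustrative):

    import logging

    parent = logging.getLogger("earwigbot.wiki.sitesdb")
    child = parent.getChild("enwiki")
    print child.name  # earwigbot.wiki.sitesdb.enwiki
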
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages

 from earwigbot import __version__

+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+dependencies = [
+    "GitPython >= 0.3.2.RC1",  # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10",  # Parsing config files
+    "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1",  # Parsing wikicode for manipulation
+    "nltk >= 2.0.2",  # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3",  # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search for copyvios
+    "py-bcrypt >= 0.2",  # Hashing the bot key in the config file
+    "pycrypto >= 2.5",  # Storing bot passwords and keys in the config file
+    "pytz >= 2012c",  # Handling timezones for the !time IRC command
+]
+
 with open("README.rst") as fp:
     long_docs = fp.read()
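
The comment block above describes the same optional-import pattern as the nltk
guard added to the parsers module: import at load time, fail only at the point
of use. A minimal sketch, with a stand-in for earwigbot's real
UnsupportedSearchEngineError:

    try:
        import oauth2 as oauth  # optional: only for Yahoo! BOSS copyvio search
    except ImportError:
        oauth = None

    class UnsupportedSearchEngineError(Exception):
        pass  # stand-in for the exception earwigbot actually raises

    def ensure_oauth():
        if not oauth:
            raise UnsupportedSearchEngineError("oauth2 is required for Yahoo! BOSS")
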
@@ -32,17 +51,7 @@ setup(
     name = "earwigbot",
     packages = find_packages(exclude=("tests",)),
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
-    install_requires = ["GitPython >= 0.3.2.RC1",  # Interfacing with git
-                        "PyYAML >= 3.10",  # Config parsing
-                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
-                        "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
-                        "mwparserfromhell >= 0.1",  # Wikicode parsing
-                        "oursql >= 0.9.3",  # Talking with MediaWiki databases
-                        "oauth2 >= 1.5.211",  # Talking with Yahoo BOSS Search
-                        "py-bcrypt >= 0.2",  # Password hashing in config
-                        "pycrypto >= 2.5",  # Storing bot passwords and keys
-                        "pytz >= 2012c",  # Timezone handling
-                        ],
+    install_requires = dependencies,
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",