@@ -87,6 +87,7 @@ class CopyvioMixIn(object):
    """

    def __init__(self, site):
        self._search_config = site._search_config
        self._opener = build_opener()
        self._opener.addheaders = site._opener.addheaders
@@ -126,7 +127,8 @@ class CopyvioMixIn(object):
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine, credentials = self._site._search_config
        engine = self._search_config["engine"]
        credentials = self._search_config["credentials"]
        if engine == "Yahoo! BOSS":
            if not oauth:
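For reference, the search configuration consumed here is now a plain dict rather than the old (engine, credentials) tuple. A minimal sketch of its shape, using only keys that appear in this diff; the exact credential fields are an assumption based on the OAuth consumer setup further down (self.cred["key"]), not confirmed names:

    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "...", "secret": "..."},  # field names assumed
        # "nltk_dir" is injected automatically by SitesDB (see the sitesdb hunks below)
    }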
@@ -177,7 +179,7 @@ class CopyvioMixIn(object):
        best_chains = (empty, MarkovChainIntersection(empty, empty))
        parser = ArticleTextParser(self.get())
        clean = parser.strip()
        chunks = parser.chunk(max_queries)
        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
        article_chain = MarkovChain(clean)
        last_query = time()
@@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import htmlentitydefs
from os import path

try:
    from bs4 import BeautifulSoup
@@ -32,6 +32,11 @@ try:
except ImportError:
    mwparserfromhell = None

try:
    import nltk
except ImportError:
    nltk = None

__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]


class BaseTextParser(object):
@@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser):
        self.clean = u" ".join(wikicode.normalize().ifilter_text())
        return self.clean

    def chunk(self, max_chunks):
    def chunk(self, max_chunks, nltk_dir):
        """Convert the clean article text into a list of web-searchable chunks.

        No more than *max_chunks* chunks will be returned. Each chunk will only be
        a couple sentences long at most. The idea here is to return a
        representative sample of the article text rather than the entire
        article, so we'll probably pick and choose from its introduction, body,
        and conclusion, especially if the article is large and *max_chunks* is
        low, so we don't end up just searching for the first paragraph.
        a sentence or two long at most. The idea here is to return a
        representative sample of the article text rather than the whole, so
        we'll probably pick and choose from its introduction, body, and
        conclusion, especially if the article is large and *max_chunks* is low,
        so we don't end up just searching for the first paragraph.

        This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
        directory (*nltk_dir*) is required to store nltk's sentence tokenizer
        data (its "punkt" models); this directory is typically located inside
        the bot's working directory.
""" | |||
return [self.text] # TODO: NotImplemented | |||
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") | |||
try: | |||
tokenizer = nltk.data.load(datafile) | |||
except LookupError: | |||
nltk.download("punkt", nltk_dir) | |||
tokenizer = nltk.data.load(datafile) | |||
sentences = tokenizer.tokenize(self.clean) | |||
#if max_chunks >= len(sentences): | |||
# return sentences | |||
class HTMLTextParser(BaseTextParser): | |||
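As a usage note for the punkt handling above: the load-or-download pattern can be exercised on its own. A minimal sketch, assuming any writable scratch directory (the bot itself uses config.root_dir/.nltk and loads the pickle by absolute path; registering the directory on nltk's search path, as done here, is an extra convenience and not part of this diff):

    import nltk

    nltk_dir = "/tmp/nltk-scratch"     # assumption: any writable directory
    nltk.data.path.append(nltk_dir)    # let nltk's loader search this directory

    try:
        # Use the cached punkt sentence tokenizer if it is already on disk...
        tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    except LookupError:
        # ...otherwise download it into nltk_dir on first use, then retry.
        nltk.download("punkt", nltk_dir)
        tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

    print tokenizer.tokenize(u"This is one sentence. Here is another one.")
    # -> ["This is one sentence.", "Here is another one."]

The resulting sentence list is what chunk() will eventually sample from; the sampling step itself is still commented out in the hunk above.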
@@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        query = quote_plus('"{0}"'.format(query))
params = {"q": query, "style": "raw", "format": "json"} | |||
params = {"q": query, "type": "html,text", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
consumer = oauth.Consumer(key=self.cred["key"], | |||
@@ -92,7 +92,7 @@ class Site(object):
                 namespaces=None, login=(None, None), cookiejar=None,
                 user_agent=None, use_https=False, assert_edit=None,
                 maxlag=None, wait_between_queries=3, logger=None,
                 search_config=(None, None)):
                 search_config=None):
        """Constructor for new Site instances.

        This probably isn't necessary to call yourself unless you're building a
@@ -192,6 +192,10 @@ class SitesDB(object):
            user_agent = user_agent.replace("$1", __version__)
            user_agent = user_agent.replace("$2", python_version())

        if search_config:
            nltk_dir = path.join(self.config.root_dir, ".nltk")
            search_config["nltk_dir"] = nltk_dir

        return Site(name=name, project=project, lang=lang, base_url=base_url,
                    article_path=article_path, script_path=script_path,
                    sql=sql, namespaces=namespaces, login=login,
@@ -360,14 +364,23 @@ class SitesDB(object):
        assert_edit = config.wiki.get("assert")
        maxlag = config.wiki.get("maxlag")
        wait_between_queries = config.wiki.get("waitTime", 5)
        logger = self._logger.getChild(name)
        search_config = config.wiki.get("search")

        if user_agent:
            user_agent = user_agent.replace("$1", __version__)
            user_agent = user_agent.replace("$2", python_version())

        if search_config:
            nltk_dir = path.join(self.config.root_dir, ".nltk")
            search_config["nltk_dir"] = nltk_dir

        # Create a Site object to log in and load the other attributes:
        site = Site(base_url=base_url, script_path=script_path, sql=sql,
                    login=login, cookiejar=cookiejar, user_agent=user_agent,
                    use_https=use_https, assert_edit=assert_edit,
                    maxlag=maxlag, wait_between_queries=wait_between_queries,
                    search_config=search_config)
                    logger=logger, search_config=search_config)

        self._add_site_to_sitesdb(site)
        self._sites[site.name] = site
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages
from earwigbot import __version__

# Not all of these dependencies are required, particularly the copyvio-specific
# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
# pytz). The bot should run fine without them, but will raise an exception if
# you try to detect copyvios or run a command that requires one.
dependencies = [
    "GitPython >= 0.3.2.RC1",   # Interfacing with git for !git and __version__
    "PyYAML >= 3.10",           # Parsing config files
    "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML for copyvios
    "lxml >= 2.3.4",            # Faster parser for BeautifulSoup
    "mwparserfromhell >= 0.1",  # Parsing wikicode for manipulation
    "nltk >= 2.0.2",            # Parsing sentences to split article content for copyvios
    "oursql >= 0.9.3",          # Interfacing with MediaWiki databases
    "oauth2 >= 1.5.211",        # Interfacing with Yahoo! BOSS Search for copyvios
    "py-bcrypt >= 0.2",         # Hashing the bot key in the config file
    "pycrypto >= 2.5",          # Storing bot passwords and keys in the config file
    "pytz >= 2012c",            # Handling timezones for the !time IRC command
]
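To make the comment above concrete: the optional packages are imported behind try/except guards (as in the parsers.py hunk earlier), and the copyvio code only raises once a missing package is actually needed. A rough sketch of the pattern; the exception names come from the docstring earlier in this diff, while the standalone function is invented purely for illustration:

    try:
        import oauth2 as oauth  # optional: only needed for Yahoo! BOSS searches
    except ImportError:
        oauth = None

    class UnknownSearchEngineError(Exception):
        """Raised when the configured engine name is not recognized."""

    class UnsupportedSearchEngineError(Exception):
        """Raised when a search engine needs a package we don't have."""

    def select_search_engine(engine):
        # Hypothetical stand-in for CopyvioMixIn._select_search_engine():
        if engine == "Yahoo! BOSS":
            if not oauth:
                raise UnsupportedSearchEngineError("Yahoo! BOSS requires oauth2")
            return "engine ready"  # the real method builds a search engine object
        raise UnknownSearchEngineError(engine)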
with open("README.rst") as fp: | |||
long_docs = fp.read() | |||
@@ -32,17 +51,7 @@ setup(
    name = "earwigbot",
    packages = find_packages(exclude=("tests",)),
    entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
    install_requires = ["GitPython >= 0.3.2.RC1",   # Interfacing with git
                        "PyYAML >= 3.10",           # Config parsing
                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
                        "lxml >= 2.3.4",            # Faster parser for BeautifulSoup
                        "mwparserfromhell >= 0.1",  # Wikicode parsing
                        "oursql >= 0.9.3",          # Talking with MediaWiki databases
                        "oauth2 >= 1.5.211",        # Talking with Yahoo BOSS Search
                        "py-bcrypt >= 0.2",         # Password hashing in config
                        "pycrypto >= 2.5",          # Storing bot passwords and keys
                        "pytz >= 2012c",            # Timezone handling
                        ],
    install_requires = dependencies,
    test_suite = "tests",
    version = __version__,
    author = "Ben Kurtovic",