
Implement NLTK for chunking article content (#5).

commit 569c815d99 (tags/v0.1^2)
Ben Kurtovic, 12 years ago

6 changed files with 66 additions and 24 deletions:
1. earwigbot/wiki/copyvios/__init__.py (+4 -2)
2. earwigbot/wiki/copyvios/parsers.py (+26 -8)
3. earwigbot/wiki/copyvios/search.py (+1 -1)
4. earwigbot/wiki/site.py (+1 -1)
5. earwigbot/wiki/sitesdb.py (+14 -1)
6. setup.py (+20 -11)

earwigbot/wiki/copyvios/__init__.py (+4 -2)

@@ -87,6 +87,7 @@ class CopyvioMixIn(object):
     """
 
     def __init__(self, site):
+        self._search_config = site._search_config
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
 
@@ -126,7 +127,8 @@ class CopyvioMixIn(object):
         unknown to us, and UnsupportedSearchEngineError if we are missing a
         required package or module, like oauth2 for "Yahoo! BOSS".
         """
-        engine, credentials = self._site._search_config
+        engine = self._search_config["engine"]
+        credentials = self._search_config["credentials"]
 
         if engine == "Yahoo! BOSS":
             if not oauth:
@@ -177,7 +179,7 @@ class CopyvioMixIn(object):
         best_chains = (empty, MarkovChainIntersection(empty, empty))
         parser = ArticleTextParser(self.get())
         clean = parser.strip()
-        chunks = parser.chunk(max_queries)
+        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
         article_chain = MarkovChain(clean)
         last_query = time()
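
For reference, a minimal sketch of the mapping that search_config is now
expected to be, based on the keys this commit reads ("engine", "credentials",
"nltk_dir"); the credential values here are placeholders, not real keys:

    # Hedged sketch of the assumed search_config shape; the "key"/"secret"
    # fields match what search.py reads from self.cred further down.
    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "<consumer key>", "secret": "<consumer secret>"},
        "nltk_dir": "/path/to/bot/.nltk",  # filled in by SitesDB (see below)
    }

    engine = search_config["engine"]
    credentials = search_config["credentials"]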



earwigbot/wiki/copyvios/parsers.py (+26 -8)

@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-import htmlentitydefs
+from os import path
 
 try:
     from bs4 import BeautifulSoup
@@ -32,6 +32,11 @@ try:
 except ImportError:
     mwparserfromhell = None
 
+try:
+    import nltk
+except ImportError:
+    nltk = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 
 class BaseTextParser(object):
@@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser):
         self.clean = u" ".join(wikicode.normalize().ifilter_text())
         return self.clean
 
-    def chunk(self, max_chunks):
+    def chunk(self, max_chunks, nltk_dir):
         """Convert the clean article text into a list of web-searchable chunks.
 
         No greater than *max_chunks* will be returned. Each chunk will only be
-        a couple sentences long at most. The idea here is to return a
-        representative sample of the article text rather than the entire
-        article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and *max_chunks* is
-        low, so we don't end up just searching for the first paragraph.
+        a sentence or two long at most. The idea here is to return a
+        representative sample of the article text rather than the whole, so
+        we'll probably pick and choose from its introduction, body, and
+        conclusion, especially if the article is large and *max_chunks* is low,
+        so we don't end up just searching for the first paragraph.
+
+        This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
+        directory (*nltk_dir*) is required to store nltk's punctuation
+        database. This is typically located in the bot's working directory.
         """
-        return [self.text] # TODO: NotImplemented
+        datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
+        try:
+            tokenizer = nltk.data.load(datafile)
+        except LookupError:
+            nltk.download("punkt", nltk_dir)
+            tokenizer = nltk.data.load(datafile)
+
+        sentences = tokenizer.tokenize(self.clean)
+        #if max_chunks >= len(sentences):
+        #    return sentences
 
 
 class HTMLTextParser(BaseTextParser):
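
The load-or-download pattern above can be exercised standalone. A minimal
sketch, assuming nltk is installed and the hypothetical directory
/tmp/nltk_data is writable (the bot itself uses <root_dir>/.nltk, per the
sitesdb.py change below):

    from os import path
    import nltk

    nltk_dir = "/tmp/nltk_data"  # hypothetical stand-in for the bot's nltk_dir
    datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")

    try:
        tokenizer = nltk.data.load(datafile)
    except LookupError:
        # First run: fetch the punkt sentence-boundary model, then retry.
        nltk.download("punkt", nltk_dir)
        tokenizer = nltk.data.load(datafile)

    text = u"EarwigBot runs on IRC. It can also detect copyright violations."
    print tokenizer.tokenize(text)
    # [u'EarwigBot runs on IRC.', u'It can also detect copyright violations.']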


earwigbot/wiki/copyvios/search.py (+1 -1)

@@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
         """
         base_url = "http://yboss.yahooapis.com/ysearch/web"
         query = quote_plus(query.join('"', '"'))
-        params = {"q": query, "style": "raw", "format": "json"}
+        params = {"q": query, "type": "html,text", "format": "json"}
         url = "{0}?{1}".format(base_url, urlencode(params))
 
         consumer = oauth.Consumer(key=self.cred["key"],
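
The oauth.Consumer(...) call above is cut off by the hunk boundary. For
orientation, a hedged sketch of signing such a request with the oauth2
package; the continuation and everything past the consumer line are
assumptions, not shown in this diff, and the credentials are placeholders:

    import urllib2
    from urllib import urlencode
    import oauth2 as oauth

    cred = {"key": "<consumer key>", "secret": "<consumer secret>"}
    base_url = "http://yboss.yahooapis.com/ysearch/web"
    params = {"q": '"example query"', "type": "html,text", "format": "json"}
    url = "{0}?{1}".format(base_url, urlencode(params))

    # Sign the request parameters with HMAC-SHA1 and send the resulting
    # Authorization header along with the encoded URL.
    consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
    req = oauth.Request(method="GET", url=url, parameters=params)
    req["oauth_timestamp"] = oauth.Request.make_timestamp()
    req["oauth_nonce"] = oauth.Request.make_nonce()
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
    response = urllib2.urlopen(urllib2.Request(url, headers=req.to_header()))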


earwigbot/wiki/site.py (+1 -1)

@@ -92,7 +92,7 @@ class Site(object):
                  namespaces=None, login=(None, None), cookiejar=None,
                  user_agent=None, use_https=False, assert_edit=None,
                  maxlag=None, wait_between_queries=3, logger=None,
-                 search_config=(None, None)):
+                 search_config=None):
         """Constructor for new Site instances.
 
         This probably isn't necessary to call yourself unless you're building a


earwigbot/wiki/sitesdb.py (+14 -1)

@@ -192,6 +192,10 @@ class SitesDB(object):
             user_agent = user_agent.replace("$1", __version__)
             user_agent = user_agent.replace("$2", python_version())
 
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -360,14 +364,23 @@ class SitesDB(object):
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
         wait_between_queries = config.wiki.get("waitTime", 5)
+        logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 
         if user_agent:
             user_agent = user_agent.replace("$1", __version__)
             user_agent = user_agent.replace("$2", python_version())
 
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         # Create a Site object to log in and load the other attributes:
         site = Site(base_url=base_url, script_path=script_path, sql=sql,
                     login=login, cookiejar=cookiejar, user_agent=user_agent,
                     use_https=use_https, assert_edit=assert_edit,
                     maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)
 
         self._add_site_to_sitesdb(site)
         self._sites[site.name] = site
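
To illustrate where the punkt data ends up, a small sketch assuming a
hypothetical bot working directory as config.root_dir:

    from os import path

    root_dir = "/home/earwigbot"  # hypothetical config.root_dir
    nltk_dir = path.join(root_dir, ".nltk")
    print path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
    # /home/earwigbot/.nltk/tokenizers/punkt/english.pickle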


setup.py (+20 -11)

@@ -25,6 +25,25 @@ from setuptools import setup, find_packages
 
 from earwigbot import __version__
 
+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+
+dependencies = [
+    "GitPython >= 0.3.2.RC1", # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10", # Parsing config files
+    "beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4", # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1", # Parsing wikicode for manipulation
+    "nltk >= 2.0.2", # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3", # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios
+    "py-bcrypt >= 0.2", # Hashing the bot key in the config file
+    "pycrypto >= 2.5", # Storing bot passwords and keys in the config file
+    "pytz >= 2012c", # Handling timezones for the !time IRC command
+]
+
 with open("README.rst") as fp:
     long_docs = fp.read()
 
@@ -32,17 +51,7 @@ setup(
     name = "earwigbot",
     packages = find_packages(exclude=("tests",)),
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
-    install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git
-                        "PyYAML >= 3.10", # Config parsing
-                        "beautifulsoup4 >= 4.1.1", # HTML parsing/scraping
-                        "lxml >= 2.3.4", # Faster parser for BeautifulSoup
-                        "mwparserfromhell >= 0.1", # Wikicode parsing
-                        "oursql >= 0.9.3", # Talking with MediaWiki databases
-                        "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search
-                        "py-bcrypt >= 0.2", # Password hashing in config
-                        "pycrypto >= 2.5", # Storing bot passwords and keys
-                        "pytz >= 2012c", # Timezone handling
-                        ],
+    install_requires = dependencies,
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",
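
Since the new comment marks several packages as optional, the runtime guard
mirrors the nltk import in parsers.py above. A minimal sketch; the exception
classes below are local stand-ins for the bot's real ones (only their names
appear in the CopyvioMixIn docstring earlier in this commit):

    class UnknownSearchEngineError(Exception):
        """Stand-in: the configured engine name is unknown to us."""

    class UnsupportedSearchEngineError(Exception):
        """Stand-in: a package backing the chosen engine is missing."""

    try:
        import oauth2 as oauth
    except ImportError:
        oauth = None

    def get_search_engine(engine):
        # Mirror the behavior described in CopyvioMixIn's docstring: raise
        # if the package backing the chosen engine is unavailable.
        if engine == "Yahoo! BOSS":
            if not oauth:
                raise UnsupportedSearchEngineError("'Yahoo! BOSS' needs oauth2")
            return "YahooBOSSSearchEngine"  # placeholder for the real class
        raise UnknownSearchEngineError(engine)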

