@@ -87,6 +87,7 @@ class CopyvioMixIn(object):
     """

     def __init__(self, site):
+        self._search_config = site._search_config
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
@@ -126,7 +127,8 @@ class CopyvioMixIn(object):
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
-        engine, credentials = self._site._search_config
+        engine = self._search_config["engine"]
+        credentials = self._search_config["credentials"]
        if engine == "Yahoo! BOSS":
            if not oauth:
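
For context: this changeset migrates search_config from an (engine, credentials)
tuple to a dict. A configured value would now look roughly like the sketch below
(hypothetical values, not taken from the diff):

    # Hypothetical search_config; "engine" and "credentials" are the keys the
    # new code reads. The "nltk_dir" key is injected later by SitesDB (see the
    # sitesdb hunks further down).
    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "my-key", "secret": "my-secret"},
    }
    engine = search_config["engine"]
    credentials = search_config["credentials"]
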
@@ -177,7 +179,7 @@ class CopyvioMixIn(object):
        best_chains = (empty, MarkovChainIntersection(empty, empty))
        parser = ArticleTextParser(self.get())
        clean = parser.strip()
-        chunks = parser.chunk(max_queries)
+        chunks = parser.chunk(max_queries, self._search_config["nltk_dir"])
        article_chain = MarkovChain(clean)
        last_query = time()
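
Taken together, the flow is: SitesDB injects search_config["nltk_dir"] (later
hunks), CopyvioMixIn.__init__() copies site._search_config, and the check above
hands the directory to the parser. A usage sketch, where the import path and
sample values are assumptions rather than lines from the diff:

    from earwigbot.wiki.copyvios.parsers import ArticleTextParser  # assumed path

    page_text = u"Some article text. It has several sentences."
    search_config = {"nltk_dir": "/path/to/bot/.nltk"}  # as built by SitesDB
    max_queries = 5

    parser = ArticleTextParser(page_text)
    clean = parser.strip()  # plain text used for the Markov chain
    chunks = parser.chunk(max_queries, search_config["nltk_dir"])
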
@@ -20,7 +20,7 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
import htmlentitydefs | |||||
from os import path | |||||
try: | try: | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
@@ -32,6 +32,11 @@ try:
 except ImportError:
     mwparserfromhell = None

+try:
+    import nltk
+except ImportError:
+    nltk = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]

 class BaseTextParser(object):
@@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser):
         self.clean = u" ".join(wikicode.normalize().ifilter_text())
         return self.clean

-    def chunk(self, max_chunks):
+    def chunk(self, max_chunks, nltk_dir):
         """Convert the clean article text into a list of web-searchable chunks.

         No greater than *max_chunks* will be returned. Each chunk will only be
-        a couple sentences long at most. The idea here is to return a
-        representative sample of the article text rather than the entire
-        article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and *max_chunks* is
-        low, so we don't end up just searching for the first paragraph.
+        a sentence or two long at most. The idea here is to return a
+        representative sample of the article text rather than the whole, so
+        we'll probably pick and choose from its introduction, body, and
+        conclusion, especially if the article is large and *max_chunks* is low,
+        so we don't end up just searching for the first paragraph.
+
+        This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
+        directory (*nltk_dir*) is required to store nltk's punctuation
+        database. This is typically located in the bot's working directory.
         """
-        return [self.text]  # TODO: NotImplemented
+        datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
+        try:
+            tokenizer = nltk.data.load(datafile)
+        except LookupError:
+            nltk.download("punkt", nltk_dir)
+            tokenizer = nltk.data.load(datafile)
+
+        sentences = tokenizer.tokenize(self.clean)
+        #if max_chunks >= len(sentences):
+        #    return sentences


 class HTMLTextParser(BaseTextParser):
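
The load-or-download logic in chunk() can be exercised on its own. A minimal,
self-contained sketch, assuming nltk 2.x is installed; note that
nltk.data.load() also accepts a "file:" URL, which is the unambiguous way to
pass an absolute path:

    from os import path
    import nltk

    nltk_dir = path.abspath(".nltk")  # hypothetical base directory
    datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
    try:
        tokenizer = nltk.data.load("file:" + datafile)
    except LookupError:  # punkt not downloaded yet
        nltk.download("punkt", nltk_dir)  # fetch the punkt tokenizer models
        tokenizer = nltk.data.load("file:" + datafile)

    print tokenizer.tokenize(u"First sentence. Second one! A third?")
    # [u'First sentence.', u'Second one!', u'A third?']
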
@@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        query = quote_plus(query.join('"', '"'))
-        params = {"q": query, "style": "raw", "format": "json"}
+        params = {"q": query, "type": "html,text", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))
        consumer = oauth.Consumer(key=self.cred["key"],
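
An aside on the context line above: query.join('"', '"') would raise TypeError,
since str.join() takes a single iterable; the apparent intent is to wrap the
query in literal quotes for an exact-phrase search. A sketch of that intent,
combined with the new parameters (illustrative, not a line from the diff):

    from urllib import quote_plus, urlencode  # Python 2, as in this module

    base_url = "http://yboss.yahooapis.com/ysearch/web"
    query = quote_plus('"{0}"'.format("some article sentence"))
    params = {"q": query, "type": "html,text", "format": "json"}
    url = "{0}?{1}".format(base_url, urlencode(params))
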
@@ -92,7 +92,7 @@ class Site(object):
                  namespaces=None, login=(None, None), cookiejar=None,
                  user_agent=None, use_https=False, assert_edit=None,
                  maxlag=None, wait_between_queries=3, logger=None,
-                 search_config=(None, None)):
+                 search_config=None):
         """Constructor for new Site instances.

         This probably isn't necessary to call yourself unless you're building a
@@ -192,6 +192,10 @@ class SitesDB(object):
        user_agent = user_agent.replace("$1", __version__)
        user_agent = user_agent.replace("$2", python_version())

+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
        return Site(name=name, project=project, lang=lang, base_url=base_url,
                    article_path=article_path, script_path=script_path,
                    sql=sql, namespaces=namespaces, login=login,
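
After this injection, the dict handed to Site() carries everything the copyvio
checker reads; roughly (hypothetical values):

    search_config = {
        "engine": "Yahoo! BOSS",
        "credentials": {"key": "my-key", "secret": "my-secret"},
        "nltk_dir": "/path/to/bot/.nltk",  # path.join(self.config.root_dir, ".nltk")
    }
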
@@ -360,14 +364,23 @@ class SitesDB(object):
        assert_edit = config.wiki.get("assert")
        maxlag = config.wiki.get("maxlag")
        wait_between_queries = config.wiki.get("waitTime", 5)
+        logger = self._logger.getChild(name)
        search_config = config.wiki.get("search")

+        if user_agent:
+            user_agent = user_agent.replace("$1", __version__)
+            user_agent = user_agent.replace("$2", python_version())
+
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
        # Create a Site object to log in and load the other attributes:
        site = Site(base_url=base_url, script_path=script_path, sql=sql,
                    login=login, cookiejar=cookiejar, user_agent=user_agent,
                    use_https=use_https, assert_edit=assert_edit,
                    maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)

        self._add_site_to_sitesdb(site)
        self._sites[site.name] = site
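
The new logger line gives each Site a child of the SitesDB logger. With Python
2.7's logging module, getChild() simply suffixes the parent logger's name (the
names below are illustrative):

    import logging

    parent = logging.getLogger("earwigbot.wiki.sitesdb")
    child = parent.getChild("enwiki")
    print child.name  # earwigbot.wiki.sitesdb.enwiki
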
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages

 from earwigbot import __version__

+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+dependencies = [
+    "GitPython >= 0.3.2.RC1",  # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10",  # Parsing config files
+    "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1",  # Parsing wikicode for manipulation
+    "nltk >= 2.0.2",  # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3",  # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search for copyvios
+    "py-bcrypt >= 0.2",  # Hashing the bot key in the config file
+    "pycrypto >= 2.5",  # Storing bot passwords and keys in the config file
+    "pytz >= 2012c",  # Handling timezones for the !time IRC command
+]
+
 with open("README.rst") as fp:
     long_docs = fp.read()
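
The comment block above describes the same optional-import pattern as the nltk
guard added to the parsers module: import at load time, fail only at the point
of use. A minimal sketch, with a stand-in for earwigbot's real
UnsupportedSearchEngineError:

    try:
        import oauth2 as oauth  # optional: only for Yahoo! BOSS copyvio search
    except ImportError:
        oauth = None

    class UnsupportedSearchEngineError(Exception):
        pass  # stand-in for the exception earwigbot actually raises

    def ensure_oauth():
        if not oauth:
            raise UnsupportedSearchEngineError("oauth2 is required for Yahoo! BOSS")
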
@@ -32,17 +51,7 @@ setup(
     name = "earwigbot",
     packages = find_packages(exclude=("tests",)),
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
-    install_requires = ["GitPython >= 0.3.2.RC1",  # Interfacing with git
-                        "PyYAML >= 3.10",  # Config parsing
-                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
-                        "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
-                        "mwparserfromhell >= 0.1",  # Wikicode parsing
-                        "oursql >= 0.9.3",  # Talking with MediaWiki databases
-                        "oauth2 >= 1.5.211",  # Talking with Yahoo BOSS Search
-                        "py-bcrypt >= 0.2",  # Password hashing in config
-                        "pycrypto >= 2.5",  # Storing bot passwords and keys
-                        "pytz >= 2012c",  # Timezone handling
-                        ],
+    install_requires = dependencies,
     test_suite = "tests",
     version = __version__,
     author = "Ben Kurtovic",