From d45e342bac59c8587c8e34c2c794023452ef6fda Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Fri, 6 Jul 2012 22:55:23 -0400
Subject: [PATCH] DOCUMENT EVERYTHING (#5)

Also implementing MWParserFromHell, plus some cleanup.
---
 docs/api/earwigbot.wiki.copyvios.rst | 33 +++++++++++++
 docs/api/earwigbot.wiki.rst          | 14 +++---
 earwigbot/wiki/copyvios/__init__.py  | 91 ++++++++++++++++++++----------------
 earwigbot/wiki/copyvios/markov.py    |  4 ++
 earwigbot/wiki/copyvios/parsers.py   | 66 +++++++++++---------------
 earwigbot/wiki/copyvios/search.py    | 19 +++++---
 6 files changed, 136 insertions(+), 91 deletions(-)
 create mode 100644 docs/api/earwigbot.wiki.copyvios.rst

diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst
new file mode 100644
index 0000000..7dbcf39
--- /dev/null
+++ b/docs/api/earwigbot.wiki.copyvios.rst
@@ -0,0 +1,33 @@
+copyvios Package
+================
+
+:mod:`copyvios` Package
+-----------------------
+
+.. automodule:: earwigbot.wiki.copyvios
+    :members:
+    :undoc-members:
+
+:mod:`markov` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.markov
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`parsers` Module
+---------------------
+
+.. automodule:: earwigbot.wiki.copyvios.parsers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`search` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.search
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/api/earwigbot.wiki.rst b/docs/api/earwigbot.wiki.rst
index 806b3eb..45b009b 100644
--- a/docs/api/earwigbot.wiki.rst
+++ b/docs/api/earwigbot.wiki.rst
@@ -22,13 +22,6 @@ wiki Package
     :members:
     :undoc-members:
 
-:mod:`copyright` Module
------------------------
-
-.. automodule:: earwigbot.wiki.copyright
-    :members:
-    :undoc-members:
-
 :mod:`page` Module
 ------------------
 
@@ -57,3 +50,10 @@ wiki Package
 .. automodule:: earwigbot.wiki.user
     :members:
     :undoc-members:
+
+Subpackages
+-----------
+
+.. toctree::
+
+    earwigbot.wiki.copyvios
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index f85ab22..2c2bb23 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 __all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
 
 class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`:     ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`:    a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`:           the URL of the violated page
+    - :py:attr:`queries`:       the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`:  the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`:   the MarkovChainIntersection comparing the two
+    """
+
     def __init__(self, violation, confidence, url, queries, article, chains):
         self.violation = violation
         self.confidence = confidence
@@ -61,14 +77,15 @@ class CopyvioCheckResult(object):
 
 class CopyvioMixIn(object):
     """
-    EarwigBot's Wiki Toolset: Copyright Violation Mixin
+    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**
 
-    This is a mixin that provides two public methods, copyvio_check() and
-    copyvio_compare(). The former checks the page for copyright violations
-    using a search engine API, and the latter compares the page against a
-    specified URL. Credentials for the search engine API are stored in the
-    site's config.
+    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
+    and :py:meth:`copyvio_compare`. The former checks the page for copyright
+    violations using a search engine API, and the latter compares the page
+    against a given URL. Credentials for the search engine API are stored in
+    the :py:class:`~earwigbot.wiki.site.Site`'s config.
     """
+
     def __init__(self, site):
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
@@ -100,10 +117,10 @@ class CopyvioMixIn(object):
     def _select_search_engine(self):
         """Return a function that can be called to do web searches.
 
-        The "function" is a functools.partial object that takes one argument, a
-        query, and returns a list of URLs, ranked by importance. The underlying
-        logic depends on the 'engine' argument; for example, if 'engine' is
-        "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
+        The function takes one argument, a search query, and returns a list of
+        URLs, ranked by importance. The underlying logic depends on the
+        *engine* argument within our config; for example, if *engine* is
+        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.
 
         Raises UnknownSearchEngineError if the 'engine' listed in our config is
         unknown to us, and UnsupportedSearchEngineError if we are missing a
@@ -122,8 +139,8 @@ class CopyvioMixIn(object):
     def _copyvio_compare_content(self, article, url):
         """Return a number comparing an article and a URL.
 
-        The *article* is a Markov chain, whereas the URL is a string that we
-        will try to open ourselves.
+        The *article* is a Markov chain, whereas the *url* is just a string
+        that we'll try to open and read ourselves.
         """
         html = self._open_url_ignoring_errors(url)
         if not html:
@@ -134,30 +151,22 @@ class CopyvioMixIn(object):
         return float(delta.size()) / article.size(), (source, delta)
 
     def copyvio_check(self, min_confidence=0.5, max_queries=-1,
-                      interquery_sleep=1, force=False):
+                      interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a _CopyvioCheckResult object with four useful attributes:
-        "violation", "confidence", "url", and "queries". "confidence" is a
-        number between 0 and 1; if it is less than "min_confidence", we could
-        not find any indication of a violation (so "violation" will be False
-        and "url" may or may not be None), otherwise it indicates the relative
-        faith in our results, "violation" will be True, and "url" will be the
-        place the article is suspected of being copied from. "queries" is the
-        number of queries used to determine the results.
+        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
+        object with information on the results of the check.
 
-        "max_queries" is self-explanatory; we will never make more than this
-        number of queries in a given check. If it's less than 0, we will not
-        limit our number of queries.
+        *max_queries* is self-explanatory; we will never make more than this
+        number of queries in a given check. If it's lower than 0, we will not
+        limit the number of queries.
 
-        "interquery_sleep" is the minimum amount of time we will sleep between
+        *interquery_sleep* is the minimum amount of time we will sleep between
         search engine queries, in seconds.
 
-        "force" is simply passed to page.get() - it has the same behavior there
-        as it does here.
-
-        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
-        SearchQueryError, ...) on errors.
+        Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
+        (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
+        :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
         """
         searcher = self._select_search_engine()
         handled_urls = []
@@ -166,9 +175,9 @@ class CopyvioMixIn(object):
         num_queries = 0
         empty = MarkovChain("")
         best_chains = (empty, MarkovChainIntersection(empty, empty))
-        content = self.get(force)
-        clean = ArticleTextParser(content).strip()
-        chunks = ArticleTextParser(clean).chunk(max_queries)
+        parser = ArticleTextParser(self.get())
+        clean = parser.strip()
+        chunks = parser.chunk(max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
 
@@ -200,13 +209,14 @@ class CopyvioMixIn(object):
         return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
 
-    def copyvio_compare(self, url, min_confidence=0.5, force=False):
-        """Check the page like copyvio_check(), but against a specific URL.
+    def copyvio_compare(self, url, min_confidence=0.5):
+        """Check the page like :py:meth:`copyvio_check` against a specific URL.
 
         This is essentially a reduced version of the above - a copyivo
         comparison is made using Markov chains and the result is returned in a
-        _CopyvioCheckResult object - without using a search engine, as the
-        suspected "violated" URL is supplied from the start.
+        :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
+        without using a search engine, since the suspected "violated" URL is
+        supplied from the start.
 
         Its primary use is to generate a result when the URL is retrieved from
         a cache, like the one used in EarwigBot's Toolserver site. After a
@@ -217,10 +227,11 @@ class CopyvioMixIn(object):
         be stored for data retention reasons, so a fresh comparison is made
         using this function.
 
-        Since no searching is done, neither UnknownSearchEngineError nor
-        SearchQueryError will be raised.
+        Since no searching is done, neither
+        :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
         """
-        content = self.get(force)
+        content = self.get()
         clean = ArticleTextParser(content).strip()
         article_chain = MarkovChain(clean)
         confidence, chains = self._copyvio_compare_content(article_chain, url)
diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py
index 081469f..657b4b9 100644
--- a/earwigbot/wiki/copyvios/markov.py
+++ b/earwigbot/wiki/copyvios/markov.py
@@ -26,6 +26,7 @@ from re import sub, UNICODE
 __all__ = ["MarkovChain", "MarkovChainIntersection"]
 
 class MarkovChain(object):
+    """Implements a basic bigram Markov chain of words."""
     START = -1
     END = -2
 
@@ -51,6 +52,7 @@ class MarkovChain(object):
         return "<MarkovChain of size {0}>".format(self.size())
 
     def size(self):
+        """Return the size of the Markov chain: the total number of nodes."""
         count = 0
         for node in self.chain.itervalues():
             for hits in node.itervalues():
@@ -59,6 +61,8 @@ class MarkovChain(object):
 
 
 class MarkovChainIntersection(MarkovChain):
+    """Implements the intersection of two chains (i.e., their shared nodes)."""
+
     def __init__(self, mc1, mc2):
         self.chain = defaultdict(lambda: defaultdict(lambda: 0))
         self.mc1, self.mc2 = mc1, mc2
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 0c3c17b..8a31127 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,9 +20,19 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+try:
+    import mwparserfromhell
+except ImportError:
+    mwparserfromhell = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 
 class BaseTextParser(object):
+    """Base class for a parser that handles text."""
+
+    def __init__(self, text):
+        self.text = text
+
     def __repr__(self):
         """Return the canonical string representation of the text parser."""
         return "{0}(text={1!r})".format(self.__class__.__name__, self.text)
@@ -32,60 +42,40 @@ class BaseTextParser(object):
         name = self.__class__.__name__
         return "<{0} of text with size {1}>".format(name, len(text))
 
-    def __init__(self, text):
-        self.text = text
-
 
 class ArticleTextParser(BaseTextParser):
+    """A parser that can strip and chunk wikicode article text."""
+
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
 
-        Returns the page's text with all HTML and wikicode formatting removed,
-        including templates, tables, references, and the Bibliography/
-        References/Sources/See also section(s). It retains punctuation
+        Return the page's text with all HTML and wikicode formatting removed,
+        including templates, tables, and references. It retains punctuation
         (spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
-        quotes) and original capitalization, but not brackets (square and
-        angular), abnormal spacing, nor anything else. HTML entities are
+        quotes), original capitalization, and so forth. HTML entities are
         replaced by their unicode equivalents.
 
-        The actual replacement is handled by a few private methods within this
-        class.
+        The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
-        text = self._strip_tags(self.text)
-        text = self._strip_templates(text)
-        text = self._strip_sections(text)
-        text = self._strip_wikicode(text)
-        text = self._normalize(text)
-        return text
+        wikicode = mwparserfromhell.parse(self.text)
+        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        return self.clean
 
     def chunk(self, max_chunks):
-        """Convert the article text into a list of web-searchable chunks.
+        """Convert the clean article text into a list of web-searchable chunks.
 
-        No greater than max_chunks will be returned. Each chunk will only be a
-        couple sentences long at most. The idea here is to return a
+        No more than *max_chunks* chunks will be returned. Each chunk will only
+        be a couple of sentences long at most. The idea here is to return a
         representative sample of the article text rather than the entire
         article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and max_chunks are
-        few, so we don't end up just searching for the first paragraph.
+        and conclusion, especially if the article is large and *max_chunks* is
+        low, so we don't end up just searching for the first paragraph.
         """
-        return [self.text]
-
-    def _strip_tags(self, text):
-        return text
-
-    def _strip_templates(self, text):
-        return text
-
-    def _strip_sections(self, text):
-        return text
-
-    def _strip_wikicode(self, text):
-        return text
-
-    def _normalize(self, text):
-        return text
+        return [self.text]                                                                          # TODO: NotImplemented
 
 
 class HTMLTextParser(BaseTextParser):
+    """A parser that can extract the text from an HTML document."""
+
     def strip(self):
-        return self.text
+        return self.text                                                                            # TODO: NotImplemented
diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py
index 4345b29..ac40613 100644
--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError
 __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
+    """Base class for a simple search engine interface."""
+
     def __init__(self, cred):
-        """Store credentials 'cred' for searching later on."""
+        """Store credentials *cred* for searching later on."""
         self.cred = cred
 
     def __repr__(self):
@@ -46,25 +48,30 @@ class BaseSearchEngine(object):
         return "<{0}>".format(self.__class__.__name__)
 
     def search(self, query):
-        """Use this engine to search for 'query'.
+        """Use this engine to search for *query*.
 
-        Not implemented in this base class; overridden in subclasses."""
+        Not implemented in this base class; overridden in subclasses.
+        """
         raise NotImplementedError()
 
 
 class YahooBOSSSearchEngine(BaseSearchEngine):
+    """A search engine interface with Yahoo! BOSS."""
+
     def search(self, query):
-        """Do a Yahoo! BOSS web search for 'query'.
+        """Do a Yahoo! BOSS web search for *query*.
 
         Returns a list of URLs, no more than fifty, ranked by relevance (as
-        determined by Yahoo). Raises SearchQueryError() on errors.
+        determined by Yahoo). Raises
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
         """
         base_url = "http://yboss.yahooapis.com/ysearch/web"
         query = quote_plus(query.join('"', '"'))
         params = {"q": query, "style": "raw", "format": "json"}
         url = "{0}?{1}".format(base_url, urlencode(params))
 
-        consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"])
+        consumer = oauth.Consumer(key=self.cred["key"],
+                                  secret=self.cred["secret"])
         client = oauth.Client(consumer)
         headers, body = client.request(url, "GET")