
DOCUMENT EVERYTHING (#5)

Also implementing MWParserFromHell, plus some cleanup.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit d45e342bac
6 changed files with 136 additions and 90 deletions
1. docs/api/earwigbot.wiki.copyvios.rst (+33, -0)
2. docs/api/earwigbot.wiki.rst (+7, -6)
3. earwigbot/wiki/copyvios/__init__.py (+51, -40)
4. earwigbot/wiki/copyvios/markov.py (+4, -0)
5. earwigbot/wiki/copyvios/parsers.py (+28, -38)
6. earwigbot/wiki/copyvios/search.py (+13, -6)

docs/api/earwigbot.wiki.copyvios.rst (+33, -0)

@@ -0,0 +1,33 @@
+copyvios Package
+================
+
+:mod:`copyvios` Package
+-----------------------
+
+.. automodule:: earwigbot.wiki.copyvios
+    :members:
+    :undoc-members:
+
+:mod:`markov` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.markov
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`parsers` Module
+---------------------
+
+.. automodule:: earwigbot.wiki.copyvios.parsers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`search` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.search
+    :members:
+    :undoc-members:
+    :show-inheritance:

docs/api/earwigbot.wiki.rst (+7, -6)

@@ -22,13 +22,6 @@ wiki Package
     :members:
     :undoc-members:
 
-:mod:`copyright` Module
------------------------
-
-.. automodule:: earwigbot.wiki.copyright
-    :members:
-    :undoc-members:
-
 :mod:`page` Module
 ------------------
 
@@ -57,3 +50,10 @@ wiki Package
 .. automodule:: earwigbot.wiki.user
     :members:
     :undoc-members:
+
+Subpackages
+-----------
+
+.. toctree::
+
+    earwigbot.wiki.copyvios

earwigbot/wiki/copyvios/__init__.py (+51, -40)

@@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 __all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
 
 class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
     def __init__(self, violation, confidence, url, queries, article, chains):
         self.violation = violation
         self.confidence = confidence
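Not part of the diff, but given the attribute list above, a result object might be consumed roughly like this (a sketch with made-up values; `article_chain`, `source_chain`, and `delta_chain` stand for chains built elsewhere):

    # Hypothetical values; judging by the attribute list, `chains` presumably
    # carries the source and delta chains while `article` becomes article_chain.
    result = CopyvioCheckResult(violation=True, confidence=0.87,
                                url="http://example.com/some-page",
                                queries=6, article=article_chain,
                                chains=(source_chain, delta_chain))
    if result.violation:
        print result.url, result.confidence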
@@ -61,14 +77,15 @@ class CopyvioCheckResult(object):
 
 class CopyvioMixIn(object):
     """
-    EarwigBot's Wiki Toolset: Copyright Violation Mixin
+    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**
 
-    This is a mixin that provides two public methods, copyvio_check() and
-    copyvio_compare(). The former checks the page for copyright violations
-    using a search engine API, and the latter compares the page against a
-    specified URL. Credentials for the search engine API are stored in the
-    site's config.
+    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
+    and :py:meth:`copyvio_compare`. The former checks the page for copyright
+    violations using a search engine API, and the latter compares the page
+    against a given URL. Credentials for the search engine API are stored in
+    the :py:class:`~earwigbot.wiki.site.Site`'s config.
     """
+
     def __init__(self, site):
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
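As a reading aid (not in the commit): the mixin is meant to be inherited by a page class that already implements get(), which is how both public methods obtain the article text. A minimal, hypothetical sketch of the pattern:

    # Hypothetical SimplePage; EarwigBot's real Page class inherits
    # CopyvioMixIn the same way and supplies get() for the page text.
    class SimplePage(CopyvioMixIn):
        def __init__(self, site, title):
            super(SimplePage, self).__init__(site)  # copies the site's opener headers
            self.title = title

        def get(self):
            return u"...page wikitext..."  # copyvio_check() reads text via get()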
@@ -100,10 +117,10 @@ class CopyvioMixIn(object):
     def _select_search_engine(self):
         """Return a function that can be called to do web searches.
 
-        The "function" is a functools.partial object that takes one argument, a
-        query, and returns a list of URLs, ranked by importance. The underlying
-        logic depends on the 'engine' argument; for example, if 'engine' is
-        "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
+        The function takes one argument, a search query, and returns a list of
+        URLs, ranked by importance. The underlying logic depends on the
+        *engine* argument within our config; for example, if *engine* is
+        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.
 
         Raises UnknownSearchEngineError if the 'engine' listed in our config is
         unknown to us, and UnsupportedSearchEngineError if we are missing a
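The hunk cuts the docstring off here, but the selection logic it describes amounts to roughly this (a sketch only; the config shape and function name are hypothetical, while the engine class and exception are the real ones referenced above):

    from earwigbot.exceptions import UnknownSearchEngineError

    def select_search_engine(search_config):
        # Dispatch on the configured engine name, as the docstring describes.
        engine, cred = search_config["engine"], search_config["credentials"]
        if engine == "Yahoo! BOSS":
            return YahooBOSSSearchEngine(cred)
        raise UnknownSearchEngineError(engine)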
@@ -122,8 +139,8 @@ class CopyvioMixIn(object):
     def _copyvio_compare_content(self, article, url):
         """Return a number comparing an article and a URL.
 
-        The *article* is a Markov chain, whereas the URL is a string that we
-        will try to open ourselves.
+        The *article* is a Markov chain, whereas the *url* is just a string
+        that we'll try to open and read ourselves.
         """
         html = self._open_url_ignoring_errors(url)
         if not html:
@@ -134,30 +151,22 @@ class CopyvioMixIn(object):
         return float(delta.size()) / article.size(), (source, delta)
 
     def copyvio_check(self, min_confidence=0.5, max_queries=-1,
-                      interquery_sleep=1, force=False):
+                      interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a _CopyvioCheckResult object with four useful attributes:
-        "violation", "confidence", "url", and "queries". "confidence" is a
-        number between 0 and 1; if it is less than "min_confidence", we could
-        not find any indication of a violation (so "violation" will be False
-        and "url" may or may not be None), otherwise it indicates the relative
-        faith in our results, "violation" will be True, and "url" will be the
-        place the article is suspected of being copied from. "queries" is the
-        number of queries used to determine the results.
+        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
+        object with information on the results of the check.
 
-        "max_queries" is self-explanatory; we will never make more than this
-        number of queries in a given check. If it's less than 0, we will not
-        limit our number of queries.
+        *max_queries* is self-explanatory; we will never make more than this
+        number of queries in a given check. If it's lower than 0, we will not
+        limit the number of queries.
 
-        "interquery_sleep" is the minimum amount of time we will sleep between
+        *interquery_sleep* is the minimum amount of time we will sleep between
         search engine queries, in seconds.
 
-        "force" is simply passed to page.get() - it has the same behavior there
-        as it does here.
-
-        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
-        SearchQueryError, ...) on errors.
+        Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
+        (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
+        :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
         """
         searcher = self._select_search_engine()
         handled_urls = []
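A usage sketch of the new signature (hypothetical `page` object and thresholds; note that `force` is gone, since the method now fetches the text itself via self.get()):

    result = page.copyvio_check(min_confidence=0.75, max_queries=15,
                                interquery_sleep=2)
    if result.violation:
        print "Possible copyvio of", result.url
    print "confidence:", result.confidence, "queries used:", result.queries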
@@ -166,9 +175,9 @@ class CopyvioMixIn(object):
         num_queries = 0
         empty = MarkovChain("")
         best_chains = (empty, MarkovChainIntersection(empty, empty))
-        content = self.get(force)
-        clean = ArticleTextParser(content).strip()
-        chunks = ArticleTextParser(clean).chunk(max_queries)
+        parser = ArticleTextParser(self.get())
+        clean = parser.strip()
+        chunks = parser.chunk(max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()

@@ -200,13 +209,14 @@ class CopyvioMixIn(object):
         return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
 
-    def copyvio_compare(self, url, min_confidence=0.5, force=False):
-        """Check the page like copyvio_check(), but against a specific URL.
+    def copyvio_compare(self, url, min_confidence=0.5):
+        """Check the page like :py:meth:`copyvio_check` against a specific URL.
 
         This is essentially a reduced version of the above - a copyivo
         comparison is made using Markov chains and the result is returned in a
-        _CopyvioCheckResult object - without using a search engine, as the
-        suspected "violated" URL is supplied from the start.
+        :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
+        without using a search engine, since the suspected "violated" URL is
+        supplied from the start.
 
         Its primary use is to generate a result when the URL is retrieved from
         a cache, like the one used in EarwigBot's Toolserver site. After a
@@ -217,10 +227,11 @@ class CopyvioMixIn(object):
         be stored for data retention reasons, so a fresh comparison is made
         using this function.
 
-        Since no searching is done, neither UnknownSearchEngineError nor
-        SearchQueryError will be raised.
+        Since no searching is done, neither
+        :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
         """
-        content = self.get(force)
+        content = self.get()
         clean = ArticleTextParser(content).strip()
         article_chain = MarkovChain(clean)
         confidence, chains = self._copyvio_compare_content(article_chain, url)
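And the companion sketch for copyvio_compare (again with a hypothetical `page`; no search queries are made, so only the one URL is fetched):

    result = page.copyvio_compare("http://example.com/cached-source",
                                  min_confidence=0.75)
    print result.violation, result.confidence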


earwigbot/wiki/copyvios/markov.py (+4, -0)

@@ -26,6 +26,7 @@ from re import sub, UNICODE
 __all__ = ["MarkovChain", "MarkovChainIntersection"]
 
 class MarkovChain(object):
+    """Implements a basic bigram Markov chain of words."""
     START = -1
     END = -2
 
@@ -51,6 +52,7 @@ class MarkovChain(object):
         return "<MarkovChain of size {0}>".format(self.size())
 
     def size(self):
+        """Return the size of the Markov chain: the total number of nodes."""
         count = 0
         for node in self.chain.itervalues():
             for hits in node.itervalues():
@@ -59,6 +61,8 @@ class MarkovChain(object):
 
 
 class MarkovChainIntersection(MarkovChain):
+    """Implements the intersection of two chains (i.e., their shared nodes)."""
+
     def __init__(self, mc1, mc2):
         self.chain = defaultdict(lambda: defaultdict(lambda: 0))
         self.mc1, self.mc2 = mc1, mc2
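For readers new to the data structure: the chain maps each word to a counter of the words that follow it, bracketed by the START and END sentinels, and size() sums every transition count. A self-contained sketch of the same idea (simplified; not the class's exact code, which also regex-cleans the text):

    from collections import defaultdict

    START, END = -1, -2  # sentinels bracketing the text

    def build_bigram_chain(text):
        # Map each word to a counter of the words that directly follow it.
        chain = defaultdict(lambda: defaultdict(int))
        words = [START] + text.lower().split() + [END]
        for word, following in zip(words, words[1:]):
            chain[word][following] += 1
        return chain

    def chain_size(chain):
        # Sum of all transition counts, which is what MarkovChain.size() computes.
        return sum(sum(node.values()) for node in chain.values())

    demo = build_bigram_chain(u"the cat sat on the mat")
    assert chain_size(demo) == 7  # seven transitions: START->the ... mat->END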


earwigbot/wiki/copyvios/parsers.py (+28, -38)

@@ -20,9 +20,19 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+try:
+    import mwparserfromhell
+except ImportError:
+    mwparserfromhell = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 
 class BaseTextParser(object):
+    """Base class for a parser that handles text."""
+
+    def __init__(self, text):
+        self.text = text
+
     def __repr__(self):
         """Return the canonical string representation of the text parser."""
         return "{0}(text={1!r})".format(self.__class__.__name__, self.text)
@@ -32,60 +42,40 @@ class BaseTextParser(object):
         name = self.__class__.__name__
         return "<{0} of text with size {1}>".format(name, len(text))
 
-    def __init__(self, text):
-        self.text = text
-
 
 class ArticleTextParser(BaseTextParser):
+    """A parser that can strip and chunk wikicode article text."""
+
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
 
-        Returns the page's text with all HTML and wikicode formatting removed,
-        including templates, tables, references, and the Bibliography/
-        References/Sources/See also section(s). It retains punctuation
+        Return the page's text with all HTML and wikicode formatting removed,
+        including templates, tables, and references. It retains punctuation
         (spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
-        quotes) and original capitalization, but not brackets (square and
-        angular), abnormal spacing, nor anything else. HTML entities are
+        quotes), original capitalization, and so forth. HTML entities are
         replaced by their unicode equivalents.
 
-        The actual replacement is handled by a few private methods within this
-        class.
+        The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
-        text = self._strip_tags(self.text)
-        text = self._strip_templates(text)
-        text = self._strip_sections(text)
-        text = self._strip_wikicode(text)
-        text = self._normalize(text)
-        return text
+        wikicode = mwparserfromhell.parse(self.text)
+        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        return self.clean
 
     def chunk(self, max_chunks):
-        """Convert the article text into a list of web-searchable chunks.
+        """Convert the clean article text into a list of web-searchable chunks.
 
-        No greater than max_chunks will be returned. Each chunk will only be a
-        couple sentences long at most. The idea here is to return a
+        No greater than *max_chunks* will be returned. Each chunk will only be
+        a couple sentences long at most. The idea here is to return a
         representative sample of the article text rather than the entire
         article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and max_chunks are
-        few, so we don't end up just searching for the first paragraph.
+        and conclusion, especially if the article is large and *max_chunks* is
+        low, so we don't end up just searching for the first paragraph.
         """
-        return [self.text]
-
-    def _strip_tags(self, text):
-        return text
-
-    def _strip_templates(self, text):
-        return text
-
-    def _strip_sections(self, text):
-        return text
-
-    def _strip_wikicode(self, text):
-        return text
-
-    def _normalize(self, text):
-        return text
+        return [self.text] # TODO: NotImplemented
 
 
 class HTMLTextParser(BaseTextParser):
+    """A parser that can extract the text from an HTML document."""
+
     def strip(self):
-        return self.text
+        return self.text # TODO: NotImplemented
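Aside on the new strip() implementation: normalize() and ifilter_text() reflect the mwparserfromhell API of this era, which is why the import at the top of the file is guarded. In later releases of the library the same stripping is a one-liner; a sketch assuming a modern mwparserfromhell is installed:

    import mwparserfromhell

    text = u"'''Example''' is a [[term]] used in {{some template}} docs."
    wikicode = mwparserfromhell.parse(text)
    # strip_code() drops templates and markup and keeps the readable text,
    # much like ArticleTextParser.strip() above (output shown approximately).
    clean = wikicode.strip_code()  # roughly u"Example is a term used in docs."

chunk() and HTMLTextParser.strip(), by contrast, remain stubs here, as the new TODO comments note.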

earwigbot/wiki/copyvios/search.py (+13, -6)

@@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError
 __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
+    """Base class for a simple search engine interface."""
+
     def __init__(self, cred):
-        """Store credentials 'cred' for searching later on."""
+        """Store credentials *cred* for searching later on."""
         self.cred = cred
 
     def __repr__(self):
@@ -46,25 +48,30 @@ class BaseSearchEngine(object):
         return "<{0}>".format(self.__class__.__name__)
 
     def search(self, query):
-        """Use this engine to search for 'query'.
+        """Use this engine to search for *query*.
 
-        Not implemented in this base class; overridden in subclasses."""
+        Not implemented in this base class; overridden in subclasses.
+        """
         raise NotImplementedError()
 
 
 class YahooBOSSSearchEngine(BaseSearchEngine):
+    """A search engine interface with Yahoo! BOSS."""
+
     def search(self, query):
-        """Do a Yahoo! BOSS web search for 'query'.
+        """Do a Yahoo! BOSS web search for *query*.
 
         Returns a list of URLs, no more than fifty, ranked by relevance (as
-        determined by Yahoo). Raises SearchQueryError() on errors.
+        determined by Yahoo). Raises
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
         """
         base_url = "http://yboss.yahooapis.com/ysearch/web"
         query = quote_plus(query.join('"', '"'))
         params = {"q": query, "style": "raw", "format": "json"}
         url = "{0}?{1}".format(base_url, urlencode(params))
 
-        consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"])
+        consumer = oauth.Consumer(key=self.cred["key"],
+                                  secret=self.cred["secret"])
         client = oauth.Client(consumer)
         headers, body = client.request(url, "GET")
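The hunk ends before the response handling, but presumably the JSON body is then decoded and the result URLs extracted. A sketch of what that continuation might look like (the "bossresponse" layout comes from Yahoo!'s BOSS documentation, not from this diff; `headers` and `body` are the values returned by client.request() above, and SearchQueryError is already imported in this file):

    from json import loads

    if headers["status"] != "200":  # httplib2-style response; status is a string
        raise SearchQueryError("Yahoo! BOSS Error: response code " +
                               headers["status"])
    # Hypothetical parsing per the BOSS docs: ranked results live under
    # bossresponse -> web -> results, each with a "url" field.
    results = loads(body)["bossresponse"]["web"]["results"]
    urls = [result["url"] for result in results]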


