From d45e342bac59c8587c8e34c2c794023452ef6fda Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 22:55:23 -0400 Subject: [PATCH] DOCUMENT EVERYTHING (#5) Also implementing MWParserFromHell, plus some cleanup. --- docs/api/earwigbot.wiki.copyvios.rst | 33 +++++++++++++ docs/api/earwigbot.wiki.rst | 14 +++--- earwigbot/wiki/copyvios/__init__.py | 91 ++++++++++++++++++++---------------- earwigbot/wiki/copyvios/markov.py | 4 ++ earwigbot/wiki/copyvios/parsers.py | 66 +++++++++++--------------- earwigbot/wiki/copyvios/search.py | 19 +++++--- 6 files changed, 136 insertions(+), 91 deletions(-) create mode 100644 docs/api/earwigbot.wiki.copyvios.rst diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst new file mode 100644 index 0000000..7dbcf39 --- /dev/null +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -0,0 +1,33 @@ +copyvios Package +================ + +:mod:`copyvios` Package +----------------------- + +.. automodule:: earwigbot.wiki.copyvios + :members: + :undoc-members: + +:mod:`markov` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.markov + :members: + :undoc-members: + :show-inheritance: + +:mod:`parsers` Module +--------------------- + +.. automodule:: earwigbot.wiki.copyvios.parsers + :members: + :undoc-members: + :show-inheritance: + +:mod:`search` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.search + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/earwigbot.wiki.rst b/docs/api/earwigbot.wiki.rst index 806b3eb..45b009b 100644 --- a/docs/api/earwigbot.wiki.rst +++ b/docs/api/earwigbot.wiki.rst @@ -22,13 +22,6 @@ wiki Package :members: :undoc-members: -:mod:`copyright` Module ------------------------ - -.. automodule:: earwigbot.wiki.copyright - :members: - :undoc-members: - :mod:`page` Module ------------------ @@ -57,3 +50,10 @@ wiki Package .. automodule:: earwigbot.wiki.user :members: :undoc-members: + +Subpackages +----------- + +.. toctree:: + + earwigbot.wiki.copyvios diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index f85ab22..2c2bb23 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine __all__ = ["CopyvioCheckResult", "CopyvioMixIn"] class CopyvioCheckResult(object): + """ + **EarwigBot: Wiki Toolset: Copyvio Check Result** + + A class holding information about the results of a copyvio check. + + *Attributes:* + + - :py:attr:`violation`: ``True`` if this is a violation, else ``False`` + - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy + - :py:attr:`url`: the URL of the violated page + - :py:attr:`queries`: the number of queries used to reach a result + - :py:attr:`article_chain`: the MarkovChain of the article text + - :py:attr:`source_chain`: the MarkovChain of the violated page text + - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two + """ + def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -61,14 +77,15 @@ class CopyvioCheckResult(object): class CopyvioMixIn(object): """ - EarwigBot's Wiki Toolset: Copyright Violation Mixin + **EarwigBot: Wiki Toolset: Copyright Violation MixIn** - This is a mixin that provides two public methods, copyvio_check() and - copyvio_compare(). The former checks the page for copyright violations - using a search engine API, and the latter compares the page against a - specified URL. Credentials for the search engine API are stored in the - site's config. + This is a mixin that provides two public methods, :py:meth:`copyvio_check` + and :py:meth:`copyvio_compare`. The former checks the page for copyright + violations using a search engine API, and the latter compares the page + against a given URL. Credentials for the search engine API are stored in + the :py:class:`~earwigbot.wiki.site.Site`'s config. """ + def __init__(self, site): self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -100,10 +117,10 @@ class CopyvioMixIn(object): def _select_search_engine(self): """Return a function that can be called to do web searches. - The "function" is a functools.partial object that takes one argument, a - query, and returns a list of URLs, ranked by importance. The underlying - logic depends on the 'engine' argument; for example, if 'engine' is - "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. + The function takes one argument, a search query, and returns a list of + URLs, ranked by importance. The underlying logic depends on the + *engine* argument within our config; for example, if *engine* is + "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to us, and UnsupportedSearchEngineError if we are missing a @@ -122,8 +139,8 @@ class CopyvioMixIn(object): def _copyvio_compare_content(self, article, url): """Return a number comparing an article and a URL. - The *article* is a Markov chain, whereas the URL is a string that we - will try to open ourselves. + The *article* is a Markov chain, whereas the *url* is just a string + that we'll try to open and read ourselves. """ html = self._open_url_ignoring_errors(url) if not html: @@ -134,30 +151,22 @@ class CopyvioMixIn(object): return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, - interquery_sleep=1, force=False): + interquery_sleep=1): """Check the page for copyright violations. - Returns a _CopyvioCheckResult object with four useful attributes: - "violation", "confidence", "url", and "queries". "confidence" is a - number between 0 and 1; if it is less than "min_confidence", we could - not find any indication of a violation (so "violation" will be False - and "url" may or may not be None), otherwise it indicates the relative - faith in our results, "violation" will be True, and "url" will be the - place the article is suspected of being copied from. "queries" is the - number of queries used to determine the results. + Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` + object with information on the results of the check. - "max_queries" is self-explanatory; we will never make more than this - number of queries in a given check. If it's less than 0, we will not - limit our number of queries. + *max_queries* is self-explanatory; we will never make more than this + number of queries in a given check. If it's lower than 0, we will not + limit the number of queries. - "interquery_sleep" is the minimum amount of time we will sleep between + *interquery_sleep* is the minimum amount of time we will sleep between search engine queries, in seconds. - "force" is simply passed to page.get() - it has the same behavior there - as it does here. - - Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, - SearchQueryError, ...) on errors. + Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses + (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, + :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. """ searcher = self._select_search_engine() handled_urls = [] @@ -166,9 +175,9 @@ class CopyvioMixIn(object): num_queries = 0 empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) - content = self.get(force) - clean = ArticleTextParser(content).strip() - chunks = ArticleTextParser(clean).chunk(max_queries) + parser = ArticleTextParser(self.get()) + clean = parser.strip() + chunks = parser.chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -200,13 +209,14 @@ class CopyvioMixIn(object): return CopyvioCheckResult(v, best_confidence, best_match, num_queries, article_chain, best_chains) - def copyvio_compare(self, url, min_confidence=0.5, force=False): - """Check the page like copyvio_check(), but against a specific URL. + def copyvio_compare(self, url, min_confidence=0.5): + """Check the page like :py:meth:`copyvio_check` against a specific URL. This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - _CopyvioCheckResult object - without using a search engine, as the - suspected "violated" URL is supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but + without using a search engine, since the suspected "violated" URL is + supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a @@ -217,10 +227,11 @@ class CopyvioMixIn(object): be stored for data retention reasons, so a fresh comparison is made using this function. - Since no searching is done, neither UnknownSearchEngineError nor - SearchQueryError will be raised. + Since no searching is done, neither + :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor + :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. """ - content = self.get(force) + content = self.get() clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 081469f..657b4b9 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -26,6 +26,7 @@ from re import sub, UNICODE __all__ = ["MarkovChain", "MarkovChainIntersection"] class MarkovChain(object): + """Implements a basic bigram Markov chain of words.""" START = -1 END = -2 @@ -51,6 +52,7 @@ class MarkovChain(object): return "".format(self.size()) def size(self): + """Return the size of the Markov chain: the total number of nodes.""" count = 0 for node in self.chain.itervalues(): for hits in node.itervalues(): @@ -59,6 +61,8 @@ class MarkovChain(object): class MarkovChainIntersection(MarkovChain): + """Implements the intersection of two chains (i.e., their shared nodes).""" + def __init__(self, mc1, mc2): self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.mc1, self.mc2 = mc1, mc2 diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 0c3c17b..8a31127 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,9 +20,19 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +try: + import mwparserfromhell +except ImportError: + mwparserfromhell = None + __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): + """Base class for a parser that handles text.""" + + def __init__(self, text): + self.text = text + def __repr__(self): """Return the canonical string representation of the text parser.""" return "{0}(text={1!r})".format(self.__class__.__name__, self.text) @@ -32,60 +42,40 @@ class BaseTextParser(object): name = self.__class__.__name__ return "<{0} of text with size {1}>".format(name, len(text)) - def __init__(self, text): - self.text = text - class ArticleTextParser(BaseTextParser): + """A parser that can strip and chunk wikicode article text.""" + def strip(self): """Clean the page's raw text by removing templates and formatting. - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation + Return the page's text with all HTML and wikicode formatting removed, + including templates, tables, and references. It retains punctuation (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are + quotes), original capitalization, and so forth. HTML entities are replaced by their unicode equivalents. - The actual replacement is handled by a few private methods within this - class. + The actual stripping is handled by :py:mod:`mwparserfromhell`. """ - text = self._strip_tags(self.text) - text = self._strip_templates(text) - text = self._strip_sections(text) - text = self._strip_wikicode(text) - text = self._normalize(text) - return text + wikicode = mwparserfromhell.parse(self.text) + self.clean = u" ".join(wikicode.normalize().ifilter_text()) + return self.clean def chunk(self, max_chunks): - """Convert the article text into a list of web-searchable chunks. + """Convert the clean article text into a list of web-searchable chunks. - No greater than max_chunks will be returned. Each chunk will only be a - couple sentences long at most. The idea here is to return a + No greater than *max_chunks* will be returned. Each chunk will only be + a couple sentences long at most. The idea here is to return a representative sample of the article text rather than the entire article, so we'll probably pick and choose from its introduction, body, - and conclusion, especially if the article is large and max_chunks are - few, so we don't end up just searching for the first paragraph. + and conclusion, especially if the article is large and *max_chunks* is + low, so we don't end up just searching for the first paragraph. """ - return [self.text] - - def _strip_tags(self, text): - return text - - def _strip_templates(self, text): - return text - - def _strip_sections(self, text): - return text - - def _strip_wikicode(self, text): - return text - - def _normalize(self, text): - return text + return [self.text] # TODO: NotImplemented class HTMLTextParser(BaseTextParser): + """A parser that can extract the text from an HTML document.""" + def strip(self): - return self.text + return self.text # TODO: NotImplemented diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 4345b29..ac40613 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): + """Base class for a simple search engine interface.""" + def __init__(self, cred): - """Store credentials 'cred' for searching later on.""" + """Store credentials *cred* for searching later on.""" self.cred = cred def __repr__(self): @@ -46,25 +48,30 @@ class BaseSearchEngine(object): return "<{0}>".format(self.__class__.__name__) def search(self, query): - """Use this engine to search for 'query'. + """Use this engine to search for *query*. - Not implemented in this base class; overridden in subclasses.""" + Not implemented in this base class; overridden in subclasses. + """ raise NotImplementedError() class YahooBOSSSearchEngine(BaseSearchEngine): + """A search engine interface with Yahoo! BOSS.""" + def search(self, query): - """Do a Yahoo! BOSS web search for 'query'. + """Do a Yahoo! BOSS web search for *query*. Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. + determined by Yahoo). Raises + :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ base_url = "http://yboss.yahooapis.com/ysearch/web" query = quote_plus(query.join('"', '"')) params = {"q": query, "style": "raw", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) - consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"]) + consumer = oauth.Consumer(key=self.cred["key"], + secret=self.cred["secret"]) client = oauth.Client(consumer) headers, body = client.request(url, "GET")