Also implementing MWParserFromHell, plus some cleanup.
@@ -0,0 +1,33 @@
+copyvios Package
+================
+
+:mod:`copyvios` Package
+-----------------------
+
+.. automodule:: earwigbot.wiki.copyvios
+    :members:
+    :undoc-members:
+
+:mod:`markov` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.markov
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`parsers` Module
+---------------------
+
+.. automodule:: earwigbot.wiki.copyvios.parsers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`search` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.search
+    :members:
+    :undoc-members:
+    :show-inheritance:
@@ -22,13 +22,6 @@ wiki Package
     :members:
     :undoc-members:
 
-:mod:`copyright` Module
------------------------
-
-.. automodule:: earwigbot.wiki.copyright
-    :members:
-    :undoc-members:
-
 :mod:`page` Module
 ------------------
@@ -57,3 +50,10 @@ wiki Package
 .. automodule:: earwigbot.wiki.user
     :members:
     :undoc-members:
+
+Subpackages
+-----------
+
+.. toctree::
+
+    earwigbot.wiki.copyvios
@@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 __all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
 
 class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
     def __init__(self, violation, confidence, url, queries, article, chains):
         self.violation = violation
         self.confidence = confidence
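
For orientation, here is a minimal sketch of reading these attributes off a check result. The `page` object is hypothetical; `copyvio_check` itself appears in a later hunk:

    # Illustrative only: `page` is any object using CopyvioMixIn
    result = page.copyvio_check(max_queries=10)
    if result.violation:
        print "Suspected copy of {0} ({1:.0%} confidence)".format(
            result.url, result.confidence)
    print "Reached a result in {0} queries".format(result.queries)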
@@ -61,14 +77,15 @@ class CopyvioCheckResult(object):
 
 class CopyvioMixIn(object):
     """
-    EarwigBot's Wiki Toolset: Copyright Violation Mixin
+    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**
 
-    This is a mixin that provides two public methods, copyvio_check() and
-    copyvio_compare(). The former checks the page for copyright violations
-    using a search engine API, and the latter compares the page against a
-    specified URL. Credentials for the search engine API are stored in the
-    site's config.
+    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
+    and :py:meth:`copyvio_compare`. The former checks the page for copyright
+    violations using a search engine API, and the latter compares the page
+    against a given URL. Credentials for the search engine API are stored in
+    the :py:class:`~earwigbot.wiki.site.Site`'s config.
     """
 
     def __init__(self, site):
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
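
As a hedged aside, a mixin like this is consumed through multiple inheritance. Only the `__init__(self, site)` signature is taken from the hunk above; the `Page` class sketched here is assumed, and must provide the `get()` method the mixin calls later:

    class Page(CopyvioMixIn):
        def __init__(self, site, title):
            CopyvioMixIn.__init__(self, site)
            self.title = title

        def get(self):
            return u""  # stub: would fetch and return the page's wikicode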
@@ -100,10 +117,10 @@ class CopyvioMixIn(object):
     def _select_search_engine(self):
         """Return a function that can be called to do web searches.
 
-        The "function" is a functools.partial object that takes one argument, a
-        query, and returns a list of URLs, ranked by importance. The underlying
-        logic depends on the 'engine' argument; for example, if 'engine' is
-        "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
+        The function takes one argument, a search query, and returns a list of
+        URLs, ranked by importance. The underlying logic depends on the
+        *engine* argument within our config; for example, if *engine* is
+        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.
 
         Raises UnknownSearchEngineError if the 'engine' listed in our config is
         unknown to us, and UnsupportedSearchEngineError if we are missing a
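
A minimal sketch of the dispatch this docstring describes, under the assumption that the engine name and credentials live in a config dict; the attribute and key names here are assumptions, not shown in this hunk:

    from earwigbot.exceptions import UnknownSearchEngineError

    def _select_search_engine(self):
        engine = self._search_config["engine"]            # assumed key
        credentials = self._search_config["credentials"]  # assumed key
        if engine == "Yahoo! BOSS":
            return YahooBOSSSearchEngine(credentials).search  # a callable
        raise UnknownSearchEngineError(engine)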
@@ -122,8 +139,8 @@ class CopyvioMixIn(object):
     def _copyvio_compare_content(self, article, url):
         """Return a number comparing an article and a URL.
 
-        The *article* is a Markov chain, whereas the URL is a string that we
-        will try to open ourselves.
+        The *article* is a Markov chain, whereas the *url* is just a string
+        that we'll try to open and read ourselves.
         """
         html = self._open_url_ignoring_errors(url)
         if not html:
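
The next hunk opens with the value this method returns: the intersection's size over the article chain's size, paired with the two chains. A worked example with made-up sizes:

    article = MarkovChain(article_text)               # say size() == 200
    source = MarkovChain(source_text)                 # from the fetched HTML
    delta = MarkovChainIntersection(article, source)  # say size() == 50
    confidence = float(delta.size()) / article.size() # 50 / 200 == 0.25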
@@ -134,30 +151,22 @@ class CopyvioMixIn(object):
         return float(delta.size()) / article.size(), (source, delta)
 
     def copyvio_check(self, min_confidence=0.5, max_queries=-1,
-                      interquery_sleep=1, force=False):
+                      interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a _CopyvioCheckResult object with four useful attributes:
-        "violation", "confidence", "url", and "queries". "confidence" is a
-        number between 0 and 1; if it is less than "min_confidence", we could
-        not find any indication of a violation (so "violation" will be False
-        and "url" may or may not be None), otherwise it indicates the relative
-        faith in our results, "violation" will be True, and "url" will be the
-        place the article is suspected of being copied from. "queries" is the
-        number of queries used to determine the results.
+        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
+        object with information on the results of the check.
 
-        "max_queries" is self-explanatory; we will never make more than this
-        number of queries in a given check. If it's less than 0, we will not
-        limit our number of queries.
+        *max_queries* is self-explanatory; we will never make more than this
+        number of queries in a given check. If it's lower than 0, we will not
+        limit the number of queries.
 
-        "interquery_sleep" is the minimum amount of time we will sleep between
+        *interquery_sleep* is the minimum amount of time we will sleep between
         search engine queries, in seconds.
 
-        "force" is simply passed to page.get() - it has the same behavior there
-        as it does here.
-
-        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
-        SearchQueryError, ...) on errors.
+        Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
+        (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
+        :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
         """
         searcher = self._select_search_engine()
         handled_urls = []
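
A short usage sketch of the new signature; the argument values are arbitrary and `page` is hypothetical:

    from earwigbot.exceptions import CopyvioCheckError

    try:
        result = page.copyvio_check(min_confidence=0.75, max_queries=15,
                                    interquery_sleep=2)
    except CopyvioCheckError:
        result = None  # e.g. bad search engine config or a failed query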
@@ -166,9 +175,9 @@ class CopyvioMixIn(object):
         num_queries = 0
         empty = MarkovChain("")
         best_chains = (empty, MarkovChainIntersection(empty, empty))
-        content = self.get(force)
-        clean = ArticleTextParser(content).strip()
-        chunks = ArticleTextParser(clean).chunk(max_queries)
+        parser = ArticleTextParser(self.get())
+        clean = parser.strip()
+        chunks = parser.chunk(max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()
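
For context, the `last_query = time()` bookkeeping above feeds the *interquery_sleep* throttle. The query loop itself is outside this hunk, so the following is only a plausible sketch of that throttle:

    from time import sleep, time

    diff = time() - last_query
    if diff < interquery_sleep:
        sleep(interquery_sleep - diff)  # wait out the remainder
    last_query = time()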
@@ -200,13 +209,14 @@ class CopyvioMixIn(object):
         return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
 
-    def copyvio_compare(self, url, min_confidence=0.5, force=False):
-        """Check the page like copyvio_check(), but against a specific URL.
+    def copyvio_compare(self, url, min_confidence=0.5):
+        """Check the page like :py:meth:`copyvio_check` against a specific URL.
 
         This is essentially a reduced version of the above - a copyvio
         comparison is made using Markov chains and the result is returned in a
-        _CopyvioCheckResult object - without using a search engine, as the
-        suspected "violated" URL is supplied from the start.
+        :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
+        without using a search engine, since the suspected "violated" URL is
+        supplied from the start.
 
         Its primary use is to generate a result when the URL is retrieved from
         a cache, like the one used in EarwigBot's Toolserver site. After a
@@ -217,10 +227,11 @@ class CopyvioMixIn(object):
         be stored for data retention reasons, so a fresh comparison is made
         using this function.
 
-        Since no searching is done, neither UnknownSearchEngineError nor
-        SearchQueryError will be raised.
+        Since no searching is done, neither
+        :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
         """
-        content = self.get(force)
+        content = self.get()
         clean = ArticleTextParser(content).strip()
         article_chain = MarkovChain(clean)
         confidence, chains = self._copyvio_compare_content(article_chain, url)
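
A hedged sketch of the cache-driven use the docstring describes; `cached_url` and `report()` are hypothetical:

    result = page.copyvio_compare(cached_url, min_confidence=0.5)
    if result.violation:
        report(result.url, result.confidence)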
@@ -26,6 +26,7 @@ from re import sub, UNICODE
 __all__ = ["MarkovChain", "MarkovChainIntersection"]
 
 class MarkovChain(object):
+    """Implements a basic bigram Markov chain of words."""
     START = -1
     END = -2
@@ -51,6 +52,7 @@ class MarkovChain(object):
         return "<MarkovChain of size {0}>".format(self.size())
 
     def size(self):
+        """Return the size of the Markov chain: the total number of nodes."""
         count = 0
         for node in self.chain.itervalues():
             for hits in node.itervalues():
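
To make the new docstrings concrete, here is a minimal sketch of building and sizing a bigram chain of words. The construction shown is assumed; only `START`, `END`, and the `size()` loop appear in the diff:

    from collections import defaultdict

    chain = defaultdict(lambda: defaultdict(lambda: 0))
    words = [MarkovChain.START] + u"one fish two fish".split() + [MarkovChain.END]
    for first, second in zip(words, words[1:]):
        chain[first][second] += 1  # one hit per observed bigram
    # size() as defined above sums every hit count: 5 here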
@@ -59,6 +61,8 @@ class MarkovChain(object):
 
 class MarkovChainIntersection(MarkovChain):
+    """Implements the intersection of two chains (i.e., their shared nodes)."""
+
     def __init__(self, mc1, mc2):
         self.chain = defaultdict(lambda: defaultdict(lambda: 0))
         self.mc1, self.mc2 = mc1, mc2
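
A clearly hedged guess at the intersection the docstring implies: keep only bigrams present in both chains. The real computation is not part of this hunk:

    def intersect(mc1, mc2, chain):
        # Assumed logic: shared edges get the smaller of the two hit counts
        for word, nodes in mc1.chain.iteritems():
            if word in mc2.chain:
                for other, count in nodes.iteritems():
                    if other in mc2.chain[word]:
                        chain[word][other] = min(count, mc2.chain[word][other])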
@@ -20,9 +20,19 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+try:
+    import mwparserfromhell
+except ImportError:
+    mwparserfromhell = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 
 class BaseTextParser(object):
+    """Base class for a parser that handles text."""
+
+    def __init__(self, text):
+        self.text = text
+
     def __repr__(self):
         """Return the canonical string representation of the text parser."""
         return "{0}(text={1!r})".format(self.__class__.__name__, self.text)
@@ -32,60 +42,40 @@ class BaseTextParser(object):
         name = self.__class__.__name__
         return "<{0} of text with size {1}>".format(name, len(text))
 
-    def __init__(self, text):
-        self.text = text
-
 
 class ArticleTextParser(BaseTextParser):
+    """A parser that can strip and chunk wikicode article text."""
+
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
 
-        Returns the page's text with all HTML and wikicode formatting removed,
-        including templates, tables, references, and the Bibliography/
-        References/Sources/See also section(s). It retains punctuation
+        Return the page's text with all HTML and wikicode formatting removed,
+        including templates, tables, and references. It retains punctuation
         (spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
-        quotes) and original capitalization, but not brackets (square and
-        angular), abnormal spacing, nor anything else. HTML entities are
+        quotes), original capitalization, and so forth. HTML entities are
         replaced by their unicode equivalents.
 
-        The actual replacement is handled by a few private methods within this
-        class.
+        The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
-        text = self._strip_tags(self.text)
-        text = self._strip_templates(text)
-        text = self._strip_sections(text)
-        text = self._strip_wikicode(text)
-        text = self._normalize(text)
-        return text
+        wikicode = mwparserfromhell.parse(self.text)
+        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        return self.clean
 
     def chunk(self, max_chunks):
-        """Convert the article text into a list of web-searchable chunks.
+        """Convert the clean article text into a list of web-searchable chunks.
 
-        No greater than max_chunks will be returned. Each chunk will only be a
-        couple sentences long at most. The idea here is to return a
+        No greater than *max_chunks* will be returned. Each chunk will only be
+        a couple sentences long at most. The idea here is to return a
         representative sample of the article text rather than the entire
         article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and max_chunks are
-        few, so we don't end up just searching for the first paragraph.
+        and conclusion, especially if the article is large and *max_chunks* is
+        low, so we don't end up just searching for the first paragraph.
         """
-        return [self.text]
-
-    def _strip_tags(self, text):
-        return text
-
-    def _strip_templates(self, text):
-        return text
-
-    def _strip_sections(self, text):
-        return text
-
-    def _strip_wikicode(self, text):
-        return text
-
-    def _normalize(self, text):
-        return text
+        return [self.text] # TODO: NotImplemented
 
 
 class HTMLTextParser(BaseTextParser):
+    """A parser that can extract the text from an HTML document."""
+
     def strip(self):
-        return self.text
+        return self.text # TODO: NotImplemented
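
Putting the parser changes together, a short usage sketch (the input text is arbitrary, and `chunk()` still returns the raw text, per the TODO above):

    parser = ArticleTextParser(u"'''Example''' text with a {{template}}.")
    clean = parser.strip()    # markup removed via mwparserfromhell
    chunks = parser.chunk(5)  # currently just [parser.text]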
@@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError
 __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
+    """Base class for a simple search engine interface."""
+
     def __init__(self, cred):
-        """Store credentials 'cred' for searching later on."""
+        """Store credentials *cred* for searching later on."""
         self.cred = cred
 
     def __repr__(self):
@@ -46,25 +48,30 @@ class BaseSearchEngine(object):
         return "<{0}>".format(self.__class__.__name__)
 
     def search(self, query):
-        """Use this engine to search for 'query'.
+        """Use this engine to search for *query*.
 
-        Not implemented in this base class; overridden in subclasses."""
+        Not implemented in this base class; overridden in subclasses.
+        """
         raise NotImplementedError()
 
 
 class YahooBOSSSearchEngine(BaseSearchEngine):
+    """A search engine interface with Yahoo! BOSS."""
+
     def search(self, query):
-        """Do a Yahoo! BOSS web search for 'query'.
+        """Do a Yahoo! BOSS web search for *query*.
 
         Returns a list of URLs, no more than fifty, ranked by relevance (as
-        determined by Yahoo). Raises SearchQueryError() on errors.
+        determined by Yahoo). Raises
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
         """
         base_url = "http://yboss.yahooapis.com/ysearch/web"
         query = quote_plus('"{0}"'.format(query))
         params = {"q": query, "style": "raw", "format": "json"}
         url = "{0}?{1}".format(base_url, urlencode(params))
-        consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"])
+        consumer = oauth.Consumer(key=self.cred["key"],
+                                  secret=self.cred["secret"])
         client = oauth.Client(consumer)
         headers, body = client.request(url, "GET")
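
The hunk ends before the response is consumed. What typically follows is something like the sketch below; the status check matches httplib2's response object, but the JSON field names are assumptions based on the BOSS API of the period:

    from json import loads

    if headers["status"] != "200":
        raise SearchQueryError("Yahoo! BOSS returned " + headers["status"])
    res = loads(body)
    try:
        results = res["bossresponse"]["web"]["results"]  # assumed layout
    except KeyError:
        raise SearchQueryError("Yahoo! BOSS gave an unexpected response")
    return [result["url"] for result in results]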