
DOCUMENT EVERYTHING (#5)

Also implementing MWParserFromHell, plus some cleanup.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit d45e342bac
6 changed files with 136 additions and 90 deletions
1. docs/api/earwigbot.wiki.copyvios.rst (+33, -0)
2. docs/api/earwigbot.wiki.rst (+7, -6)
3. earwigbot/wiki/copyvios/__init__.py (+51, -40)
4. earwigbot/wiki/copyvios/markov.py (+4, -0)
5. earwigbot/wiki/copyvios/parsers.py (+28, -38)
6. earwigbot/wiki/copyvios/search.py (+13, -6)

docs/api/earwigbot.wiki.copyvios.rst (+33, -0)

@@ -0,0 +1,33 @@
+copyvios Package
+================
+
+:mod:`copyvios` Package
+-----------------------
+
+.. automodule:: earwigbot.wiki.copyvios
+    :members:
+    :undoc-members:
+
+:mod:`markov` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.markov
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`parsers` Module
+---------------------
+
+.. automodule:: earwigbot.wiki.copyvios.parsers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`search` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.search
+    :members:
+    :undoc-members:
+    :show-inheritance:

docs/api/earwigbot.wiki.rst (+7, -6)

@@ -22,13 +22,6 @@ wiki Package
     :members:
     :undoc-members:
 
-:mod:`copyright` Module
------------------------
-
-.. automodule:: earwigbot.wiki.copyright
-    :members:
-    :undoc-members:
-
 :mod:`page` Module
 ------------------
 
@@ -57,3 +50,10 @@ wiki Package
 .. automodule:: earwigbot.wiki.user
     :members:
     :undoc-members:
+
+Subpackages
+-----------
+
+.. toctree::
+
+    earwigbot.wiki.copyvios

earwigbot/wiki/copyvios/__init__.py (+51, -40)

@@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 __all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
 
 class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
     def __init__(self, violation, confidence, url, queries, article, chains):
         self.violation = violation
         self.confidence = confidence
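Not part of the diff, but given the attribute list above, a result object might be consumed roughly like this (a sketch with made-up values; `article_chain`, `source_chain`, and `delta_chain` stand for chains built elsewhere):

    # Hypothetical values; judging by the attribute list, `chains` presumably
    # carries the source and delta chains while `article` becomes article_chain.
    result = CopyvioCheckResult(violation=True, confidence=0.87,
                                url="http://example.com/some-page",
                                queries=6, article=article_chain,
                                chains=(source_chain, delta_chain))
    if result.violation:
        print result.url, result.confidence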
@@ -61,14 +77,15 @@ class CopyvioCheckResult(object):
 
 class CopyvioMixIn(object):
     """
-    EarwigBot's Wiki Toolset: Copyright Violation Mixin
+    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**
 
-    This is a mixin that provides two public methods, copyvio_check() and
-    copyvio_compare(). The former checks the page for copyright violations
-    using a search engine API, and the latter compares the page against a
-    specified URL. Credentials for the search engine API are stored in the
-    site's config.
+    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
+    and :py:meth:`copyvio_compare`. The former checks the page for copyright
+    violations using a search engine API, and the latter compares the page
+    against a given URL. Credentials for the search engine API are stored in
+    the :py:class:`~earwigbot.wiki.site.Site`'s config.
     """
+
     def __init__(self, site):
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
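As a reading aid (not in the commit): the mixin is meant to be inherited by a page class that already implements get(), which is how both public methods obtain the article text. A minimal, hypothetical sketch of the pattern:

    # Hypothetical SimplePage; EarwigBot's real Page class inherits
    # CopyvioMixIn the same way and supplies get() for the page text.
    class SimplePage(CopyvioMixIn):
        def __init__(self, site, title):
            super(SimplePage, self).__init__(site)  # copies the site's opener headers
            self.title = title

        def get(self):
            return u"...page wikitext..."  # copyvio_check() reads text via get()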
@@ -100,10 +117,10 @@ class CopyvioMixIn(object):
     def _select_search_engine(self):
         """Return a function that can be called to do web searches.
 
-        The "function" is a functools.partial object that takes one argument, a
-        query, and returns a list of URLs, ranked by importance. The underlying
-        logic depends on the 'engine' argument; for example, if 'engine' is
-        "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
+        The function takes one argument, a search query, and returns a list of
+        URLs, ranked by importance. The underlying logic depends on the
+        *engine* argument within our config; for example, if *engine* is
+        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.
 
         Raises UnknownSearchEngineError if the 'engine' listed in our config is
         unknown to us, and UnsupportedSearchEngineError if we are missing a
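The hunk cuts the docstring off here, but the selection logic it describes amounts to roughly this (a sketch only; the config shape and function name are hypothetical, while the engine class and exception are the real ones referenced above):

    from earwigbot.exceptions import UnknownSearchEngineError

    def select_search_engine(search_config):
        # Dispatch on the configured engine name, as the docstring describes.
        engine, cred = search_config["engine"], search_config["credentials"]
        if engine == "Yahoo! BOSS":
            return YahooBOSSSearchEngine(cred)
        raise UnknownSearchEngineError(engine)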
@@ -122,8 +139,8 @@ class CopyvioMixIn(object):
     def _copyvio_compare_content(self, article, url):
         """Return a number comparing an article and a URL.
 
-        The *article* is a Markov chain, whereas the URL is a string that we
-        will try to open ourselves.
+        The *article* is a Markov chain, whereas the *url* is just a string
+        that we'll try to open and read ourselves.
         """
         html = self._open_url_ignoring_errors(url)
         if not html:
@@ -134,30 +151,22 @@ class CopyvioMixIn(object):
         return float(delta.size()) / article.size(), (source, delta)
 
     def copyvio_check(self, min_confidence=0.5, max_queries=-1,
-                      interquery_sleep=1, force=False):
+                      interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a _CopyvioCheckResult object with four useful attributes:
-        "violation", "confidence", "url", and "queries". "confidence" is a
-        number between 0 and 1; if it is less than "min_confidence", we could
-        not find any indication of a violation (so "violation" will be False
-        and "url" may or may not be None), otherwise it indicates the relative
-        faith in our results, "violation" will be True, and "url" will be the
-        place the article is suspected of being copied from. "queries" is the
-        number of queries used to determine the results.
+        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
+        object with information on the results of the check.
 
-        "max_queries" is self-explanatory; we will never make more than this
-        number of queries in a given check. If it's less than 0, we will not
-        limit our number of queries.
+        *max_queries* is self-explanatory; we will never make more than this
+        number of queries in a given check. If it's lower than 0, we will not
+        limit the number of queries.
 
-        "interquery_sleep" is the minimum amount of time we will sleep between
+        *interquery_sleep* is the minimum amount of time we will sleep between
         search engine queries, in seconds.
 
-        "force" is simply passed to page.get() - it has the same behavior there
-        as it does here.
-
-        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
-        SearchQueryError, ...) on errors.
+        Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
+        (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
+        :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
         """
         searcher = self._select_search_engine()
         handled_urls = []
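A usage sketch of the new signature (hypothetical `page` object and thresholds; note that `force` is gone, since the method now fetches the text itself via self.get()):

    result = page.copyvio_check(min_confidence=0.75, max_queries=15,
                                interquery_sleep=2)
    if result.violation:
        print "Possible copyvio of", result.url
    print "confidence:", result.confidence, "queries used:", result.queries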
@@ -166,9 +175,9 @@ class CopyvioMixIn(object):
         num_queries = 0
         empty = MarkovChain("")
         best_chains = (empty, MarkovChainIntersection(empty, empty))
-        content = self.get(force)
-        clean = ArticleTextParser(content).strip()
-        chunks = ArticleTextParser(clean).chunk(max_queries)
+        parser = ArticleTextParser(self.get())
+        clean = parser.strip()
+        chunks = parser.chunk(max_queries)
         article_chain = MarkovChain(clean)
         last_query = time()

@@ -200,13 +209,14 @@ class CopyvioMixIn(object):
         return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
 
-    def copyvio_compare(self, url, min_confidence=0.5, force=False):
-        """Check the page like copyvio_check(), but against a specific URL.
+    def copyvio_compare(self, url, min_confidence=0.5):
+        """Check the page like :py:meth:`copyvio_check` against a specific URL.
 
         This is essentially a reduced version of the above - a copyivo
         comparison is made using Markov chains and the result is returned in a
-        _CopyvioCheckResult object - without using a search engine, as the
-        suspected "violated" URL is supplied from the start.
+        :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
+        without using a search engine, since the suspected "violated" URL is
+        supplied from the start.
 
         Its primary use is to generate a result when the URL is retrieved from
         a cache, like the one used in EarwigBot's Toolserver site. After a
@@ -217,10 +227,11 @@ class CopyvioMixIn(object):
         be stored for data retention reasons, so a fresh comparison is made
         using this function.
 
-        Since no searching is done, neither UnknownSearchEngineError nor
-        SearchQueryError will be raised.
+        Since no searching is done, neither
+        :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
         """
-        content = self.get(force)
+        content = self.get()
         clean = ArticleTextParser(content).strip()
         article_chain = MarkovChain(clean)
         confidence, chains = self._copyvio_compare_content(article_chain, url)
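And the companion sketch for copyvio_compare (again with a hypothetical `page`; no search queries are made, so only the one URL is fetched):

    result = page.copyvio_compare("http://example.com/cached-source",
                                  min_confidence=0.75)
    print result.violation, result.confidence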


earwigbot/wiki/copyvios/markov.py (+4, -0)

@@ -26,6 +26,7 @@ from re import sub, UNICODE
 __all__ = ["MarkovChain", "MarkovChainIntersection"]
 
 class MarkovChain(object):
+    """Implements a basic bigram Markov chain of words."""
     START = -1
     END = -2
 
@@ -51,6 +52,7 @@ class MarkovChain(object):
         return "<MarkovChain of size {0}>".format(self.size())
 
     def size(self):
+        """Return the size of the Markov chain: the total number of nodes."""
         count = 0
         for node in self.chain.itervalues():
             for hits in node.itervalues():
@@ -59,6 +61,8 @@ class MarkovChain(object):
 
 
 class MarkovChainIntersection(MarkovChain):
+    """Implements the intersection of two chains (i.e., their shared nodes)."""
+
     def __init__(self, mc1, mc2):
         self.chain = defaultdict(lambda: defaultdict(lambda: 0))
         self.mc1, self.mc2 = mc1, mc2
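For readers new to the data structure: the chain maps each word to a counter of the words that follow it, bracketed by the START and END sentinels, and size() sums every transition count. A self-contained sketch of the same idea (simplified; not the class's exact code, which also regex-cleans the text):

    from collections import defaultdict

    START, END = -1, -2  # sentinels bracketing the text

    def build_bigram_chain(text):
        # Map each word to a counter of the words that directly follow it.
        chain = defaultdict(lambda: defaultdict(int))
        words = [START] + text.lower().split() + [END]
        for word, following in zip(words, words[1:]):
            chain[word][following] += 1
        return chain

    def chain_size(chain):
        # Sum of all transition counts, which is what MarkovChain.size() computes.
        return sum(sum(node.values()) for node in chain.values())

    demo = build_bigram_chain(u"the cat sat on the mat")
    assert chain_size(demo) == 7  # seven transitions: START->the ... mat->END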


earwigbot/wiki/copyvios/parsers.py (+28, -38)

@@ -20,9 +20,19 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+try:
+    import mwparserfromhell
+except ImportError:
+    mwparserfromhell = None
+
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 
 class BaseTextParser(object):
+    """Base class for a parser that handles text."""
+
+    def __init__(self, text):
+        self.text = text
+
     def __repr__(self):
         """Return the canonical string representation of the text parser."""
         return "{0}(text={1!r})".format(self.__class__.__name__, self.text)
@@ -32,60 +42,40 @@ class BaseTextParser(object):
         name = self.__class__.__name__
         return "<{0} of text with size {1}>".format(name, len(text))
 
-    def __init__(self, text):
-        self.text = text
-
 
 class ArticleTextParser(BaseTextParser):
+    """A parser that can strip and chunk wikicode article text."""
+
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
 
-        Returns the page's text with all HTML and wikicode formatting removed,
-        including templates, tables, references, and the Bibliography/
-        References/Sources/See also section(s). It retains punctuation
+        Return the page's text with all HTML and wikicode formatting removed,
+        including templates, tables, and references. It retains punctuation
         (spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
-        quotes) and original capitalization, but not brackets (square and
-        angular), abnormal spacing, nor anything else. HTML entities are
+        quotes), original capitalization, and so forth. HTML entities are
         replaced by their unicode equivalents.
 
-        The actual replacement is handled by a few private methods within this
-        class.
+        The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
-        text = self._strip_tags(self.text)
-        text = self._strip_templates(text)
-        text = self._strip_sections(text)
-        text = self._strip_wikicode(text)
-        text = self._normalize(text)
-        return text
+        wikicode = mwparserfromhell.parse(self.text)
+        self.clean = u" ".join(wikicode.normalize().ifilter_text())
+        return self.clean
 
     def chunk(self, max_chunks):
-        """Convert the article text into a list of web-searchable chunks.
+        """Convert the clean article text into a list of web-searchable chunks.
 
-        No greater than max_chunks will be returned. Each chunk will only be a
-        couple sentences long at most. The idea here is to return a
+        No greater than *max_chunks* will be returned. Each chunk will only be
+        a couple sentences long at most. The idea here is to return a
         representative sample of the article text rather than the entire
         article, so we'll probably pick and choose from its introduction, body,
-        and conclusion, especially if the article is large and max_chunks are
-        few, so we don't end up just searching for the first paragraph.
+        and conclusion, especially if the article is large and *max_chunks* is
+        low, so we don't end up just searching for the first paragraph.
         """
-        return [self.text]
-
-    def _strip_tags(self, text):
-        return text
-
-    def _strip_templates(self, text):
-        return text
-
-    def _strip_sections(self, text):
-        return text
-
-    def _strip_wikicode(self, text):
-        return text
-
-    def _normalize(self, text):
-        return text
+        return [self.text] # TODO: NotImplemented
 
 
 class HTMLTextParser(BaseTextParser):
+    """A parser that can extract the text from an HTML document."""
+
     def strip(self):
-        return self.text
+        return self.text # TODO: NotImplemented
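Aside on the new strip() implementation: normalize() and ifilter_text() reflect the mwparserfromhell API of this era, which is why the import at the top of the file is guarded. In later releases of the library the same stripping is a one-liner; a sketch assuming a modern mwparserfromhell is installed:

    import mwparserfromhell

    text = u"'''Example''' is a [[term]] used in {{some template}} docs."
    wikicode = mwparserfromhell.parse(text)
    # strip_code() drops templates and markup and keeps the readable text,
    # much like ArticleTextParser.strip() above (output shown approximately).
    clean = wikicode.strip_code()  # roughly u"Example is a term used in docs."

chunk() and HTMLTextParser.strip(), by contrast, remain stubs here, as the new TODO comments note.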

earwigbot/wiki/copyvios/search.py (+13, -6)

@@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError
 __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
+    """Base class for a simple search engine interface."""
+
     def __init__(self, cred):
-        """Store credentials 'cred' for searching later on."""
+        """Store credentials *cred* for searching later on."""
         self.cred = cred
 
     def __repr__(self):
@@ -46,25 +48,30 @@ class BaseSearchEngine(object):
         return "<{0}>".format(self.__class__.__name__)
 
     def search(self, query):
-        """Use this engine to search for 'query'.
+        """Use this engine to search for *query*.
 
-        Not implemented in this base class; overridden in subclasses."""
+        Not implemented in this base class; overridden in subclasses.
+        """
         raise NotImplementedError()
 
 
 class YahooBOSSSearchEngine(BaseSearchEngine):
+    """A search engine interface with Yahoo! BOSS."""
+
     def search(self, query):
-        """Do a Yahoo! BOSS web search for 'query'.
+        """Do a Yahoo! BOSS web search for *query*.
 
         Returns a list of URLs, no more than fifty, ranked by relevance (as
-        determined by Yahoo). Raises SearchQueryError() on errors.
+        determined by Yahoo). Raises
+        :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
         """
         base_url = "http://yboss.yahooapis.com/ysearch/web"
         query = quote_plus(query.join('"', '"'))
         params = {"q": query, "style": "raw", "format": "json"}
         url = "{0}?{1}".format(base_url, urlencode(params))
 
-        consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"])
+        consumer = oauth.Consumer(key=self.cred["key"],
+                                  secret=self.cred["secret"])
         client = oauth.Client(consumer)
         headers, body = client.request(url, "GET")
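The hunk ends before the response handling, but presumably the JSON body is then decoded and the result URLs extracted. A sketch of what that continuation might look like (the "bossresponse" layout comes from Yahoo!'s BOSS documentation, not from this diff; `headers` and `body` are the values returned by client.request() above, and SearchQueryError is already imported in this file):

    from json import loads

    if headers["status"] != "200":  # httplib2-style response; status is a string
        raise SearchQueryError("Yahoo! BOSS Error: response code " +
                               headers["status"])
    # Hypothetical parsing per the BOSS docs: ranked results live under
    # bossresponse -> web -> results, each with a "url" field.
    results = loads(body)["bossresponse"]["web"]["results"]
    urls = [result["url"] for result in results]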


