@@ -1,17 +1,17 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions: | |||
# | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
@@ -34,6 +34,7 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const | |||
from earwigbot import __version__ as _v | |||
from platform import python_version as _p | |||
# User-agent string embedding the EarwigBot version (_v) and the version of
# the running Python interpreter (_p()).
USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(
    _v, _p()
)
# Drop the short import aliases so they do not linger in the module namespace.
del _v, _p
# Default namespace IDs: | |||
NS_MAIN = 0 | |||
@@ -30,9 +30,10 @@ try: | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.wiki.exceptions import * | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||
from earwigbot.wiki.exceptions import * | |||
class CopyvioCheckResult(object): | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
@@ -109,33 +110,6 @@ class CopyvioMixin(object): | |||
raise UnknownSearchEngineError(engine) | |||
def _copyvio_strip_html(self, html): | |||
""" | |||
STUB | |||
""" | |||
return html | |||
def _copyvio_strip_article(self, content): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
Returns the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, references, and the Bibliography/ | |||
References/Sources/See also section(s). It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes) and original capitalization, but not brackets (square and | |||
angular), abnormal spacing, nor anything else. HTML entities are | |||
replaced by their unicode equivalents. | |||
STUB | |||
""" | |||
return content | |||
def _copyvio_chunk_article(self, content, max_chunks): | |||
""" | |||
STUB | |||
""" | |||
return [content] | |||
def _copyvio_compare_content(self, article, url): | |||
""" | |||
DOCSTRING NEEDED | |||
@@ -144,7 +118,7 @@ class CopyvioMixin(object): | |||
if not html: | |||
return 0 | |||
source = MarkovChain(self._copyvio_strip_html(html)) | |||
source = MarkovChain(HTMLTextParser(html).strip()) | |||
delta = MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
@@ -182,8 +156,8 @@ class CopyvioMixin(object): | |||
empty = MarkovChain("") | |||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
chunks = self._copyvio_chunk_article(clean, max_queries) | |||
clean = ArticleTextParser(content).strip() | |||
chunks = ArticleTextParser(clean).chunk(max_queries) | |||
article_chain = MarkovChain(clean) | |||
last_query = time() | |||
@@ -236,7 +210,7 @@ class CopyvioMixin(object): | |||
SearchQueryError will be raised. | |||
""" | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
clean = ArticleTextParser(content).strip() | |||
article_chain = MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
@@ -0,0 +1,80 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
class BaseTextParser(object):
    """Common base for text parsers; it only holds the text to be parsed."""

    def __init__(self, text):
        """Store ``text`` on the instance as ``self.text``."""
        self.text = text
class ArticleTextParser(BaseTextParser):
    """Parser for the raw text of an article held in ``self.text``."""

    def strip(self):
        """Clean the page's raw text by removing templates and formatting.

        Intended contract: return the page's text with all HTML and wikicode
        formatting removed, including templates, tables, references, and the
        Bibliography/References/Sources/See also section(s). Punctuation
        (spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
        quotes) and original capitalization are retained; brackets (square
        and angular) and abnormal spacing are not. HTML entities are replaced
        by their unicode equivalents.

        The work is delegated, in order, to the private ``_strip_tags``,
        ``_strip_templates``, ``_strip_sections``, ``_strip_wikicode`` and
        ``_normalize`` methods. Each of those is currently a pass-through, so
        for now the text comes back unchanged.
        """
        steps = (
            self._strip_tags,
            self._strip_templates,
            self._strip_sections,
            self._strip_wikicode,
            self._normalize,
        )
        result = self.text
        for step in steps:
            # Each step receives the output of the previous one.
            result = step(result)
        return result

    def chunk(self, max_chunks):
        """Convert the article text into a list of web-searchable chunks.

        Intended contract: return no more than ``max_chunks`` chunks, each a
        couple of sentences long at most, forming a representative sample of
        the article (introduction, body, and conclusion) rather than just its
        first paragraph.

        Currently a placeholder: ``max_chunks`` is ignored and the entire
        text is returned as a single chunk.
        """
        chunks = [self.text]
        return chunks

    def _strip_tags(self, text):
        """Placeholder for tag removal; returns ``text`` unchanged."""
        return text

    def _strip_templates(self, text):
        """Placeholder for template removal; returns ``text`` unchanged."""
        return text

    def _strip_sections(self, text):
        """Placeholder for section removal; returns ``text`` unchanged."""
        return text

    def _strip_wikicode(self, text):
        """Placeholder for wikicode removal; returns ``text`` unchanged."""
        return text

    def _normalize(self, text):
        """Placeholder for normalization; returns ``text`` unchanged."""
        return text
class HTMLTextParser(BaseTextParser):
    """Parser for HTML source held in ``self.text``."""

    def strip(self):
        """Return the stored text.

        Currently a placeholder: no markup is removed and ``self.text`` is
        returned as-is.
        """
        stripped = self.text
        return stripped