@@ -1,17 +1,17 @@ | |||||
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||
# | # | ||||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | ||||
# | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | # Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
# of this software and associated documentation files (the "Software"), to deal | # of this software and associated documentation files (the "Software"), to deal | ||||
# in the Software without restriction, including without limitation the rights | # in the Software without restriction, including without limitation the rights | ||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | # furnished to do so, subject to the following conditions: | ||||
# | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | # The above copyright notice and this permission notice shall be included in | ||||
# all copies or substantial portions of the Software. | # all copies or substantial portions of the Software. | ||||
# | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||
@@ -34,6 +34,7 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const | |||||
from earwigbot import __version__ as _v | from earwigbot import __version__ as _v | ||||
from platform import python_version as _p | from platform import python_version as _p | ||||
USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) | USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) | ||||
del _v, _p | |||||
# Default namespace IDs: | # Default namespace IDs: | ||||
NS_MAIN = 0 | NS_MAIN = 0 | ||||
@@ -30,9 +30,10 @@ try: | |||||
except ImportError: | except ImportError: | ||||
oauth = None | oauth = None | ||||
from earwigbot.wiki.exceptions import * | |||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | ||||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | ||||
from earwigbot.wiki.exceptions import * | |||||
class CopyvioCheckResult(object): | class CopyvioCheckResult(object): | ||||
def __init__(self, violation, confidence, url, queries, article, chains): | def __init__(self, violation, confidence, url, queries, article, chains): | ||||
@@ -109,33 +110,6 @@ class CopyvioMixin(object): | |||||
raise UnknownSearchEngineError(engine) | raise UnknownSearchEngineError(engine) | ||||
def _copyvio_strip_html(self, html): | |||||
""" | |||||
STUB | |||||
""" | |||||
return html | |||||
def _copyvio_strip_article(self, content): | |||||
"""Clean the page's raw text by removing templates and formatting. | |||||
Returns the page's text with all HTML and wikicode formatting removed, | |||||
including templates, tables, references, and the Bibliography/ | |||||
References/Sources/See also section(s). It retains punctuation | |||||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||||
quotes) and original capitalization, but not brackets (square and | |||||
angular), abnormal spacing, nor anything else. HTML entities are | |||||
replaced by their unicode equivalents. | |||||
STUB | |||||
""" | |||||
return content | |||||
def _copyvio_chunk_article(self, content, max_chunks): | |||||
""" | |||||
STUB | |||||
""" | |||||
return [content] | |||||
def _copyvio_compare_content(self, article, url): | def _copyvio_compare_content(self, article, url): | ||||
""" | """ | ||||
DOCSTRING NEEDED | DOCSTRING NEEDED | ||||
@@ -144,7 +118,7 @@ class CopyvioMixin(object): | |||||
if not html: | if not html: | ||||
return 0 | return 0 | ||||
source = MarkovChain(self._copyvio_strip_html(html)) | |||||
source = MarkovChain(HTMLTextParser(html).strip()) | |||||
delta = MarkovChainIntersection(article, source) | delta = MarkovChainIntersection(article, source) | ||||
return float(delta.size()) / article.size(), (source, delta) | return float(delta.size()) / article.size(), (source, delta) | ||||
@@ -182,8 +156,8 @@ class CopyvioMixin(object): | |||||
empty = MarkovChain("") | empty = MarkovChain("") | ||||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | best_chains = (empty, MarkovChainIntersection(empty, empty)) | ||||
content = self.get(force) | content = self.get(force) | ||||
clean = self._copyvio_strip_article(content) | |||||
chunks = self._copyvio_chunk_article(clean, max_queries) | |||||
clean = ArticleTextParser(content).strip() | |||||
chunks = ArticleTextParser(clean).chunk(max_queries) | |||||
article_chain = MarkovChain(clean) | article_chain = MarkovChain(clean) | ||||
last_query = time() | last_query = time() | ||||
@@ -236,7 +210,7 @@ class CopyvioMixin(object): | |||||
SearchQueryError will be raised. | SearchQueryError will be raised. | ||||
""" | """ | ||||
content = self.get(force) | content = self.get(force) | ||||
clean = self._copyvio_strip_article(content) | |||||
clean = ArticleTextParser(content).strip() | |||||
article_chain = MarkovChain(clean) | article_chain = MarkovChain(clean) | ||||
confidence, chains = self._copyvio_compare_content(article_chain, url) | confidence, chains = self._copyvio_compare_content(article_chain, url) | ||||
@@ -0,0 +1,80 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
class BaseTextParser(object):
    """Shared base for the text parsers; it only holds the text to process."""

    def __init__(self, text):
        """Store *text* on the instance as ``self.text``, without copying or
        validating it."""
        self.text = text
class ArticleTextParser(BaseTextParser):
    """Parser for an article's raw text, stored on the instance as ``text``."""

    def strip(self):
        """Run the article text through every cleanup stage and return it.

        Intended contract: the result is the page's text with all HTML and
        wikicode formatting removed, including templates, tables,
        references, and the Bibliography/References/Sources/See also
        section(s). Punctuation (spacing, paragraphs, periods, commas,
        (semi)-colons, parentheses, quotes) and original capitalization are
        kept; brackets (square and angular), abnormal spacing, and anything
        else are not. HTML entities are replaced by their unicode
        equivalents.

        The work is delegated, in order, to the private stage methods listed
        below. Each of them is presently a pass-through, so the text comes
        back unchanged until they are implemented.
        """
        stages = (
            self._strip_tags,
            self._strip_templates,
            self._strip_sections,
            self._strip_wikicode,
            self._normalize,
        )
        cleaned = self.text
        for stage in stages:
            # Each stage receives the previous stage's output.
            cleaned = stage(cleaned)
        return cleaned

    def chunk(self, max_chunks):
        """Return the article text as a list of web-searchable chunks.

        Intended contract: at most *max_chunks* chunks are returned, each
        no more than a couple of sentences long. The aim is a
        representative sample of the article rather than all of it --
        probably drawn from the introduction, body, and conclusion,
        especially for a large article with few chunks allowed -- so that
        the search is not limited to the first paragraph.

        Not implemented yet: the whole text is returned as one chunk and
        *max_chunks* is not consulted.
        """
        whole_text = self.text
        return [whole_text]

    def _strip_tags(self, text):
        """Placeholder cleanup stage; returns *text* unchanged for now."""
        return text

    def _strip_templates(self, text):
        """Placeholder cleanup stage; returns *text* unchanged for now."""
        return text

    def _strip_sections(self, text):
        """Placeholder cleanup stage; returns *text* unchanged for now."""
        return text

    def _strip_wikicode(self, text):
        """Placeholder cleanup stage; returns *text* unchanged for now."""
        return text

    def _normalize(self, text):
        """Placeholder cleanup stage; returns *text* unchanged for now."""
        return text
class HTMLTextParser(BaseTextParser):
    """Parser for the HTML of a fetched page, stored on the instance as
    ``text``."""

    def strip(self):
        """Return the stored text for use as comparison input.

        No cleanup is implemented yet: the text is returned exactly as it
        was given to the constructor.
        """
        unprocessed = self.text
        return unprocessed