diff --git a/earwigbot/wiki/constants.py b/earwigbot/wiki/constants.py index 22aef9c..2431884 100644 --- a/earwigbot/wiki/constants.py +++ b/earwigbot/wiki/constants.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -34,6 +34,7 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const from earwigbot import __version__ as _v from platform import python_version as _p USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) +del _v, _p # Default namespace IDs: NS_MAIN = 0 diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 0aaa9b5..46b27e2 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -30,9 +30,10 @@ try: except ImportError: oauth = None -from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.exceptions import * class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -109,33 +110,6 @@ class CopyvioMixin(object): raise UnknownSearchEngineError(engine) - def _copyvio_strip_html(self, html): - """ - STUB - """ - return html - - def _copyvio_strip_article(self, content): - """Clean the page's raw text by removing templates and formatting. - - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation - (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are - replaced by their unicode equivalents. 
- - STUB - """ - return content - - def _copyvio_chunk_article(self, content, max_chunks): - """ - STUB - """ - return [content] - def _copyvio_compare_content(self, article, url): """ DOCSTRING NEEDED @@ -144,7 +118,7 @@ class CopyvioMixin(object): if not html: return 0 - source = MarkovChain(self._copyvio_strip_html(html)) + source = MarkovChain(HTMLTextParser(html).strip()) delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) @@ -182,8 +156,8 @@ class CopyvioMixin(object): empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) - clean = self._copyvio_strip_article(content) - chunks = self._copyvio_chunk_article(clean, max_queries) + clean = ArticleTextParser(content).strip() + chunks = ArticleTextParser(clean).chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -236,7 +210,7 @@ class CopyvioMixin(object): SearchQueryError will be raised. """ content = self.get(force) - clean = self._copyvio_strip_article(content) + clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py new file mode 100644 index 0000000..f9bb4c2 --- /dev/null +++ b/earwigbot/wiki/copyvios/parsers.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission 
notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+class BaseTextParser(object):
+    def __init__(self, text):
+        self.text = text
+
+
+class ArticleTextParser(BaseTextParser):
+    def strip(self):
+        """Clean the page's raw text by removing templates and formatting.
+
+        The goal is the page's text with all HTML and wikicode formatting
+        removed, including templates, tables, references, and the
+        Bibliography/References/Sources/See also section(s), keeping
+        punctuation (spacing, paragraphs, periods, commas, (semi)-colons,
+        parentheses, quotes) and capitalization but not brackets (square and
+        angular) or abnormal spacing; HTML entities become unicode equivalents.
+
+        Runs _strip_tags, _strip_templates, _strip_sections, _strip_wikicode,
+        and _normalize in that order. NOTE: each of those helpers is currently
+        a stub that returns its input unchanged, so nothing is stripped yet.
+        """
+        text = self._strip_tags(self.text)
+        text = self._strip_templates(text)
+        text = self._strip_sections(text)
+        text = self._strip_wikicode(text)
+        text = self._normalize(text)
+        return text
+
+    def chunk(self, max_chunks):
+        """Convert the article text into a list of web-searchable chunks.
+
+        Planned: at most max_chunks chunks, each a couple of sentences long at
+        most, sampled from the article's introduction, body, and conclusion
+        (especially when it is large and max_chunks is small) so that searches
+        do not just cover the first paragraph.
+        NOTE: currently a stub that ignores max_chunks and returns the whole
+        text as the only item of a one-element list.
+        """
+        return [self.text]
+
+    def _strip_tags(self, text):
+        return text
+
+    def _strip_templates(self, text):
+        return text
+
+    def _strip_sections(self, text):
+        return text
+
+    def _strip_wikicode(self, text):
+        return text
+
+    def _normalize(self, text):
+        return text
+
+
+class HTMLTextParser(BaseTextParser):
+    def strip(self):
+        return self.text