From e6a381f3f7eb4ac37171c44b5ee930bdf4dda354 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 20 Mar 2012 12:25:45 -0400 Subject: [PATCH 01/19] Restructuring copyvio stuff as its own package. --- .../wiki/{copyright.py => copyvios/__init__.py} | 83 ++++++---------------- earwigbot/wiki/copyvios/markov.py | 63 ++++++++++++++++ earwigbot/wiki/page.py | 18 ++--- 3 files changed, 93 insertions(+), 71 deletions(-) rename earwigbot/wiki/{copyright.py => copyvios/__init__.py} (82%) create mode 100644 earwigbot/wiki/copyvios/markov.py diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyvios/__init__.py similarity index 82% rename from earwigbot/wiki/copyright.py rename to earwigbot/wiki/copyvios/__init__.py index c003ebb..68b4134 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,11 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from collections import defaultdict from functools import partial from gzip import GzipFile from json import loads -from re import sub, UNICODE from StringIO import StringIO from time import sleep, time from urllib import quote_plus, urlencode @@ -36,8 +34,9 @@ except ImportError: oauth = None from earwigbot.wiki.exceptions import * +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -class _CopyvioCheckResult(object): +class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -48,51 +47,11 @@ class _CopyvioCheckResult(object): self.delta_chain = chains[1] def __repr__(self): - r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" return r.format(self.violation, self.confidence, self.url, self.queries) -class _MarkovChain(object): - START = -1 - END = -2 - - def __init__(self, text): - self.text = text - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() - prev = self.START - for word in words: - self.chain[prev][word] += 1 - prev = word - try: # This won't work if the source text is completely blank - self.chain[word][self.END] += 1 - except KeyError: - pass - - def size(self): - count = 0 - for node in self.chain.itervalues(): - for hits in node.itervalues(): - count += hits - return count - - -class _MarkovChainIntersection(_MarkovChain): - def __init__(self, mc1, mc2): - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - c1 = mc1.chain - c2 = mc2.chain - - for word, nodes1 in c1.iteritems(): - if word in c2: - nodes2 = c2[word] - for node, count1 in nodes1.iteritems(): - if node in nodes2: - count2 = nodes2[node] - self.chain[word][node] = min(count1, count2) - - -class CopyrightMixin(object): +class CopyvioMixin(object): """ EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -220,8 +179,8 @@ class CopyrightMixin(object): if not html: return 0 - source = _MarkovChain(self._copyvio_strip_html(html)) - delta = _MarkovChainIntersection(article, source) + source = MarkovChain(self._copyvio_strip_html(html)) + delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, @@ -255,17 +214,17 @@ class CopyrightMixin(object): best_confidence = 0 best_match = None num_queries = 0 - empty = _MarkovChain("") - best_chains = (empty, _MarkovChainIntersection(empty, empty)) + empty = MarkovChain("") + best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) clean = self._copyvio_strip_article(content) chunks = self._copyvio_chunk_article(clean, max_queries) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) last_query = time() if article_chain.size() < 20: # Auto-fail very small articles - return _CopyvioCheckResult(False, best_confidence, best_match, - num_queries, article_chain, best_chains) + return CopyvioCheckResult(False, best_confidence, best_match, + num_queries, article_chain, best_chains) while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): @@ -288,8 +247,8 @@ class CopyrightMixin(object): v = True else: v = False - return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + return 
CopyvioCheckResult(v, best_confidence, best_match, num_queries, + article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5, force=False): """Check the page like copyvio_check(), but against a specific URL. @@ -298,7 +257,7 @@ class CopyrightMixin(object): comparison is made using Markov chains and the result is returned in a _CopyvioCheckResult object - without using a search engine, as the suspected "violated" URL is supplied from the start. - + Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a search is done, the resulting URL is stored in a cache for 24 hours so @@ -313,12 +272,12 @@ class CopyrightMixin(object): """ content = self.get(force) clean = self._copyvio_strip_article(content) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) if confidence >= min_confidence: is_violation = True else: is_violation = False - return _CopyvioCheckResult(is_violation, confidence, url, 0, - article_chain, chains) + return CopyvioCheckResult(is_violation, confidence, url, 0, + article_chain, chains) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py new file mode 100644 index 0000000..4e77ebc --- /dev/null +++ b/earwigbot/wiki/copyvios/markov.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from collections import defaultdict +from re import sub, UNICODE + +class MarkovChain(object): + START = -1 + END = -2 + + def __init__(self, text): + self.text = text + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() + prev = self.START + for word in words: + self.chain[prev][word] += 1 + prev = word + try: # This won't work if the source text is completely blank + self.chain[word][self.END] += 1 + except KeyError: + pass + + def size(self): + count = 0 + for node in self.chain.itervalues(): + for hits in node.itervalues(): + count += hits + return count + + +class MarkovChainIntersection(MarkovChain): + def __init__(self, mc1, mc2): + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + c1 = mc1.chain + c2 = mc2.chain + + for word, nodes1 in c1.iteritems(): + if word in c2: + nodes2 = c2[word] + for node, count1 in nodes1.iteritems(): + if node in nodes2: + count2 = nodes2[node] + self.chain[word][node] = min(count1, count2) diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 8407108..0d266b7 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,10 +25,10 @@ import re from time import gmtime, strftime from urllib import quote -from earwigbot.wiki.copyright import CopyrightMixin +from earwigbot.wiki.copyvios import CopyvioMixin from earwigbot.wiki.exceptions import * -class Page(CopyrightMixin): +class Page(CopyvioMixin): """ EarwigBot's Wiki Toolset: Page Class @@ -264,7 +264,7 @@ class Page(CopyrightMixin): If `params` is given, we'll use it as our API query parameters. Otherwise, we'll build params using the given kwargs via _build_edit_params(). - + We'll then try to do the API query, and catch any errors the API raises in _handle_edit_errors(). We'll then throw these back as subclasses of EditError. @@ -275,7 +275,7 @@ class Page(CopyrightMixin): if not self._token: e = "You don't have permission to edit this page." 
raise PermissionsError(e) - + # Weed out invalid pages before we get too far: self._force_validity() @@ -336,7 +336,7 @@ class Page(CopyrightMixin): # Page does not exist; don't edit if it already exists: params["createonly"] = "true" else: - params["recreate"] = "true" + params["recreate"] = "true" return params From d4e947b98bffc3ef156a0308f7856530d61cb987 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Mar 2012 00:51:32 -0400 Subject: [PATCH 02/19] earwigbot.wiki.copyvios.search module split --- earwigbot/wiki/copyvios/__init__.py | 45 +++------------------- earwigbot/wiki/copyvios/search.py | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 earwigbot/wiki/copyvios/search.py diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 68b4134..0aaa9b5 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -20,12 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from functools import partial from gzip import GzipFile -from json import loads from StringIO import StringIO from time import sleep, time -from urllib import quote_plus, urlencode from urllib2 import build_opener, URLError try: @@ -35,6 +32,7 @@ except ImportError: from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -107,42 +105,9 @@ class CopyvioMixin(object): if not oauth: e = "The package 'oauth2' could not be imported" raise UnsupportedSearchEngineError(e) - searcher = self._yahoo_boss_query - else: - raise UnknownSearchEngineError(engine) - - return partial(searcher, credentials) - - def _yahoo_boss_query(self, cred, query): - """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. - - Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. - """ - base_url = "http://yboss.yahooapis.com/ysearch/web" - query = quote_plus(query.join('"', '"')) - params = {"q": query, "style": "raw", "format": "json"} - url = "{0}?{1}".format(base_url, urlencode(params)) + return YahooBOSSSearchEngine(credentials) - consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) - client = oauth.Client(consumer) - headers, body = client.request(url, "GET") - - if headers["status"] != "200": - e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(e.format(headers["status"], body)) - - try: - res = loads(body) - except ValueError: - e = "Yahoo! BOSS Error: JSON could not be decoded" - raise SearchQueryError(e) - - try: - results = res["bossresponse"]["web"]["results"] - except KeyError: - return [] - return [result["url"] for result in results] + raise UnknownSearchEngineError(engine) def _copyvio_strip_html(self, html): """ @@ -209,7 +174,7 @@ class CopyvioMixin(object): Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, SearchQueryError, ...) on errors. 
""" - search = self._select_search_engine() + searcher = self._select_search_engine() handled_urls = [] best_confidence = 0 best_match = None @@ -228,7 +193,7 @@ class CopyvioMixin(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = search(chunks.pop(0)) + urls = searcher.search(chunks.pop(0)) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py new file mode 100644 index 0000000..59287cc --- /dev/null +++ b/earwigbot/wiki/copyvios/search.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from json import loads +from urllib import quote_plus, urlencode + +try: + import oauth2 as oauth +except ImportError: + oauth = None + +from earwigbot.wiki.exceptions import SearchQueryError + +class BaseSearchEngine(object): + def __init__(self, cred): + """Store credentials 'cred' for searching later on.""" + self.cred = cred + + def search(self, query): + """Use this engine to search for 'query'. + + Not implemented in this base class; overridden in subclasses.""" + raise NotImplementedError() + + +class YahooBOSSSearchEngine(BaseSearchEngine): + def search(self, query): + """Do a Yahoo! BOSS web search for 'query'. + + Returns a list of URLs, no more than fifty, ranked by relevance (as + determined by Yahoo). Raises SearchQueryError() on errors. + """ + base_url = "http://yboss.yahooapis.com/ysearch/web" + query = quote_plus(query.join('"', '"')) + params = {"q": query, "style": "raw", "format": "json"} + url = "{0}?{1}".format(base_url, urlencode(params)) + + consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"]) + client = oauth.Client(consumer) + headers, body = client.request(url, "GET") + + if headers["status"] != "200": + e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" + raise SearchQueryError(e.format(headers["status"], body)) + + try: + res = loads(body) + except ValueError: + e = "Yahoo! BOSS Error: JSON could not be decoded" + raise SearchQueryError(e) + + try: + results = res["bossresponse"]["web"]["results"] + except KeyError: + return [] + return [result["url"] for result in results] From 86a84407304aef0850d98ef8e5b5e10a35d41b13 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 28 Mar 2012 15:23:09 -0400 Subject: [PATCH 03/19] Moving parsers to own file. 
--- earwigbot/wiki/constants.py | 9 +++-- earwigbot/wiki/copyvios/__init__.py | 38 +++--------------- earwigbot/wiki/copyvios/parsers.py | 80 +++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 36 deletions(-) create mode 100644 earwigbot/wiki/copyvios/parsers.py diff --git a/earwigbot/wiki/constants.py b/earwigbot/wiki/constants.py index 22aef9c..2431884 100644 --- a/earwigbot/wiki/constants.py +++ b/earwigbot/wiki/constants.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -34,6 +34,7 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const from earwigbot import __version__ as _v from platform import python_version as _p USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) +del _v, _p # Default namespace IDs: NS_MAIN = 0 diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 0aaa9b5..46b27e2 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -30,9 +30,10 @@ try: except ImportError: oauth = None -from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.exceptions import * class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -109,33 +110,6 @@ class CopyvioMixin(object): raise UnknownSearchEngineError(engine) - def _copyvio_strip_html(self, html): - """ - STUB - """ - return html - - def _copyvio_strip_article(self, content): - """Clean the page's raw text by removing templates and formatting. - - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation - (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are - replaced by their unicode equivalents. 
- - STUB - """ - return content - - def _copyvio_chunk_article(self, content, max_chunks): - """ - STUB - """ - return [content] - def _copyvio_compare_content(self, article, url): """ DOCSTRING NEEDED @@ -144,7 +118,7 @@ class CopyvioMixin(object): if not html: return 0 - source = MarkovChain(self._copyvio_strip_html(html)) + source = MarkovChain(HTMLTextParser(html).strip()) delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) @@ -182,8 +156,8 @@ class CopyvioMixin(object): empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) - clean = self._copyvio_strip_article(content) - chunks = self._copyvio_chunk_article(clean, max_queries) + clean = ArticleTextParser(content).strip() + chunks = ArticleTextParser(clean).chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -236,7 +210,7 @@ class CopyvioMixin(object): SearchQueryError will be raised. """ content = self.get(force) - clean = self._copyvio_strip_article(content) + clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py new file mode 100644 index 0000000..f9bb4c2 --- /dev/null +++ b/earwigbot/wiki/copyvios/parsers.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +class BaseTextParser(object): + def __init__(self, text): + self.text = text + + +class ArticleTextParser(BaseTextParser): + def strip(self): + """Clean the page's raw text by removing templates and formatting. + + Returns the page's text with all HTML and wikicode formatting removed, + including templates, tables, references, and the Bibliography/ + References/Sources/See also section(s). It retains punctuation + (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, + quotes) and original capitalization, but not brackets (square and + angular), abnormal spacing, nor anything else. HTML entities are + replaced by their unicode equivalents. + + The actual replacement is handled by a few private methods within this + class. 
+ """ + text = self._strip_tags(self.text) + text = self._strip_templates(text) + text = self._strip_sections(text) + text = self._strip_wikicode(text) + text = self._normalize(text) + return text + + def chunk(self, max_chunks): + """Convert the article text into a list of web-searchable chunks. + + No greater than max_chunks will be returned. Each chunk will only be a + couple sentences long at most. The idea here is to return a + representative sample of the article text rather than the entire + article, so we'll probably pick and choose from its introduction, body, + and conclusion, especially if the article is large and max_chunks are + few, so we don't end up just searching for the first paragraph. + """ + return [self.text] + + def _strip_tags(self, text): + return text + + def _strip_templates(self, text): + return text + + def _strip_sections(self, text): + return text + + def _strip_wikicode(self, text): + return text + + def _normalize(self, text): + return text + + +class HTMLTextParser(BaseTextParser): + def strip(self): + return self.text From 5ca1d91f3e398545c476bd317826b04941e6f98d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Apr 2012 12:33:13 -0400 Subject: [PATCH 04/19] Use __all__ within e.w.copyvios and shorter imports --- earwigbot/wiki/copyvios/__init__.py | 8 +++++--- earwigbot/wiki/copyvios/markov.py | 2 ++ earwigbot/wiki/copyvios/parsers.py | 2 ++ earwigbot/wiki/copyvios/search.py | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46b27e2..a206a70 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -30,11 +30,13 @@ try: except ImportError: oauth = None -from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser -from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.copyvios.markov import * +from earwigbot.wiki.copyvios.parsers import * +from earwigbot.wiki.copyvios.search import * from earwigbot.wiki.exceptions import * +__all__ = ["CopyvioCheckResult", "CopyvioMixin"] + class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 4e77ebc..74783d0 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -23,6 +23,8 @@ from collections import defaultdict from re import sub, UNICODE +__all__ = ["MarkovChain", "MarkovChainIntersection"] + class MarkovChain(object): START = -1 END = -2 diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index f9bb4c2..9e97267 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] + class BaseTextParser(object): def __init__(self, text): self.text = text diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 59287cc..bc9dfe4 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -30,6 +30,8 @@ except ImportError: from earwigbot.wiki.exceptions import SearchQueryError +__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] + class BaseSearchEngine(object): def __init__(self, cred): """Store credentials 'cred' for searching later on.""" From 7dbbe9683cbe6799528e8a7d8f6c1104f9813e67 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 21:22:22 -0400 Subject: [PATCH 05/19] Update imports and exceptions. --- earwigbot/wiki/copyvios/__init__.py | 16 +++++++++------- earwigbot/wiki/copyvios/search.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index f657f5b..30d4681 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -31,9 +31,9 @@ except ImportError: oauth = None from earwigbot import exceptions -from earwigbot.wiki.copyvios.markov import * -from earwigbot.wiki.copyvios.parsers import * -from earwigbot.wiki.copyvios.search import * +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser +from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine __all__ = ["CopyvioCheckResult", "CopyvioMixIn"] @@ -107,14 +107,16 @@ class CopyvioMixIn(object): if engine == "Yahoo! BOSS": if not oauth: e = "The package 'oauth2' could not be imported" - raise UnsupportedSearchEngineError(e) + raise exceptions.UnsupportedSearchEngineError(e) return YahooBOSSSearchEngine(credentials) - raise UnknownSearchEngineError(engine) + raise exceptions.UnknownSearchEngineError(engine) def _copyvio_compare_content(self, article, url): - """ - DOCSTRING NEEDED + """Return a number comparing an article and a URL. + + The *article* is a Markov chain, whereas the URL is a string that we + will try to open ourselves. """ html = self._open_url_ignoring_errors(url) if not html: diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index bc9dfe4..d8091ee 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -28,7 +28,7 @@ try: except ImportError: oauth = None -from earwigbot.wiki.exceptions import SearchQueryError +from earwigbot.exceptions import SearchQueryError __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] From d87c226417f08716713e6f190f72d8b59d2eef35 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 21:50:57 -0400 Subject: [PATCH 06/19] __repr__ and __str__ for everything per #5 and #22. 
--- earwigbot/wiki/copyvios/__init__.py | 11 +++++++++-- earwigbot/wiki/copyvios/markov.py | 19 +++++++++++++++++++ earwigbot/wiki/copyvios/parsers.py | 9 +++++++++ earwigbot/wiki/copyvios/search.py | 8 ++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 30d4681..f85ab22 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -48,8 +48,15 @@ class CopyvioCheckResult(object): self.delta_chain = chains[1] def __repr__(self): - r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" - return r.format(self.violation, self.confidence, self.url, self.queries) + """Return the canonical string representation of the result.""" + res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + return res.format(self.violation, self.confidence, self.url, + self.queries) + + def __str__(self): + """Return a nice string representation of the result.""" + res = "" + return res.format(self.violation, self.confidence) class CopyvioMixIn(object): diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 74783d0..081469f 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -42,6 +42,14 @@ class MarkovChain(object): except KeyError: pass + def __repr__(self): + """Return the canonical string representation of the MarkovChain.""" + return "MarkovChain(text={0!r})".format(self.text) + + def __str__(self): + """Return a nice string representation of the MarkovChain.""" + return "".format(self.size()) + def size(self): count = 0 for node in self.chain.itervalues(): @@ -53,6 +61,7 @@ class MarkovChain(object): class MarkovChainIntersection(MarkovChain): def __init__(self, mc1, mc2): self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + self.mc1, self.mc2 = mc1, mc2 c1 = mc1.chain c2 = mc2.chain @@ -63,3 +72,13 @@ class MarkovChainIntersection(MarkovChain): if node in nodes2: count2 = nodes2[node] self.chain[word][node] = min(count1, count2) + + def __repr__(self): + """Return the canonical string representation of the intersection.""" + res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" + return res.format(self.mc1, self.mc2) + + def __str__(self): + """Return a nice string representation of the intersection.""" + res = "" + return res.format(self.size(), self.mc1, self.mc2) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 9e97267..0c3c17b 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -23,6 +23,15 @@ __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): + def __repr__(self): + """Return the canonical string representation of the text parser.""" + return "{0}(text={1!r})".format(self.__class__.__name__, self.text) + + def __str__(self): + """Return a nice string representation of the text parser.""" + name = self.__class__.__name__ + return "<{0} of text with size {1}>".format(name, len(text)) + def __init__(self, text): self.text = text diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index d8091ee..4345b29 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -37,6 +37,14 @@ class BaseSearchEngine(object): """Store credentials 'cred' for searching later on.""" self.cred = cred + def __repr__(self): + """Return the canonical string representation of 
the search engine.""" + return "{0}()".format(self.__class__.__name__) + + def __str__(self): + """Return a nice string representation of the search engine.""" + return "<{0}>".format(self.__class__.__name__) + def search(self, query): """Use this engine to search for 'query'. From d45e342bac59c8587c8e34c2c794023452ef6fda Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 22:55:23 -0400 Subject: [PATCH 07/19] DOCUMENT EVERYTHING (#5) Also implementing MWParserFromHell, plus some cleanup. --- docs/api/earwigbot.wiki.copyvios.rst | 33 +++++++++++++ docs/api/earwigbot.wiki.rst | 14 +++--- earwigbot/wiki/copyvios/__init__.py | 91 ++++++++++++++++++++---------------- earwigbot/wiki/copyvios/markov.py | 4 ++ earwigbot/wiki/copyvios/parsers.py | 66 +++++++++++--------------- earwigbot/wiki/copyvios/search.py | 19 +++++--- 6 files changed, 136 insertions(+), 91 deletions(-) create mode 100644 docs/api/earwigbot.wiki.copyvios.rst diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst new file mode 100644 index 0000000..7dbcf39 --- /dev/null +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -0,0 +1,33 @@ +copyvios Package +================ + +:mod:`copyvios` Package +----------------------- + +.. automodule:: earwigbot.wiki.copyvios + :members: + :undoc-members: + +:mod:`markov` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.markov + :members: + :undoc-members: + :show-inheritance: + +:mod:`parsers` Module +--------------------- + +.. automodule:: earwigbot.wiki.copyvios.parsers + :members: + :undoc-members: + :show-inheritance: + +:mod:`search` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.search + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/earwigbot.wiki.rst b/docs/api/earwigbot.wiki.rst index 806b3eb..45b009b 100644 --- a/docs/api/earwigbot.wiki.rst +++ b/docs/api/earwigbot.wiki.rst @@ -22,13 +22,6 @@ wiki Package :members: :undoc-members: -:mod:`copyright` Module ------------------------ - -.. automodule:: earwigbot.wiki.copyright - :members: - :undoc-members: - :mod:`page` Module ------------------ @@ -57,3 +50,10 @@ wiki Package .. automodule:: earwigbot.wiki.user :members: :undoc-members: + +Subpackages +----------- + +.. toctree:: + + earwigbot.wiki.copyvios diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index f85ab22..2c2bb23 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine __all__ = ["CopyvioCheckResult", "CopyvioMixIn"] class CopyvioCheckResult(object): + """ + **EarwigBot: Wiki Toolset: Copyvio Check Result** + + A class holding information about the results of a copyvio check. 
+ + *Attributes:* + + - :py:attr:`violation`: ``True`` if this is a violation, else ``False`` + - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy + - :py:attr:`url`: the URL of the violated page + - :py:attr:`queries`: the number of queries used to reach a result + - :py:attr:`article_chain`: the MarkovChain of the article text + - :py:attr:`source_chain`: the MarkovChain of the violated page text + - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two + """ + def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -61,14 +77,15 @@ class CopyvioCheckResult(object): class CopyvioMixIn(object): """ - EarwigBot's Wiki Toolset: Copyright Violation Mixin + **EarwigBot: Wiki Toolset: Copyright Violation MixIn** - This is a mixin that provides two public methods, copyvio_check() and - copyvio_compare(). The former checks the page for copyright violations - using a search engine API, and the latter compares the page against a - specified URL. Credentials for the search engine API are stored in the - site's config. + This is a mixin that provides two public methods, :py:meth:`copyvio_check` + and :py:meth:`copyvio_compare`. The former checks the page for copyright + violations using a search engine API, and the latter compares the page + against a given URL. Credentials for the search engine API are stored in + the :py:class:`~earwigbot.wiki.site.Site`'s config. """ + def __init__(self, site): self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -100,10 +117,10 @@ class CopyvioMixIn(object): def _select_search_engine(self): """Return a function that can be called to do web searches. - The "function" is a functools.partial object that takes one argument, a - query, and returns a list of URLs, ranked by importance. The underlying - logic depends on the 'engine' argument; for example, if 'engine' is - "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. + The function takes one argument, a search query, and returns a list of + URLs, ranked by importance. The underlying logic depends on the + *engine* argument within our config; for example, if *engine* is + "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to us, and UnsupportedSearchEngineError if we are missing a @@ -122,8 +139,8 @@ class CopyvioMixIn(object): def _copyvio_compare_content(self, article, url): """Return a number comparing an article and a URL. - The *article* is a Markov chain, whereas the URL is a string that we - will try to open ourselves. + The *article* is a Markov chain, whereas the *url* is just a string + that we'll try to open and read ourselves. """ html = self._open_url_ignoring_errors(url) if not html: @@ -134,30 +151,22 @@ class CopyvioMixIn(object): return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, - interquery_sleep=1, force=False): + interquery_sleep=1): """Check the page for copyright violations. - Returns a _CopyvioCheckResult object with four useful attributes: - "violation", "confidence", "url", and "queries". 
"confidence" is a - number between 0 and 1; if it is less than "min_confidence", we could - not find any indication of a violation (so "violation" will be False - and "url" may or may not be None), otherwise it indicates the relative - faith in our results, "violation" will be True, and "url" will be the - place the article is suspected of being copied from. "queries" is the - number of queries used to determine the results. + Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` + object with information on the results of the check. - "max_queries" is self-explanatory; we will never make more than this - number of queries in a given check. If it's less than 0, we will not - limit our number of queries. + *max_queries* is self-explanatory; we will never make more than this + number of queries in a given check. If it's lower than 0, we will not + limit the number of queries. - "interquery_sleep" is the minimum amount of time we will sleep between + *interquery_sleep* is the minimum amount of time we will sleep between search engine queries, in seconds. - "force" is simply passed to page.get() - it has the same behavior there - as it does here. - - Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, - SearchQueryError, ...) on errors. + Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses + (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, + :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. """ searcher = self._select_search_engine() handled_urls = [] @@ -166,9 +175,9 @@ class CopyvioMixIn(object): num_queries = 0 empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) - content = self.get(force) - clean = ArticleTextParser(content).strip() - chunks = ArticleTextParser(clean).chunk(max_queries) + parser = ArticleTextParser(self.get()) + clean = parser.strip() + chunks = parser.chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -200,13 +209,14 @@ class CopyvioMixIn(object): return CopyvioCheckResult(v, best_confidence, best_match, num_queries, article_chain, best_chains) - def copyvio_compare(self, url, min_confidence=0.5, force=False): - """Check the page like copyvio_check(), but against a specific URL. + def copyvio_compare(self, url, min_confidence=0.5): + """Check the page like :py:meth:`copyvio_check` against a specific URL. This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - _CopyvioCheckResult object - without using a search engine, as the - suspected "violated" URL is supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but + without using a search engine, since the suspected "violated" URL is + supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a @@ -217,10 +227,11 @@ class CopyvioMixIn(object): be stored for data retention reasons, so a fresh comparison is made using this function. - Since no searching is done, neither UnknownSearchEngineError nor - SearchQueryError will be raised. + Since no searching is done, neither + :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor + :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. 
""" - content = self.get(force) + content = self.get() clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 081469f..657b4b9 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -26,6 +26,7 @@ from re import sub, UNICODE __all__ = ["MarkovChain", "MarkovChainIntersection"] class MarkovChain(object): + """Implements a basic bigram Markov chain of words.""" START = -1 END = -2 @@ -51,6 +52,7 @@ class MarkovChain(object): return "".format(self.size()) def size(self): + """Return the size of the Markov chain: the total number of nodes.""" count = 0 for node in self.chain.itervalues(): for hits in node.itervalues(): @@ -59,6 +61,8 @@ class MarkovChain(object): class MarkovChainIntersection(MarkovChain): + """Implements the intersection of two chains (i.e., their shared nodes).""" + def __init__(self, mc1, mc2): self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.mc1, self.mc2 = mc1, mc2 diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 0c3c17b..8a31127 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,9 +20,19 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +try: + import mwparserfromhell +except ImportError: + mwparserfromhell = None + __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): + """Base class for a parser that handles text.""" + + def __init__(self, text): + self.text = text + def __repr__(self): """Return the canonical string representation of the text parser.""" return "{0}(text={1!r})".format(self.__class__.__name__, self.text) @@ -32,60 +42,40 @@ class BaseTextParser(object): name = self.__class__.__name__ return "<{0} of text with size {1}>".format(name, len(text)) - def __init__(self, text): - self.text = text - class ArticleTextParser(BaseTextParser): + """A parser that can strip and chunk wikicode article text.""" + def strip(self): """Clean the page's raw text by removing templates and formatting. - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation + Return the page's text with all HTML and wikicode formatting removed, + including templates, tables, and references. It retains punctuation (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are + quotes), original capitalization, and so forth. HTML entities are replaced by their unicode equivalents. - The actual replacement is handled by a few private methods within this - class. + The actual stripping is handled by :py:mod:`mwparserfromhell`. """ - text = self._strip_tags(self.text) - text = self._strip_templates(text) - text = self._strip_sections(text) - text = self._strip_wikicode(text) - text = self._normalize(text) - return text + wikicode = mwparserfromhell.parse(self.text) + self.clean = u" ".join(wikicode.normalize().ifilter_text()) + return self.clean def chunk(self, max_chunks): - """Convert the article text into a list of web-searchable chunks. + """Convert the clean article text into a list of web-searchable chunks. 
- No greater than max_chunks will be returned. Each chunk will only be a - couple sentences long at most. The idea here is to return a + No greater than *max_chunks* will be returned. Each chunk will only be + a couple sentences long at most. The idea here is to return a representative sample of the article text rather than the entire article, so we'll probably pick and choose from its introduction, body, - and conclusion, especially if the article is large and max_chunks are - few, so we don't end up just searching for the first paragraph. + and conclusion, especially if the article is large and *max_chunks* is + low, so we don't end up just searching for the first paragraph. """ - return [self.text] - - def _strip_tags(self, text): - return text - - def _strip_templates(self, text): - return text - - def _strip_sections(self, text): - return text - - def _strip_wikicode(self, text): - return text - - def _normalize(self, text): - return text + return [self.text] # TODO: NotImplemented class HTMLTextParser(BaseTextParser): + """A parser that can extract the text from an HTML document.""" + def strip(self): - return self.text + return self.text # TODO: NotImplemented diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 4345b29..ac40613 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): + """Base class for a simple search engine interface.""" + def __init__(self, cred): - """Store credentials 'cred' for searching later on.""" + """Store credentials *cred* for searching later on.""" self.cred = cred def __repr__(self): @@ -46,25 +48,30 @@ class BaseSearchEngine(object): return "<{0}>".format(self.__class__.__name__) def search(self, query): - """Use this engine to search for 'query'. + """Use this engine to search for *query*. - Not implemented in this base class; overridden in subclasses.""" + Not implemented in this base class; overridden in subclasses. + """ raise NotImplementedError() class YahooBOSSSearchEngine(BaseSearchEngine): + """A search engine interface with Yahoo! BOSS.""" + def search(self, query): - """Do a Yahoo! BOSS web search for 'query'. + """Do a Yahoo! BOSS web search for *query*. Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. + determined by Yahoo). Raises + :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ base_url = "http://yboss.yahooapis.com/ysearch/web" query = quote_plus(query.join('"', '"')) params = {"q": query, "style": "raw", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) - consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"]) + consumer = oauth.Consumer(key=self.cred["key"], + secret=self.cred["secret"]) client = oauth.Client(consumer) headers, body = client.request(url, "GET") From 1af4217b63a10faf41547501a9d2ec688344945d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 00:16:54 -0400 Subject: [PATCH 08/19] Update copyright notices and some other improvements. 
--- docs/api/modules.rst | 2 +- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/markov.py | 2 +- earwigbot/wiki/copyvios/parsers.py | 2 +- earwigbot/wiki/copyvios/search.py | 2 +- earwigbot/wiki/page.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/api/modules.rst b/docs/api/modules.rst index 7c4c110..3bf56b4 100644 --- a/docs/api/modules.rst +++ b/docs/api/modules.rst @@ -2,6 +2,6 @@ earwigbot ========= .. toctree:: - :maxdepth: 4 + :maxdepth: 6 earwigbot diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 2c2bb23..a17f800 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 657b4b9..28cdb97 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 8a31127..565acff 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index ac40613..a768141 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 248334e..3125b33 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -35,7 +35,7 @@ from earwigbot.wiki.copyvios import CopyvioMixIn __all__ = ["Page"] -class Page(CopyvioMixin): +class Page(CopyvioMixIn): """ **EarwigBot: Wiki Toolset: Page** From cb870041079843c521a65e74784d41e224ffadd9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 03:37:15 -0400 Subject: [PATCH 09/19] Primitive screen scraper for HTML using BeautifulSoup and LXML. Obviously this can and should be improved significantly later, but it seems good enough for now. --- earwigbot/wiki/copyvios/parsers.py | 27 ++++++++++++++++++++++++++- setup.py | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 565acff..8b9655b 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,6 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import htmlentitydefs + +try: + from bs4 import BeautifulSoup +except ImportError: + BeautifulSoup = None + try: import mwparserfromhell except ImportError: @@ -76,6 +83,24 @@ class ArticleTextParser(BaseTextParser): class HTMLTextParser(BaseTextParser): """A parser that can extract the text from an HTML document.""" + hidden_tags = [ + "script", "style" + ] def strip(self): - return self.text # TODO: NotImplemented + """Return the actual text contained within an HTML document. + + Implemented using :py:mod:`BeautifulSoup ` + (http://www.crummy.com/software/BeautifulSoup/). + """ + try: + soup = BeautifulSoup(self.text, "lxml").body + except ValueError: + soup = BeautifulSoup(self.text).body + + is_comment = lambda text: isinstance(text, bs4.element.Comment) + [comment.extract() for comment in soup.find_all(text=is_comment)] + for tag in self.hidden_tags: + [element.extract() for element in soup.find_all(tag)] + + return "\n".join(soup.stripped_strings) diff --git a/setup.py b/setup.py index 9db6676..3c3c7cd 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,8 @@ setup( entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git "PyYAML >= 3.10", # Config parsing + "beautifulsoup4 >= 4.1.1", # HTML parsing/scraping + "lxml >= 2.3.4", # Faster parser for BeautifulSoup "mwparserfromhell >= 0.1", # Wikicode parsing "oursql >= 0.9.3", # Talking with MediaWiki databases "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search From bf1ad08dc66ce4cc8e3e0110a7a9e311ef95f44b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 04:30:42 -0400 Subject: [PATCH 10/19] Make Markov chain degree-independent. Testing trigrams. --- earwigbot/wiki/copyvios/markov.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 28cdb97..00567b2 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -26,22 +26,21 @@ from re import sub, UNICODE __all__ = ["MarkovChain", "MarkovChainIntersection"] class MarkovChain(object): - """Implements a basic bigram Markov chain of words.""" + """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 + degree = 3 # 2 for bigrams, 3 for trigrams, etc. def __init__(self, text): self.text = text self.chain = defaultdict(lambda: defaultdict(lambda: 0)) words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() - prev = self.START - for word in words: - self.chain[prev][word] += 1 - prev = word - try: # This won't work if the source text is completely blank - self.chain[word][self.END] += 1 - except KeyError: - pass + + padding = self.degree - 1 + words = ([self.START] * padding) + words + ([self.END] * padding) + for i in range(len(words) - self.degree + 1): + last = i + self.degree - 1 + self.chain[words[i:last]][last] += 1 def __repr__(self): """Return the canonical string representation of the MarkovChain.""" From 17eee28a4bf42b01df072daddfefba611eb4171f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 04:32:52 -0400 Subject: [PATCH 11/19] Whoops, got the slicing wrong. 
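
Two notes before the fix below. First, patch 09's HTMLTextParser.strip()
references bs4.element.Comment inside its lambda, but only BeautifulSoup is
imported from bs4, so the comment-stripping line raises NameError as written.
A sketch with the import made explicit (the fix and the function name are my
assumptions, not something this series ships):

    from bs4 import BeautifulSoup
    from bs4.element import Comment

    HIDDEN_TAGS = ["script", "style"]

    def strip_html(html):
        try:
            soup = BeautifulSoup(html, "lxml").body
        except ValueError:  # lxml missing; fall back to the default parser
            soup = BeautifulSoup(html).body
        for comment in soup.find_all(text=lambda t: isinstance(t, Comment)):
            comment.extract()
        for tag in HIDDEN_TAGS:  # drop non-rendered content
            for element in soup.find_all(tag):
                element.extract()
        return "\n".join(soup.stripped_strings)

    text = strip_html("<body><p>kept</p><script>dropped()</script></body>")
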
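
Second, the one-line fix below gets the indexing right, but words[i:last] is
still a list, and a list cannot key a dict, so the generalized chain raises
TypeError on its first insert until the slice is converted to a tuple (the
conversion is an assumption on my part, not part of this patch). A sketch of
the degree-independent construction with that conversion applied:

    from collections import defaultdict

    START, END = -1, -2
    DEGREE = 3  # 2 for bigrams, 3 for trigrams, etc.

    def build_chain(words):
        chain = defaultdict(lambda: defaultdict(int))
        padding = DEGREE - 1
        words = ([START] * padding) + words + ([END] * padding)
        for i in range(len(words) - DEGREE + 1):
            last = i + DEGREE - 1
            # tuple() makes the (n-1)-word context hashable
            chain[tuple(words[i:last])][words[last]] += 1
        return chain

    chain = build_chain("the cat sat on the mat".split())
    # sample transitions: (START, START) -> "the", ("the", "cat") -> "sat"
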
--- earwigbot/wiki/copyvios/markov.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 00567b2..7813f61 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -40,7 +40,7 @@ class MarkovChain(object): words = ([self.START] * padding) + words + ([self.END] * padding) for i in range(len(words) - self.degree + 1): last = i + self.degree - 1 - self.chain[words[i:last]][last] += 1 + self.chain[words[i:last]][words[last]] += 1 def __repr__(self): """Return the canonical string representation of the MarkovChain.""" From 569c815d994587347c8734e01751523eab193bf4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 16:40:27 -0400 Subject: [PATCH 12/19] Implement NLTK for chunking article content (#5). --- earwigbot/wiki/copyvios/__init__.py | 6 ++++-- earwigbot/wiki/copyvios/parsers.py | 34 ++++++++++++++++++++++++++-------- earwigbot/wiki/copyvios/search.py | 2 +- earwigbot/wiki/site.py | 2 +- earwigbot/wiki/sitesdb.py | 15 ++++++++++++++- setup.py | 31 ++++++++++++++++++++----------- 6 files changed, 66 insertions(+), 24 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index a17f800..5fb7bf2 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -87,6 +87,7 @@ class CopyvioMixIn(object): """ def __init__(self, site): + self._search_config = site._search_config self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -126,7 +127,8 @@ class CopyvioMixIn(object): unknown to us, and UnsupportedSearchEngineError if we are missing a required package or module, like oauth2 for "Yahoo! BOSS". """ - engine, credentials = self._site._search_config + engine = self._search_config["engine"] + credentials = self._search_config["credentials"] if engine == "Yahoo! BOSS": if not oauth: @@ -177,7 +179,7 @@ class CopyvioMixIn(object): best_chains = (empty, MarkovChainIntersection(empty, empty)) parser = ArticleTextParser(self.get()) clean = parser.strip() - chunks = parser.chunk(max_queries) + chunks = parser.chunk(max_queries, self._search_config["nltk_dir"]) article_chain = MarkovChain(clean) last_query = time() diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 8b9655b..a00369d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import htmlentitydefs +from os import path try: from bs4 import BeautifulSoup @@ -32,6 +32,11 @@ try: except ImportError: mwparserfromhell = None +try: + import nltk +except ImportError: + nltk = None + __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): @@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser): self.clean = u" ".join(wikicode.normalize().ifilter_text()) return self.clean - def chunk(self, max_chunks): + def chunk(self, max_chunks, nltk_dir): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be - a couple sentences long at most. 
The idea here is to return a - representative sample of the article text rather than the entire - article, so we'll probably pick and choose from its introduction, body, - and conclusion, especially if the article is large and *max_chunks* is - low, so we don't end up just searching for the first paragraph. + a sentence or two long at most. The idea here is to return a + representative sample of the article text rather than the whole, so + we'll probably pick and choose from its introduction, body, and + conclusion, especially if the article is large and *max_chunks* is low, + so we don't end up just searching for the first paragraph. + + This is implemented using :py:mod:`nltk` (http://nltk.org/). A base + directory (*nltk_dir*) is required to store nltk's punctuation + database. This is typically located in the bot's working directory. """ - return [self.text] # TODO: NotImplemented + datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") + try: + tokenizer = nltk.data.load(datafile) + except LookupError: + nltk.download("punkt", nltk_dir) + tokenizer = nltk.data.load(datafile) + + sentences = tokenizer.tokenize(self.clean) + #if max_chunks >= len(sentences): + # return sentences class HTMLTextParser(BaseTextParser): diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index a768141..cf2edb4 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine): """ base_url = "http://yboss.yahooapis.com/ysearch/web" query = quote_plus(query.join('"', '"')) - params = {"q": query, "style": "raw", "format": "json"} + params = {"q": query, "type": "html,text", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) consumer = oauth.Consumer(key=self.cred["key"], diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 4d88505..f627a02 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -92,7 +92,7 @@ class Site(object): namespaces=None, login=(None, None), cookiejar=None, user_agent=None, use_https=False, assert_edit=None, maxlag=None, wait_between_queries=3, logger=None, - search_config=(None, None)): + search_config=None): """Constructor for new Site instances. 
This probably isn't necessary to call yourself unless you're building a
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 1f3265b..5af7e3a 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -192,6 +192,10 @@ class SitesDB(object):
             user_agent = user_agent.replace("$1", __version__)
             user_agent = user_agent.replace("$2", python_version())
 
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -360,14 +364,23 @@ class SitesDB(object):
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
         wait_between_queries = config.wiki.get("waitTime", 5)
+        logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 
+        if user_agent:
+            user_agent = user_agent.replace("$1", __version__)
+            user_agent = user_agent.replace("$2", python_version())
+
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         # Create a Site object to log in and load the other attributes:
         site = Site(base_url=base_url, script_path=script_path, sql=sql,
                     login=login, cookiejar=cookiejar, user_agent=user_agent,
                     use_https=use_https, assert_edit=assert_edit,
                     maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)
 
         self._add_site_to_sitesdb(site)
         self._sites[site.name] = site
diff --git a/setup.py b/setup.py
index 3c3c7cd..b68ae4d 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages
 
 from earwigbot import __version__
 
+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+
+dependencies = [
+    "GitPython >= 0.3.2.RC1",  # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10",  # Parsing config files
+    "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1",  # Parsing wikicode for manipulation
+    "nltk >= 2.0.2",  # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3",  # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211",  # Interfacing with Yahoo!
BOSS Search for copyvios + "py-bcrypt >= 0.2", # Hashing the bot key in the config file + "pycrypto >= 2.5", # Storing bot passwords and keys in the config file + "pytz >= 2012c", # Handling timezones for the !time IRC command +] + with open("README.rst") as fp: long_docs = fp.read() @@ -32,17 +51,7 @@ setup( name = "earwigbot", packages = find_packages(exclude=("tests",)), entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, - install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git - "PyYAML >= 3.10", # Config parsing - "beautifulsoup4 >= 4.1.1", # HTML parsing/scraping - "lxml >= 2.3.4", # Faster parser for BeautifulSoup - "mwparserfromhell >= 0.1", # Wikicode parsing - "oursql >= 0.9.3", # Talking with MediaWiki databases - "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search - "py-bcrypt >= 0.2", # Password hashing in config - "pycrypto >= 2.5", # Storing bot passwords and keys - "pytz >= 2012c", # Timezone handling - ], + install_requires = dependencies, test_suite = "tests", version = __version__, author = "Ben Kurtovic", From c260648bdb2a45a9c0a76f6e4df53889f28f270c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 21:40:54 -0400 Subject: [PATCH 13/19] Finish chunking algorithm, improve !link, other fixes. --- earwigbot/commands/link.py | 14 ++++------- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/parsers.py | 50 +++++++++++++++++++++++++++---------- earwigbot/wiki/site.py | 6 ++--- earwigbot/wiki/sitesdb.py | 2 +- 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/earwigbot/commands/link.py b/earwigbot/commands/link.py index 0b54554..ebe3669 100644 --- a/earwigbot/commands/link.py +++ b/earwigbot/commands/link.py @@ -30,6 +30,7 @@ class Link(Command): name = "link" def process(self, data): + self.site = self.bot.wiki.get_site() msg = data.msg if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): @@ -41,8 +42,8 @@ class Link(Command): if not data.args: self.reply(data, "what do you want me to link to?") return - pagename = ' '.join(data.args) - link = self.parse_link(pagename) + pagename = " ".join(data.args) + link = self.site.get_page(pagename).url self.reply(data, link) def parse_line(self, line): @@ -56,8 +57,7 @@ class Link(Command): if links: # re.findall() returns a list of tuples, but we only want the 2nd # item in each tuple: - links = [i[1] for i in links] - results = map(self.parse_link, links) + results = [self.site.get_page(name[1]).url for name in links] # Find all {{templates}} templates = re.findall("(\{\{(.*?)(\||\}\}))", line) @@ -67,10 +67,6 @@ class Link(Command): return results - def parse_link(self, pagename): - link = quote(pagename.replace(" ", "_"), safe="/:") - return "".join(("http://enwp.org/", link)) - def parse_template(self, pagename): pagename = "".join(("Template:", pagename)) - return self.parse_link(pagename) + return self.site.get_page(pagename).url diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 5fb7bf2..cf2ddde 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -179,7 +179,7 @@ class CopyvioMixIn(object): best_chains = (empty, MarkovChainIntersection(empty, empty)) parser = ArticleTextParser(self.get()) clean = parser.strip() - chunks = parser.chunk(max_queries, self._search_config["nltk_dir"]) + chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) article_chain = MarkovChain(clean) last_query = time() diff --git a/earwigbot/wiki/copyvios/parsers.py 
b/earwigbot/wiki/copyvios/parsers.py index a00369d..b258730 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser): The actual stripping is handled by :py:mod:`mwparserfromhell`. """ wikicode = mwparserfromhell.parse(self.text) - self.clean = u" ".join(wikicode.normalize().ifilter_text()) + self.clean = wikicode.strip_code(normalize=True) return self.clean - def chunk(self, max_chunks, nltk_dir): + def chunk(self, nltk_dir, max_chunks, max_query=256): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be - a sentence or two long at most. The idea here is to return a - representative sample of the article text rather than the whole, so - we'll probably pick and choose from its introduction, body, and - conclusion, especially if the article is large and *max_chunks* is low, - so we don't end up just searching for the first paragraph. + a sentence or two long at most (no more than *max_query*). The idea is + to return a sample of the article text rather than the whole, so we'll + pick and choose from parts of it, especially if the article is large + and *max_chunks* is low, so we don't end up just searching for just the + first paragraph. This is implemented using :py:mod:`nltk` (http://nltk.org/). A base directory (*nltk_dir*) is required to store nltk's punctuation @@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser): """ datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") try: - tokenizer = nltk.data.load(datafile) + tokenizer = nltk.data.load("file:" + datafile) except LookupError: nltk.download("punkt", nltk_dir) - tokenizer = nltk.data.load(datafile) - - sentences = tokenizer.tokenize(self.clean) - #if max_chunks >= len(sentences): - # return sentences + tokenizer = nltk.data.load("file:" + datafile) + + sentences = [] + for sentence in tokenizer.tokenize(self.clean): + if len(sentence) > max_query: + words = sentence.split() + while len(" ".join(words)) > max_query: + words.pop() + sentence = " ".join(words) + sentences.append(sentence) + + if max_chunks >= len(sentences): + return sentences + + chunks = [] + while len(chunks) < max_chunks: + if len(chunks) % 5 == 0: + chunk = sentences.pop(0) # Pop from beginning + elif len(chunks) % 5 == 1: + chunk = sentences.pop() # Pop from end + elif len(chunks) % 5 == 2: + chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 + elif len(chunks) % 5 == 3: + chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 + else: + chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 + chunks.append(chunk) + + return chunks class HTMLTextParser(BaseTextParser): diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index f627a02..8261703 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -560,10 +560,10 @@ class Site(object): return [self.SERVICE_API] sqllag = self._sql_info_cache["replag"] - if sqllag > 180: + if sqllag > 300: if not self._maxlag: return [self.SERVICE_API, self.SERVICE_SQL] - if now - self._api_info_cache["lastcheck"] > 120: + if now - self._api_info_cache["lastcheck"] > 300: self._api_info_cache["lastcheck"] = now try: self._api_info_cache["maxlag"] = apilag = self.get_maxlag() @@ -571,7 +571,7 @@ class Site(object): self._api_info_cache["maxlag"] = apilag = 0 else: apilag = self._api_info_cache["maxlag"] - if sqllag / (180.0 / self._maxlag) < apilag: + if apilag > self._maxlag: return 
[self.SERVICE_SQL, self.SERVICE_API]
         return [self.SERVICE_API, self.SERVICE_SQL]
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 5af7e3a..fd3c521 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 

From 3744a34f28f88c94f71aa79bc823ba20aca2b3c3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 7 Jul 2012 22:59:15 -0400
Subject: [PATCH 14/19] Allow templated SQL connection info.

---
 docs/toolset.rst          |  3 ++-
 earwigbot/wiki/sitesdb.py | 25 ++++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/docs/toolset.rst b/docs/toolset.rst
index c7808d2..fcdfc6d 100644
--- a/docs/toolset.rst
+++ b/docs/toolset.rst
@@ -47,7 +47,8 @@ wikis, you can usually use code like this::
     site = bot.wiki.add_site(project=project, lang=lang)
 
 This works because EarwigBot assumes that the URL for the site is
-``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might
+``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
+connection info (if any) are stored as ``config.wiki["sql"]``. This might
 change if you're dealing with non-WMF wikis, where the code might look
 something more like::
 
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index fd3c521..cdff1fe 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -196,6 +196,12 @@ class SitesDB(object):
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
 
+        if not sql:
+            sql = config.wiki.get("sql", {})
+            for key, value in sql.iteritems():
+                if "$1" in value:
+                    sql[key] = value.replace("$1", name)
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -336,13 +342,12 @@ class SitesDB(object):
         the script path (meaning the API is located at
         ``"{base_url}{script_path}/api.php"`` ->
         ``"//{lang}.{project}.org/w/api.php"``), so this is the default. If
-        your wiki is different, provide the script_path as an argument. The
-        only other argument to :py:class:`~earwigbot.wiki.site.Site` that we
-        can't get from config files or by querying the wiki itself is SQL
-        connection info, so provide a dict of kwargs as *sql* and Site will
-        pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing
-        you to make queries with :py:meth:`site.sql_query
-        <earwigbot.wiki.site.Site.sql_query>`.
+        your wiki is different, provide the script_path as an argument. SQL
+        connection settings are guessed automatically using config's template
+        value. If this is wrong or not specified, provide a dict of kwargs as
+        *sql* and Site will pass it to :py:func:`oursql.connect(**sql)
+        <oursql.connect>`, allowing you to make queries with
+        :py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
Returns ``True`` if the site was added successfully or ``False`` if the site is already in our sitesdb (this can be done purposefully to update @@ -375,6 +380,12 @@ class SitesDB(object): nltk_dir = path.join(self.config.root_dir, ".nltk") search_config["nltk_dir"] = nltk_dir + if not sql: + sql = config.wiki.get("sql", {}) + for key, value in sql.iteritems(): + if "$1" in value: + sql[key] = value.replace("$1", name) + # Create a Site object to log in and load the other attributes: site = Site(base_url=base_url, script_path=script_path, sql=sql, login=login, cookiejar=cookiejar, user_agent=user_agent, From a074da853bd8956803b9f0061e12d4ca1d32cff0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 14:44:15 -0400 Subject: [PATCH 15/19] More work on copyvios, including an exclusions database (#5) * Added exclusions module with a fully implemented ExclusionsDB that can pull from multiple sources for different sites. * Moved CopyvioCheckResult to its own module, to be imported by __init__. * Some other related changes. --- docs/api/earwigbot.wiki.copyvios.rst | 14 +++ docs/toolset.rst | 6 +- earwigbot/wiki/copyvios/__init__.py | 56 +++--------- earwigbot/wiki/copyvios/exclusions.py | 155 ++++++++++++++++++++++++++++++++++ earwigbot/wiki/copyvios/result.py | 60 +++++++++++++ earwigbot/wiki/sitesdb.py | 8 ++ 6 files changed, 252 insertions(+), 47 deletions(-) create mode 100644 earwigbot/wiki/copyvios/exclusions.py create mode 100644 earwigbot/wiki/copyvios/result.py diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst index 7dbcf39..abddf7a 100644 --- a/docs/api/earwigbot.wiki.copyvios.rst +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -8,6 +8,13 @@ copyvios Package :members: :undoc-members: +:mod:`exclusions` Module +------------------------ + +.. automodule:: earwigbot.wiki.copyvios.exclusions + :members: + :undoc-members: + :mod:`markov` Module -------------------- @@ -24,6 +31,13 @@ copyvios Package :undoc-members: :show-inheritance: +:mod:`result` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.result + :members: + :undoc-members: + :mod:`search` Module -------------------- diff --git a/docs/toolset.rst b/docs/toolset.rst index fcdfc6d..e2258c8 100644 --- a/docs/toolset.rst +++ b/docs/toolset.rst @@ -48,9 +48,9 @@ wikis, you can usually use code like this:: This works because EarwigBot assumes that the URL for the site is ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL -connection info (if any) are stored as ``config.wiki["sql"]``. This might -change if you're dealing with non-WMF wikis, where the code might look -something more like:: +connection info (if any) is stored as ``config.wiki["sql"]``. 
This might change
+if you're dealing with non-WMF wikis, where the code might look something more
+like::
 
     project, lang = "mywiki", "it"
     try:
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index cf2ddde..0f29403 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -33,47 +33,10 @@ except ImportError:
 from earwigbot import exceptions
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult
 from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 
-__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
-
-class CopyvioCheckResult(object):
-    """
-    **EarwigBot: Wiki Toolset: Copyvio Check Result**
-
-    A class holding information about the results of a copyvio check.
-
-    *Attributes:*
-
-    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
-    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
-    - :py:attr:`url`: the URL of the violated page
-    - :py:attr:`queries`: the number of queries used to reach a result
-    - :py:attr:`article_chain`: the MarkovChain of the article text
-    - :py:attr:`source_chain`: the MarkovChain of the violated page text
-    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
-    """
-
-    def __init__(self, violation, confidence, url, queries, article, chains):
-        self.violation = violation
-        self.confidence = confidence
-        self.url = url
-        self.queries = queries
-        self.article_chain = article
-        self.source_chain = chains[0]
-        self.delta_chain = chains[1]
-
-    def __repr__(self):
-        """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
-        return res.format(self.violation, self.confidence, self.url,
-                          self.queries)
-
-    def __str__(self):
-        """Return a nice string representation of the result."""
-        res = "<CopyvioCheckResult ({0} with {1} confidence)>"
-        return res.format(self.violation, self.confidence)
-
+__all__ = ["CopyvioMixIn"]
 
 class CopyvioMixIn(object):
     """
@@ -88,6 +51,7 @@ class CopyvioMixIn(object):
 
     def __init__(self, site):
         self._search_config = site._search_config
+        self._exclusions_db = self._search_config["exclusions_db"]
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
 
@@ -156,8 +120,9 @@ class CopyvioMixIn(object):
                       interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
-        object with information on the results of the check.
+        Returns a
+        :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
+        with information on the results of the check.
 
         *max_queries* is self-explanatory; we will never make more than this
         number of queries in a given check. If it's lower than 0, we will not
         :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
""" searcher = self._select_search_engine() + self._exclusions_db.sync(self.site.name) handled_urls = [] best_confidence = 0 best_match = None @@ -193,6 +159,8 @@ class CopyvioMixIn(object): urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) + if self._exclusions_db.check(self.site.name, url): + continue conf, chains = self._copyvio_compare_content(article_chain, url) if conf > best_confidence: best_confidence = conf @@ -216,9 +184,9 @@ class CopyvioMixIn(object): This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but - without using a search engine, since the suspected "violated" URL is - supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - + but without using a search engine, since the suspected "violated" URL + is supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py new file mode 100644 index 0000000..fdbaa39 --- /dev/null +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re +import sqlite3 as sqlite +from threading import Lock +from time import time + +from earwigbot import exceptions + +__all__ = ["ExclusionsDB"] + +default_sources = { + "enwiki": [ + "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", + "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", + "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", + "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" + ] +} + +class ExclusionsDB(object): + """ + **EarwigBot: Wiki Toolset: Exclusions Database Manager** + + Controls the :file:`.exclusions.db` file, which stores URLs excluded from + copyright violation checks on account of being known mirrors, for example. 
+ """ + + def __init__(self, sitesdb, dbfile, logger): + self._sitesdb = sitesdb + self._dbfile = dbfile + self._logger = logger + self._db_access_lock = Lock() + + def _create(self): + """Initialize the exclusions database with its necessary tables.""" + script = """ + CREATE TABLE sources (source_sitename, source_page); + CREATE TABLE updates (update_sitename, update_time); + CREATE TABLE exclusions (exclusion_sitename, exclusion_url); + """ + query = "INSERT INTO sources VALUES (?, ?);" + sources = [] + for sitename, pages in default_sources.iteritems(): + [sources.append((sitename, page)) for page in pages] + + with sqlite.connect(self._dbfile) as conn: + conn.executescript(script) + conn.executemany(query, sources) + + def _load_source(self, site, source): + """Load from a specific source and return a set of URLs.""" + urls = set() + try: + data = site.get_page(source).get() + except exceptions.PageNotFoundError: + return urls + + regexes = [ + "url\s*=\s*(?:https?:)?(?://)?(.*)", + "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" + ] + for regex in regexes: + [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] + return urls + + def _update(self, sitename): + """Update the database from listed sources in the index.""" + query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" + query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" + query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" + query4 = "INSERT INTO exclusions VALUES (?, ?);" + query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" + query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" + query7 = "INSERT INTO updates VALUES (?, ?);" + + site = self._sitesdb.get_site(sitename) + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + urls = set() + for (source,) in conn.execute(query1, (sitename,)): + urls |= self._load_source(site, source) + for (url,) in conn.execute(query2, (sitename,)): + if url in urls: + urls.remove(url) + else: + conn.execute(query3, (sitename, url)) + conn.executemany(query4, [(sitename, url) for url in urls]) + if conn.execute(query5, (name,)).fetchone(): + conn.execute(query6, (time(), sitename)) + else: + conn.execute(query7, (sitename, time())) + + def _get_last_update(self, sitename): + """Return the UNIX timestamp of the last time the db was updated.""" + query = "SELECT update_time FROM updates WHERE update_sitename = ?;" + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + try: + result = conn.execute(query, (sitename,)).fetchone() + except sqlite.OperationalError: + self._create() + return 0 + return result[0] if result else 0 + + def sync(self, sitename): + """Update the database if it hasn't been updated in the past month. + + This only updates the exclusions database for the *sitename* site. + """ + max_staleness = 60 * 60 * 24 * 30 + time_since_update = int(time() - self._get_last_update()) + if time_since_update > max_staleness: + log = "Updating stale database: {0} (last updated {1} seconds ago)" + self._logger.info(log.format(sitename, time_since_update)) + self._update(sitename) + else: + log = "Database for {0} is still fresh (last updated {1} seconds ago)" + self._logger.debug(log.format(sitename, time_since_update)) + + def check(self, sitename, url): + """Check whether a given URL is in the exclusions database. + + Return ``True`` if the URL is in the database, or ``False`` otherwise. 
+ """ + normalized = re.sub("https?://", "", url.lower()) + query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + for row in conn.execute(query, (sitename,)): + if normalized.startswith(row[0]): + log = "Exclusion detected in {0} for {1}" + self._logger.debug(log.format(sitename, url)) + return True + + log = "No exclusions in {0} for {1}".format(sitename, url) + self._logger.debug(log) + return False diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py new file mode 100644 index 0000000..0c3e98f --- /dev/null +++ b/earwigbot/wiki/copyvios/result.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +__all__ = ["CopyvioCheckResult"] + +class CopyvioCheckResult(object): + """ + **EarwigBot: Wiki Toolset: Copyvio Check Result** + + A class holding information about the results of a copyvio check. 
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
+    def __init__(self, violation, confidence, url, queries, article, chains):
+        self.violation = violation
+        self.confidence = confidence
+        self.url = url
+        self.queries = queries
+        self.article_chain = article
+        self.source_chain = chains[0]
+        self.delta_chain = chains[1]
+
+    def __repr__(self):
+        """Return the canonical string representation of the result."""
+        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return res.format(self.violation, self.confidence, self.url,
+                          self.queries)
+
+    def __str__(self):
+        """Return a nice string representation of the result."""
+        res = "<CopyvioCheckResult ({0} with {1} confidence)>"
+        return res.format(self.violation, self.confidence)
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index cdff1fe..9d2c828 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -29,6 +29,7 @@ import sqlite3 as sqlite
 
 from earwigbot import __version__
 from earwigbot.exceptions import SiteNotFoundError
+from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
 from earwigbot.wiki.site import Site
 
 __all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
         """Set up the manager with an attribute for the base Bot object."""
         self.config = bot.config
         self._logger = bot.logger.getChild("wiki")
+
         self._sites = {}  # Internal site cache
         self._sitesdb = path.join(bot.config.root_dir, "sites.db")
         self._cookie_file = path.join(bot.config.root_dir, ".cookies")
         self._cookiejar = None
+        excl_db = path.join(bot.config.root_dir, "exclusions.db")
+        excl_logger = self._logger.getChild("exclusionsdb")
+        self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)
+
     def __repr__(self):
         """Return the canonical string representation of the SitesDB."""
         res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -195,6 +201,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})
@@ -379,6 +386,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})

From 1c2dcc999a7ce8593630931f5a40fe5a317daff6 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 8 Jul 2012 14:53:21 -0400
Subject: [PATCH 16/19] __repr__ and __str__ for ExclusionsDB (#5).
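These follow the convention used throughout the toolset: __repr__ returns a
canonical, constructor-style string, while __str__ returns a short
angle-bracketed summary suitable for log messages. Given a SitesDB instance
and a logger, usage looks roughly like this (the path is a made-up example,
and the nested reprs are abbreviated):

    >>> db = ExclusionsDB(sitesdb, "/home/earwigbot/exclusions.db", logger)
    >>> repr(db)
    "ExclusionsDB(sitesdb=SitesDB(...), dbfile='/home/earwigbot/exclusions.db', logger=...)"
    >>> str(db)
    '<ExclusionsDB at /home/earwigbot/exclusions.db>'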
---
 earwigbot/wiki/copyvios/exclusions.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index fdbaa39..7eb6a80 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -52,6 +52,15 @@ class ExclusionsDB(object):
         self._logger = logger
         self._db_access_lock = Lock()
 
+    def __repr__(self):
+        """Return the canonical string representation of the ExclusionsDB."""
+        res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
+        return res.format(self._sitesdb, self._dbfile, self._logger)
+
+    def __str__(self):
+        """Return a nice string representation of the ExclusionsDB."""
+        return "<ExclusionsDB at {0}>".format(self._dbfile)
+
     def _create(self):
         """Initialize the exclusions database with its necessary tables."""
         script = """

From d07f0b5f9af88dd532815e8156bcf56955830af2 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 8 Jul 2012 15:04:44 -0400
Subject: [PATCH 17/19] Add loggers to Category, Page, and User.

---
 earwigbot/wiki/page.py | 12 +++++++++++-
 earwigbot/wiki/site.py |  9 +++++----
 earwigbot/wiki/user.py | 10 +++++++++-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index 3125b33..92bb5b7 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 from hashlib import md5
+from logging import getLogger, NullHandler
 import re
 from time import gmtime, strftime
 from urllib import quote
@@ -82,7 +83,8 @@ class Page(CopyvioMixIn):
     PAGE_MISSING = 2
     PAGE_EXISTS = 3
 
-    def __init__(self, site, title, follow_redirects=False, pageid=None):
+    def __init__(self, site, title, follow_redirects=False, pageid=None,
+                 logger=None):
         """Constructor for new Page instances.
 
         Takes four arguments: a Site object, the Page's title (or pagename),
@@ -101,6 +103,14 @@ class Page(CopyvioMixIn):
         self._follow_redirects = self._keep_following = follow_redirects
         self._pageid = pageid
 
+        # Set up our internal logger:
+        if logger:
+            self._logger = logger
+        else:  # Just set up a null logger to eat up our messages:
+            self._logger = getLogger("earwigbot.wiki")
+            self._logger.addHandler(NullHandler())
+
+        # Attributes to be loaded through the API:
         self._exists = self.PAGE_UNKNOWN
         self._is_redirect = None
         self._lastrevid = None
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 8261703..bd6c95b 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -789,8 +789,9 @@ class Site(object):
         prefix = title.split(":", 1)[0]
         if prefix != title:  # Avoid a page that is simply "Category"
             if prefix in prefixes:
-                return Category(self, title, follow_redirects, pageid)
-        return Page(self, title, follow_redirects, pageid)
+                return Category(self, title, follow_redirects, pageid,
+                                self._logger)
+        return Page(self, title, follow_redirects, pageid, self._logger)
 
     def get_category(self, catname, follow_redirects=False, pageid=None):
         """Return a :py:class:`Category` object for the given category name.
@@ -802,7 +803,7 @@ class Site(object):
         catname = self._unicodeify(catname)
         prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
         pagename = u':'.join((prefix, catname))
-        return Category(self, pagename, follow_redirects, pageid)
+        return Category(self, pagename, follow_redirects, pageid, self._logger)
 
     def get_user(self, username=None):
         """Return a :py:class:`User` object for the given username.
@@ -815,7 +816,7 @@ class Site(object): username = self._unicodeify(username) else: username = self._get_username() - return User(self, username) + return User(self, username, self._logger) def delegate(self, services, args=None, kwargs=None): """Delegate a task to either the API or SQL depending on conditions. diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py index b71b502..92da1e6 100644 --- a/earwigbot/wiki/user.py +++ b/earwigbot/wiki/user.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from logging import getLogger, NullHandler from time import gmtime, strptime from earwigbot.exceptions import UserNotFoundError @@ -60,7 +61,7 @@ class User(object): talkpage """ - def __init__(self, site, name): + def __init__(self, site, name, logger=None): """Constructor for new User instances. Takes two arguments, a Site object (necessary for doing API queries), @@ -76,6 +77,13 @@ class User(object): self._site = site self._name = name + # Set up our internal logger: + if logger: + self._logger = logger + else: # Just set up a null logger to eat up our messages: + self._logger = getLogger("earwigbot.wiki") + self._logger.addHandler(NullHandler()) + def __repr__(self): """Return the canonical string representation of the User.""" return "User(name={0!r}, site={1!r})".format(self._name, self._site) From 439b8552540f8253820a3ce5ffd4a47026dd79ce Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 15:28:58 -0400 Subject: [PATCH 18/19] Fully implement logging; fix non-unicode log messages. --- earwigbot/tasks/afc_copyvios.py | 14 +++++++------- earwigbot/wiki/copyvios/__init__.py | 27 +++++++++++++++++++++------ earwigbot/wiki/copyvios/exclusions.py | 8 ++++---- earwigbot/wiki/copyvios/search.py | 2 ++ 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index afcb7f9..12c6b80 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -70,17 +70,17 @@ class AFCCopyvios(Task): """Detect copyvios in 'page' and add a note if any are found.""" title = page.title if title in self.ignore_list: - msg = "Skipping page in ignore list: [[{0}]]" + msg = u"Skipping page in ignore list: [[{0}]]" self.logger.info(msg.format(title)) return pageid = page.pageid if self.has_been_processed(pageid): - msg = "Skipping check on already processed page [[{0}]]" + msg = u"Skipping check on already processed page [[{0}]]" self.logger.info(msg.format(title)) return - self.logger.info("Checking [[{0}]]".format(title)) + self.logger.info(u"Checking [[{0}]]".format(title)) result = page.copyvio_check(self.min_confidence, self.max_queries) url = result.url confidence = "{0}%".format(round(result.confidence * 100, 2)) @@ -94,11 +94,11 @@ class AFCCopyvios(Task): page.edit(newtext, self.summary.format(url=url)) else: page.edit(newtext, self.summary) - msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" - self.logger.warn(msg.format(title, url, confidence)) + msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" + self.logger.info(msg.format(title, url, confidence)) else: - msg = "No violations detected (best: {1} at {2} confidence)" - self.logger.debug(msg.format(url, confidence)) + msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" + self.logger.info(msg.format(title, url, confidence)) self.log_processed(pageid) if self.cache_results: diff --git a/earwigbot/wiki/copyvios/__init__.py 
b/earwigbot/wiki/copyvios/__init__.py index 0f29403..e89a322 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -155,7 +155,10 @@ class CopyvioMixIn(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = searcher.search(chunks.pop(0)) + chunk = chunks.pop(0) + log = u"[[{0}]] -> querying {1} for {2!r}" + self._logger.debug(log.format(self.title, searcher.name, chunk)) + urls = searcher.search(chunk) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) @@ -172,12 +175,19 @@ class CopyvioMixIn(object): sleep(interquery_sleep - diff) last_query = time() - if best_confidence >= min_confidence: # violation? - v = True + if best_confidence >= min_confidence: + is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" + self._logger.debug(log.format(self.title, best_confidence, + best_match, num_queries)) else: - v = False - return CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" + self._logger.debug(log.format(self.title, best_confidence, + num_queries)) + + return CopyvioCheckResult(is_violation, best_confidence, best_match, + num_queries, article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5): """Check the page like :py:meth:`copyvio_check` against a specific URL. @@ -208,7 +218,12 @@ class CopyvioMixIn(object): if confidence >= min_confidence: is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) else: is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) + return CopyvioCheckResult(is_violation, confidence, url, 0, article_chain, chains) diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 7eb6a80..4640b1f 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -138,11 +138,11 @@ class ExclusionsDB(object): max_staleness = 60 * 60 * 24 * 30 time_since_update = int(time() - self._get_last_update()) if time_since_update > max_staleness: - log = "Updating stale database: {0} (last updated {1} seconds ago)" + log = u"Updating stale database: {0} (last updated {1} seconds ago)" self._logger.info(log.format(sitename, time_since_update)) self._update(sitename) else: - log = "Database for {0} is still fresh (last updated {1} seconds ago)" + log = u"Database for {0} is still fresh (last updated {1} seconds ago)" self._logger.debug(log.format(sitename, time_since_update)) def check(self, sitename, url): @@ -155,10 +155,10 @@ class ExclusionsDB(object): with sqlite.connect(self._dbfile) as conn, self._db_access_lock: for row in conn.execute(query, (sitename,)): if normalized.startswith(row[0]): - log = "Exclusion detected in {0} for {1}" + log = u"Exclusion detected in {0} for {1}" self._logger.debug(log.format(sitename, url)) return True - log = "No exclusions in {0} for {1}".format(sitename, url) + log = u"No exclusions in {0} for {1}".format(sitename, url) self._logger.debug(log) return False diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index cf2edb4..0ccd62e 100644 --- a/earwigbot/wiki/copyvios/search.py +++ 
b/earwigbot/wiki/copyvios/search.py @@ -34,6 +34,7 @@ __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): """Base class for a simple search engine interface.""" + name = "Base" def __init__(self, cred): """Store credentials *cred* for searching later on.""" @@ -57,6 +58,7 @@ class BaseSearchEngine(object): class YahooBOSSSearchEngine(BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" + name = "Yahoo! BOSS" def search(self, query): """Do a Yahoo! BOSS web search for *query*. From becd135c5242b1a093bdcf88026c7c7328d4e7d7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 16:09:00 -0400 Subject: [PATCH 19/19] Minor cleanup for afc_copyvios, mainly Unicode fixes. --- earwigbot/tasks/afc_copyvios.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index 12c6b80..3dc3902 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -23,6 +23,7 @@ from hashlib import sha256 from os.path import expanduser from threading import Lock +from urllib import quote import oursql @@ -86,9 +87,10 @@ class AFCCopyvios(Task): confidence = "{0}%".format(round(result.confidence * 100, 2)) if result.violation: + safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8") content = page.get() - template = "\{\{{0}|url={1}|confidence={2}\}\}\n" - template = template.format(self.template, url, confidence) + template = u"\{\{{0}|url={1}|confidence={2}\}\}\n" + template = template.format(self.template, safeurl, confidence) newtext = template + content if "{url}" in self.summary: page.edit(newtext, self.summary.format(url=url)) @@ -110,9 +112,7 @@ class AFCCopyvios(Task): with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) results = cursor.fetchall() - if results: - return True - return False + return True if results else False def log_processed(self, pageid): """Adds pageid to our database of processed pages. @@ -138,8 +138,8 @@ class AFCCopyvios(Task): be) retained for one day; this task does not remove old entries (that is handled by the Toolserver component). - This will only be called if "cache_results" == True in the task's - config, which is False by default. + This will only be called if ``cache_results == True`` in the task's + config, which is ``False`` by default. """ pageid = page.pageid hash = sha256(page.get()).hexdigest()
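A closing note on the safeurl line in the hunk above: under Python 2,
urllib.quote() raises a KeyError when handed a unicode string containing
non-ASCII characters, so the URL must be encoded to UTF-8 bytes before
percent-escaping and then decoded back to unicode afterwards. A minimal
demonstration (the URL itself is a made-up example):

    # -*- coding: utf-8 -*-
    from urllib import quote

    url = u"http://example.com/caf\xe9"  # hypothetical URL with non-ASCII text
    # quote(url, safe="/:") would raise KeyError: u'\xe9' here
    safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8")
    print safeurl  # http://example.com/caf%C3%A9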