A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

209 lines
8.1 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from functools import partial
  23. from gzip import GzipFile
  24. from json import loads
  25. from StringIO import StringIO
  26. from time import sleep, time
  27. from urllib import quote_plus, urlencode
  28. from urllib2 import build_opener, URLError
  29. try:
  30. import oauth2 as oauth
  31. except ImportError:
  32. oauth = None
  33. from earwigbot.wiki.exceptions import *
  34. class CopyvioCheckResult(object):
  35. def __init__(self, violation, confidence, url, queries):
  36. self.violation = violation
  37. self.confidence = confidence
  38. self.url = url
  39. self.queries = queries
  40. def __repr__(self):
  41. r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
  42. return r.format(self.violation, self.confidence, self.url, self.queries)
  43. class CopyrightMixin(object):
  44. """
  45. EarwigBot's Wiki Toolset: Copyright Violation Mixin
  46. This is a mixin that provides one public method, copyvio_check(), which
  47. checks the page for copyright violations using a search engine API. The
  48. API keys must be provided to the method as arguments.
  49. """
  50. def __init__(self):
  51. self._opener = build_opener()
  52. self._opener.addheaders = self._site._opener.addheaders
  53. def _open_url_ignoring_errors(self, url):
  54. """Open a URL using self._opener and return its content, or None.
  55. Will decompress the content if the headers contain "gzip" as its
  56. content encoding, and will return None if URLError is raised while
  57. opening the URL. IOErrors while gunzipping a compressed response are
  58. ignored, and the original content is returned.
  59. """
  60. try:
  61. response = self._opener.open(url)
  62. except URLError:
  63. return None
  64. result = response.read()
  65. if response.headers.get("Content-Encoding") == "gzip":
  66. stream = StringIO(result)
  67. gzipper = GzipFile(fileobj=stream)
  68. try:
  69. result = gzipper.read()
  70. except IOError:
  71. pass
  72. return result
  73. def _select_search_engine(self, engine, credentials):
  74. """Return a function that can be called to do web searches.
  75. The "function" is a functools.partial object that takes one argument, a
  76. query, and returns a list of URLs, ranked by importance. The underlying
  77. logic depends on the 'engine' argument; for example, if 'engine' is
  78. "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
  79. Raises UnknownSearchEngineError if 'engine' is not known to us, and
  80. UnsupportedSearchEngineError if we are missing a required package or
  81. module, like oauth2 for "Yahoo! BOSS".
  82. """
  83. if engine == "Yahoo! BOSS":
  84. if not oauth:
  85. e = "The package 'oauth2' could not be imported"
  86. raise UnsupportedSearchEngineError(e)
  87. searcher = self._yahoo_boss_query
  88. else:
  89. raise UnknownSearchEngineError(engine)
  90. return partial(searcher, credentials)
  91. def _yahoo_boss_query(self, cred, query):
  92. """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.
  93. Returns a list of URLs, no more than fifty, ranked by relevance (as
  94. determined by Yahoo). Raises SearchQueryError() on errors.
  95. """
  96. base_url = "http://yboss.yahooapis.com/ysearch/web"
  97. query = quote_plus(query.join('"', '"'))
  98. params = {"q": query, "style": "raw", "format": "json"}
  99. url = "{0}?{1}".format(base_url, urlencode(params))
  100. consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
  101. client = oauth.Client(consumer)
  102. headers, body = client.request(url, "GET")
  103. if headers["status"] != "200":
  104. e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
  105. raise SearchQueryError(e.format(headers["status"], body))
  106. try:
  107. res = loads(body)
  108. except ValueError:
  109. e = "Yahoo! BOSS Error: JSON could not be decoded"
  110. raise SearchQueryError(e)
  111. try:
  112. results = res["bossresponse"]["web"]["results"]
  113. except KeyError:
  114. return []
  115. return [result["url"] for result in results]
  116. def _copyvio_strip_content(self, content):
  117. return content
  118. def _copyvio_chunk_content(self, content):
  119. return [content]
  120. def _copyvio_compare_content(self, content, url):
  121. html = self._open_url_ignoring_errors(url)
  122. if not html:
  123. return 0
  124. confidence = 0
  125. return confidence
  126. def copyvio_check(self, engine, credentials, min_confidence=0.75,
  127. max_queries=-1, interquery_sleep=1, force=False):
  128. """Check the page for copyright violations.
  129. Returns a CopyvioCheckResult object, with four useful attributes:
  130. "violation", "confidence", "url", and "queries". "confidence" is a
  131. number between 0 and 1; if it is less than "min_confidence", we could
  132. not find any indication of a violation (so "violation" will be False
  133. and "url" may or may not be None), otherwise it indicates the relative
  134. faith in our results, "violation" will be True, and "url" will be the
  135. place the article is suspected of being copied from. "queries" is the
  136. number of queries used to determine the results.
  137. "max_queries" is self-explanatory; we will never make more than this
  138. number of queries in a given check. If it's less than 0, we will not
  139. limit our number of queries.
  140. "interquery_sleep" is the minimum amount of time we will sleep between
  141. search engine queries, in seconds.
  142. "force" is simply passed to page.get() - it has the same behavior there
  143. as it does here.
  144. Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
  145. SearchQueryError, ...) on errors.
  146. """
  147. search = self._select_search_engine(engine, credentials)
  148. handled_urls = []
  149. best_confidence = 0
  150. best_match = None
  151. num_queries = 0
  152. content = self.get(force)
  153. clean = self._copyvio_strip_content(content)
  154. chunks = self._copyvio_chunk_content(clean)
  155. last_query = time()
  156. while (chunks and best_confidence < min_confidence and
  157. (max_queries < 0 or num_queries < max_queries)):
  158. urls = search(chunks.pop(0))
  159. urls = [url for url in urls if url not in handled_urls]
  160. for url in urls:
  161. confidence = self._copyvio_compare_content(content, url)
  162. if confidence > best_confidence:
  163. best_confidence = confidence
  164. best_match = url
  165. num_queries += 1
  166. diff = time() - last_query
  167. if diff < interquery_sleep:
  168. sleep(interquery_sleep - diff)
  169. last_query = time()
  170. if best_confidence >= min_confidence: # violation?
  171. vi = True
  172. else:
  173. vi = False
  174. return CopyvioCheckResult(vi, best_confidence, best_match, num_queries)