A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
Cannot add more than 25 topics. A topic must start with a letter or number, can include hyphens ('-'), and can be up to 35 characters long.

252 lines
11 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from gzip import GzipFile
  23. from socket import timeout
  24. from StringIO import StringIO
  25. from time import sleep, time
  26. from urllib2 import build_opener, URLError
  27. from earwigbot import exceptions, importer
  28. from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
  29. from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
  30. from earwigbot.wiki.copyvios.result import CopyvioCheckResult
  31. from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
# oauth2 is only needed by the Yahoo! BOSS search engine, so load it lazily;
# _select_search_engine force-loads it and converts ImportError into
# UnsupportedSearchEngineError when the engine is actually requested.
oauth = importer.new("oauth2")

__all__ = ["CopyvioMixIn"]
class CopyvioMixIn(object):
    """
    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**

    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
    and :py:meth:`copyvio_compare`. The former checks the page for copyright
    violations using a search engine API, and the latter compares the page
    against a given URL. Credentials for the search engine API are stored in
    the :py:class:`~earwigbot.wiki.site.Site`'s config.
    """

    def __init__(self, site):
        # Search-engine configuration dict shared with the Site object;
        # also carries the optional exclusions database used by copyvio_check.
        self._search_config = site._search_config
        self._exclusions_db = self._search_config.get("exclusions_db")
        # Use a private opener, but copy the Site's request headers so our
        # fetches identify themselves the same way as normal API requests.
        self._opener = build_opener()
        self._opener.addheaders = site._opener.addheaders
  48. def _open_url_ignoring_errors(self, url):
  49. """Open a URL using self._opener and return its content, or None.
  50. Will decompress the content if the headers contain "gzip" as its
  51. content encoding, and will return None if URLError is raised while
  52. opening the URL. IOErrors while gunzipping a compressed response are
  53. ignored, and the original content is returned.
  54. """
  55. try:
  56. response = self._opener.open(url.encode("utf8"), timeout=5)
  57. result = response.read()
  58. except (URLError, timeout):
  59. return None
  60. if response.headers.get("Content-Encoding") == "gzip":
  61. stream = StringIO(result)
  62. gzipper = GzipFile(fileobj=stream)
  63. try:
  64. result = gzipper.read()
  65. except IOError:
  66. pass
  67. return result
  68. def _select_search_engine(self):
  69. """Return a function that can be called to do web searches.
  70. The function takes one argument, a search query, and returns a list of
  71. URLs, ranked by importance. The underlying logic depends on the
  72. *engine* argument within our config; for example, if *engine* is
  73. "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.
  74. Raises UnknownSearchEngineError if the 'engine' listed in our config is
  75. unknown to us, and UnsupportedSearchEngineError if we are missing a
  76. required package or module, like oauth2 for "Yahoo! BOSS".
  77. """
  78. engine = self._search_config["engine"]
  79. credentials = self._search_config["credentials"]
  80. if engine == "Yahoo! BOSS":
  81. try:
  82. oauth.__version__ # Force-load the lazy module
  83. except ImportError:
  84. e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
  85. raise exceptions.UnsupportedSearchEngineError(e)
  86. return YahooBOSSSearchEngine(credentials, self._opener)
  87. raise exceptions.UnknownSearchEngineError(engine)
  88. def _copyvio_compare_content(self, article, url):
  89. """Return a number comparing an article and a URL.
  90. The *article* is a Markov chain, whereas the *url* is just a string
  91. that we'll try to open and read ourselves.
  92. """
  93. html = self._open_url_ignoring_errors(url)
  94. if not html:
  95. return 0, ()
  96. source = MarkovChain(HTMLTextParser(html).strip())
  97. delta = MarkovChainIntersection(article, source)
  98. return float(delta.size()) / article.size(), (source, delta)
  99. def copyvio_check(self, min_confidence=0.5, max_queries=-1, max_time=-1,
  100. interquery_sleep=1):
  101. """Check the page for copyright violations.
  102. Returns a
  103. :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
  104. with information on the results of the check.
  105. *max_queries* is self-explanatory; we will never make more than this
  106. number of queries in a given check. If it's lower than 0, we will not
  107. limit the number of queries.
  108. *max_time* can be set to prevent copyvio checks from taking longer than
  109. a set amount of time (generally around a minute), which can be useful
  110. if checks are called through a web server with timeouts. We will stop
  111. checking new URLs as soon as this limit is reached.
  112. *interquery_sleep* is the minimum amount of time we will sleep between
  113. search engine queries, in seconds.
  114. Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
  115. (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
  116. :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
  117. """
  118. start_time = time()
  119. searcher = self._select_search_engine()
  120. if self._exclusions_db:
  121. self._exclusions_db.sync(self.site.name)
  122. handled_urls = []
  123. best_confidence = 0
  124. best_match = None
  125. num_queries = 0
  126. empty = MarkovChain("")
  127. best_chains = (empty, MarkovChainIntersection(empty, empty))
  128. parser = ArticleTextParser(self.get())
  129. clean = parser.strip()
  130. chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
  131. article_chain = MarkovChain(clean)
  132. last_query = time()
  133. if article_chain.size() < 20: # Auto-fail very small articles
  134. return CopyvioCheckResult(False, best_confidence, best_match,
  135. num_queries, 0, article_chain,
  136. best_chains)
  137. while (chunks and best_confidence < min_confidence and
  138. (max_queries < 0 or num_queries < max_queries)):
  139. chunk = chunks.pop(0)
  140. log = u"[[{0}]] -> querying {1} for {2!r}"
  141. self._logger.debug(log.format(self.title, searcher.name, chunk))
  142. urls = searcher.search(chunk)
  143. urls = [url for url in urls if url not in handled_urls]
  144. for url in urls:
  145. handled_urls.append(url)
  146. if self._exclusions_db:
  147. if self._exclusions_db.check(self.site.name, url):
  148. continue
  149. conf, chns = self._copyvio_compare_content(article_chain, url)
  150. if conf > best_confidence:
  151. best_confidence = conf
  152. best_match = url
  153. best_chains = chns
  154. if time() - start_time > max_time:
  155. break
  156. num_queries += 1
  157. if time() - start_time > max_time:
  158. break
  159. diff = time() - last_query
  160. if diff < interquery_sleep:
  161. sleep(interquery_sleep - diff)
  162. last_query = time()
  163. ctime = time() - start_time
  164. if best_confidence >= min_confidence:
  165. is_violation = True
  166. log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries in {4} seconds)"
  167. self._logger.debug(log.format(self.title, best_confidence,
  168. best_match, num_queries, ctime))
  169. else:
  170. is_violation = False
  171. log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries in {3} seconds)"
  172. self._logger.debug(log.format(self.title, best_confidence,
  173. num_queries, ctime))
  174. return CopyvioCheckResult(is_violation, best_confidence, best_match,
  175. num_queries, ctime, article_chain,
  176. best_chains)
  177. def copyvio_compare(self, url, min_confidence=0.5):
  178. """Check the page like :py:meth:`copyvio_check` against a specific URL.
  179. This is essentially a reduced version of the above - a copyivo
  180. comparison is made using Markov chains and the result is returned in a
  181. :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object -
  182. but without using a search engine, since the suspected "violated" URL
  183. is supplied from the start.
  184. Its primary use is to generate a result when the URL is retrieved from
  185. a cache, like the one used in EarwigBot's Toolserver site. After a
  186. search is done, the resulting URL is stored in a cache for 24 hours so
  187. future checks against that page will not require another set of
  188. time-and-money-consuming search engine queries. However, the comparison
  189. itself (which includes the article's and the source's content) cannot
  190. be stored for data retention reasons, so a fresh comparison is made
  191. using this function.
  192. Since no searching is done, neither
  193. :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
  194. :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
  195. """
  196. start_time = time()
  197. content = self.get()
  198. clean = ArticleTextParser(content).strip()
  199. article_chain = MarkovChain(clean)
  200. if not url:
  201. empty = MarkovChain("")
  202. chns = (empty, MarkovChainIntersection(empty, empty))
  203. return CopyvioCheckResult(False, 0, url, 0, 0, article_chain, chns)
  204. confidence, chains = self._copyvio_compare_content(article_chain, url)
  205. ctime = time() - start_time
  206. if confidence >= min_confidence:
  207. is_violation = True
  208. log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; {3} seconds)"
  209. self._logger.debug(log.format(self.title, confidence, url, ctime))
  210. else:
  211. is_violation = False
  212. log = u"No violation for [[{0}]] (confidence: {1}; URL: {2}; {3} seconds)"
  213. self._logger.debug(log.format(self.title, confidence, url, ctime))
  214. return CopyvioCheckResult(is_violation, confidence, url, 0, ctime,
  215. article_chain, chains)