A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

234 wiersze
9.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from gzip import GzipFile
  23. from StringIO import StringIO
  24. from time import sleep, time
  25. from urllib2 import build_opener, URLError
  26. try:
  27. import oauth2 as oauth
  28. except ImportError:
  29. oauth = None
  30. from earwigbot import exceptions
  31. from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
  32. from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
  33. from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
# Public API of this module: the result container and the page mixin.
__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
  35. class CopyvioCheckResult(object):
  36. def __init__(self, violation, confidence, url, queries, article, chains):
  37. self.violation = violation
  38. self.confidence = confidence
  39. self.url = url
  40. self.queries = queries
  41. self.article_chain = article
  42. self.source_chain = chains[0]
  43. self.delta_chain = chains[1]
  44. def __repr__(self):
  45. """Return the canonical string representation of the result."""
  46. res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
  47. return res.format(self.violation, self.confidence, self.url,
  48. self.queries)
  49. def __str__(self):
  50. """Return a nice string representation of the result."""
  51. res = "<CopyvioCheckResult ({0} with {1} conf)>"
  52. return res.format(self.violation, self.confidence)
  53. class CopyvioMixIn(object):
  54. """
  55. EarwigBot's Wiki Toolset: Copyright Violation Mixin
  56. This is a mixin that provides two public methods, copyvio_check() and
  57. copyvio_compare(). The former checks the page for copyright violations
  58. using a search engine API, and the latter compares the page against a
  59. specified URL. Credentials for the search engine API are stored in the
  60. site's config.
  61. """
  62. def __init__(self, site):
  63. self._opener = build_opener()
  64. self._opener.addheaders = site._opener.addheaders
  65. def _open_url_ignoring_errors(self, url):
  66. """Open a URL using self._opener and return its content, or None.
  67. Will decompress the content if the headers contain "gzip" as its
  68. content encoding, and will return None if URLError is raised while
  69. opening the URL. IOErrors while gunzipping a compressed response are
  70. ignored, and the original content is returned.
  71. """
  72. try:
  73. response = self._opener.open(url)
  74. except URLError:
  75. return None
  76. result = response.read()
  77. if response.headers.get("Content-Encoding") == "gzip":
  78. stream = StringIO(result)
  79. gzipper = GzipFile(fileobj=stream)
  80. try:
  81. result = gzipper.read()
  82. except IOError:
  83. pass
  84. return result
  85. def _select_search_engine(self):
  86. """Return a function that can be called to do web searches.
  87. The "function" is a functools.partial object that takes one argument, a
  88. query, and returns a list of URLs, ranked by importance. The underlying
  89. logic depends on the 'engine' argument; for example, if 'engine' is
  90. "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.
  91. Raises UnknownSearchEngineError if the 'engine' listed in our config is
  92. unknown to us, and UnsupportedSearchEngineError if we are missing a
  93. required package or module, like oauth2 for "Yahoo! BOSS".
  94. """
  95. engine, credentials = self._site._search_config
  96. if engine == "Yahoo! BOSS":
  97. if not oauth:
  98. e = "The package 'oauth2' could not be imported"
  99. raise exceptions.UnsupportedSearchEngineError(e)
  100. return YahooBOSSSearchEngine(credentials)
  101. raise exceptions.UnknownSearchEngineError(engine)
  102. def _copyvio_compare_content(self, article, url):
  103. """Return a number comparing an article and a URL.
  104. The *article* is a Markov chain, whereas the URL is a string that we
  105. will try to open ourselves.
  106. """
  107. html = self._open_url_ignoring_errors(url)
  108. if not html:
  109. return 0
  110. source = MarkovChain(HTMLTextParser(html).strip())
  111. delta = MarkovChainIntersection(article, source)
  112. return float(delta.size()) / article.size(), (source, delta)
  113. def copyvio_check(self, min_confidence=0.5, max_queries=-1,
  114. interquery_sleep=1, force=False):
  115. """Check the page for copyright violations.
  116. Returns a _CopyvioCheckResult object with four useful attributes:
  117. "violation", "confidence", "url", and "queries". "confidence" is a
  118. number between 0 and 1; if it is less than "min_confidence", we could
  119. not find any indication of a violation (so "violation" will be False
  120. and "url" may or may not be None), otherwise it indicates the relative
  121. faith in our results, "violation" will be True, and "url" will be the
  122. place the article is suspected of being copied from. "queries" is the
  123. number of queries used to determine the results.
  124. "max_queries" is self-explanatory; we will never make more than this
  125. number of queries in a given check. If it's less than 0, we will not
  126. limit our number of queries.
  127. "interquery_sleep" is the minimum amount of time we will sleep between
  128. search engine queries, in seconds.
  129. "force" is simply passed to page.get() - it has the same behavior there
  130. as it does here.
  131. Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
  132. SearchQueryError, ...) on errors.
  133. """
  134. searcher = self._select_search_engine()
  135. handled_urls = []
  136. best_confidence = 0
  137. best_match = None
  138. num_queries = 0
  139. empty = MarkovChain("")
  140. best_chains = (empty, MarkovChainIntersection(empty, empty))
  141. content = self.get(force)
  142. clean = ArticleTextParser(content).strip()
  143. chunks = ArticleTextParser(clean).chunk(max_queries)
  144. article_chain = MarkovChain(clean)
  145. last_query = time()
  146. if article_chain.size() < 20: # Auto-fail very small articles
  147. return CopyvioCheckResult(False, best_confidence, best_match,
  148. num_queries, article_chain, best_chains)
  149. while (chunks and best_confidence < min_confidence and
  150. (max_queries < 0 or num_queries < max_queries)):
  151. urls = searcher.search(chunks.pop(0))
  152. urls = [url for url in urls if url not in handled_urls]
  153. for url in urls:
  154. handled_urls.append(url)
  155. conf, chains = self._copyvio_compare_content(article_chain, url)
  156. if conf > best_confidence:
  157. best_confidence = conf
  158. best_match = url
  159. best_chains = chains
  160. num_queries += 1
  161. diff = time() - last_query
  162. if diff < interquery_sleep:
  163. sleep(interquery_sleep - diff)
  164. last_query = time()
  165. if best_confidence >= min_confidence: # violation?
  166. v = True
  167. else:
  168. v = False
  169. return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
  170. article_chain, best_chains)
  171. def copyvio_compare(self, url, min_confidence=0.5, force=False):
  172. """Check the page like copyvio_check(), but against a specific URL.
  173. This is essentially a reduced version of the above - a copyivo
  174. comparison is made using Markov chains and the result is returned in a
  175. _CopyvioCheckResult object - without using a search engine, as the
  176. suspected "violated" URL is supplied from the start.
  177. Its primary use is to generate a result when the URL is retrieved from
  178. a cache, like the one used in EarwigBot's Toolserver site. After a
  179. search is done, the resulting URL is stored in a cache for 24 hours so
  180. future checks against that page will not require another set of
  181. time-and-money-consuming search engine queries. However, the comparison
  182. itself (which includes the article's and the source's content) cannot
  183. be stored for data retention reasons, so a fresh comparison is made
  184. using this function.
  185. Since no searching is done, neither UnknownSearchEngineError nor
  186. SearchQueryError will be raised.
  187. """
  188. content = self.get(force)
  189. clean = ArticleTextParser(content).strip()
  190. article_chain = MarkovChain(clean)
  191. confidence, chains = self._copyvio_compare_content(article_chain, url)
  192. if confidence >= min_confidence:
  193. is_violation = True
  194. else:
  195. is_violation = False
  196. return CopyvioCheckResult(is_violation, confidence, url, 0,
  197. article_chain, chains)