A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot

# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from functools import partial
from gzip import GzipFile
from json import loads
from re import sub, UNICODE
from StringIO import StringIO
from time import sleep, time
from urllib import quote_plus, urlencode
from urllib2 import build_opener, URLError

try:
    import oauth2 as oauth
except ImportError:
    oauth = None

from earwigbot.exceptions import *


class _CopyvioCheckResult(object):
    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return r.format(self.violation, self.confidence, self.url, self.queries)


class _MarkovChain(object):
    START = -1
    END = -2

    def __init__(self, text):
        self.text = text
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split()
        prev = self.START
        for word in words:
            self.chain[prev][word] += 1
            prev = word
        if words:  # Completely blank text has no end-of-text transition
            self.chain[words[-1]][self.END] += 1

    def size(self):
        count = 0
        for node in self.chain.itervalues():
            for hits in node.itervalues():
                count += hits
        return count
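
# A minimal sketch of what the chain stores: for _MarkovChain("the cat sat"),
# self.chain holds the transitions START -> "the", "the" -> "cat",
# "cat" -> "sat", and "sat" -> END, each with a count of 1, so size() is 4.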


class _MarkovChainIntersection(_MarkovChain):
    def __init__(self, mc1, mc2):
        self.chain = defaultdict(lambda: defaultdict(lambda: 0))
        c1 = mc1.chain
        c2 = mc2.chain
        for word, nodes1 in c1.iteritems():
            if word in c2:
                nodes2 = c2[word]
                for node, count1 in nodes1.iteritems():
                    if node in nodes2:
                        count2 = nodes2[node]
                        self.chain[word][node] = min(count1, count2)
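
# Continuing the sketch: intersecting the chains of "the cat sat" and
# "the cat ran" keeps only the transitions common to both (START -> "the"
# and "the" -> "cat"), so the intersection's size() is 2.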


class CopyrightMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

    This is a mixin that provides two public methods, copyvio_check() and
    copyvio_compare(). The former checks the page for copyright violations
    using a search engine API, and the latter compares the page against a
    specified URL. Credentials for the search engine API are stored in the
    site's config.
    """

    def __init__(self, site):
        self._opener = build_opener()
        self._opener.addheaders = site._opener.addheaders
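
    # Note: _select_search_engine() below reads self._site._search_config,
    # which this __init__ never sets, so the class this is mixed into
    # (presumably Page) is assumed to provide self._site itself.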

    def _open_url_ignoring_errors(self, url):
        """Open a URL using self._opener and return its content, or None.

        Will decompress the content if the headers contain "gzip" as its
        content encoding, and will return None if URLError is raised while
        opening the URL. IOErrors while gunzipping a compressed response are
        ignored, and the original content is returned.
        """
        try:
            response = self._opener.open(url)
        except URLError:
            return None
        result = response.read()
        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            try:
                result = gzipper.read()
            except IOError:
                pass
        return result
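
    # For example, self._open_url_ignoring_errors("http://example.com/")
    # (hypothetical URL) returns the response body as a string, transparently
    # gunzipped if the server compressed it, or None if the request failed.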

    def _select_search_engine(self):
        """Return a function that can be called to do web searches.

        The "function" is a functools.partial object that takes one argument,
        a query, and returns a list of URLs, ranked by importance. The
        underlying logic depends on the 'engine' argument; for example, if
        'engine' is "Yahoo! BOSS", we'll use self._yahoo_boss_query for
        querying.

        Raises UnknownSearchEngineError if the 'engine' listed in our config
        is unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine, credentials = self._site._search_config
        if engine == "Yahoo! BOSS":
            if not oauth:
                e = "The package 'oauth2' could not be imported"
                raise UnsupportedSearchEngineError(e)
            searcher = self._yahoo_boss_query
        else:
            raise UnknownSearchEngineError(engine)
        return partial(searcher, credentials)
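
    # A minimal usage sketch: the partial object binds the credentials, so
    # callers pass only the query string:
    #     search = self._select_search_engine()
    #     urls = search("some text")  # -> self._yahoo_boss_query(credentials, "some text")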

    def _yahoo_boss_query(self, cred, query):
        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

        Returns a list of URLs, no more than fifty, ranked by relevance (as
        determined by Yahoo). Raises SearchQueryError() on errors.
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        query = quote_plus(query.join(('"', '"')))  # Wrap the query in quotes
        params = {"q": query, "style": "raw", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))
        consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")
        if headers["status"] != "200":
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
            raise SearchQueryError(e.format(headers["status"], body))
        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)
        try:
            results = res["bossresponse"]["web"]["results"]
        except KeyError:
            return []
        return [result["url"] for result in results]
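
    # Inferred from the parsing above, the response body is expected to look
    # roughly like (abridged):
    #     {"bossresponse": {"web": {"results": [{"url": "http://..."}, ...]}}}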

    def _copyvio_strip_html(self, html):
        """
        STUB
        """
        return html

    def _copyvio_strip_article(self, content):
        """Clean the page's raw text by removing templates and formatting.

        Returns the page's text with all HTML and wikicode formatting removed,
        including templates, tables, references, and the Bibliography/
        References/Sources/See also section(s). It retains punctuation
        (spacing, paragraphs, periods, commas, (semi)colons, parentheses,
        quotes) and original capitalization, but not brackets (square and
        angular), abnormal spacing, or anything else. HTML entities are
        replaced by their unicode equivalents.

        STUB
        """
        return content

    def _copyvio_chunk_article(self, content, max_chunks):
        """
        STUB
        """
        return [content]

    def _copyvio_compare_content(self, article, url):
        """Return a confidence score and Markov chains for a comparison.

        'article' is a _MarkovChain built from the article's text. The URL's
        content is fetched with _open_url_ignoring_errors(), stripped of HTML,
        and built into its own chain; the confidence is the size of the
        intersection of the two chains divided by the size of the article's
        chain. Returns a (confidence, (source_chain, delta_chain)) tuple.
        """
        html = self._open_url_ignoring_errors(url)
        if not html:  # Failed fetch: zero confidence, empty chains
            empty = _MarkovChain("")
            return 0, (empty, _MarkovChainIntersection(empty, empty))
        source = _MarkovChain(self._copyvio_strip_html(html))
        delta = _MarkovChainIntersection(article, source)
        return float(delta.size()) / article.size(), (source, delta)
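
    # Continuing the earlier sketch: an article chain built from "the cat sat"
    # has size 4; if the source shares only START -> "the" and "the" -> "cat",
    # the delta has size 2 and the confidence is 2.0 / 4 = 0.5.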

    def copyvio_check(self, min_confidence=0.5, max_queries=-1,
                      interquery_sleep=1, force=False):
        """Check the page for copyright violations.

        Returns a _CopyvioCheckResult object with four useful attributes:
        "violation", "confidence", "url", and "queries". "confidence" is a
        number between 0 and 1; if it is less than "min_confidence", we could
        not find any indication of a violation (so "violation" will be False
        and "url" may or may not be None); otherwise it indicates the relative
        faith in our results, "violation" will be True, and "url" will be the
        place the article is suspected of being copied from. "queries" is the
        number of queries used to determine the results.

        "max_queries" is self-explanatory; we will never make more than this
        number of queries in a given check. If it's less than 0, we will not
        limit our number of queries.

        "interquery_sleep" is the minimum amount of time we will sleep between
        search engine queries, in seconds.

        "force" is simply passed to page.get() - it has the same behavior
        there as it does here.

        Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
        SearchQueryError, ...) on errors.
        """
        search = self._select_search_engine()
        handled_urls = []
        best_confidence = 0
        best_match = None
        num_queries = 0
        empty = _MarkovChain("")
        best_chains = (empty, _MarkovChainIntersection(empty, empty))
        content = self.get(force)
        clean = self._copyvio_strip_article(content)
        chunks = self._copyvio_chunk_article(clean, max_queries)
        article_chain = _MarkovChain(clean)
        last_query = time()

        if article_chain.size() < 20:  # Auto-fail very small articles
            return _CopyvioCheckResult(False, best_confidence, best_match,
                                       num_queries, article_chain, best_chains)

        while (chunks and best_confidence < min_confidence and
               (max_queries < 0 or num_queries < max_queries)):
            urls = search(chunks.pop(0))
            urls = [url for url in urls if url not in handled_urls]
            for url in urls:
                handled_urls.append(url)
                conf, chains = self._copyvio_compare_content(article_chain,
                                                             url)
                if conf > best_confidence:
                    best_confidence = conf
                    best_match = url
                    best_chains = chains
            num_queries += 1
            diff = time() - last_query
            if diff < interquery_sleep:
                sleep(interquery_sleep - diff)
            last_query = time()

        v = best_confidence >= min_confidence  # Did we find a violation?
        return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
                                   article_chain, best_chains)
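
    # Example usage, assuming a Page object from an earwigbot Site (names
    # hypothetical):
    #     result = page.copyvio_check(min_confidence=0.75, max_queries=15)
    #     if result.violation:
    #         print result.url, result.confidence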

    def copyvio_compare(self, url, min_confidence=0.5, force=False):
        """Check the page like copyvio_check(), but against a specific URL.

        This is essentially a reduced version of the above - a copyvio
        comparison is made using Markov chains and the result is returned in a
        _CopyvioCheckResult object - without using a search engine, as the
        suspected "violated" URL is supplied from the start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Toolserver site. After a
        search is done, the resulting URL is stored in a cache for 24 hours so
        future checks against that page will not require another set of
        time-and-money-consuming search engine queries. However, the
        comparison itself (which includes the article's and the source's
        content) cannot be stored for data retention reasons, so a fresh
        comparison is made using this function.

        Since no searching is done, neither UnknownSearchEngineError nor
        SearchQueryError will be raised.
        """
        content = self.get(force)
        clean = self._copyvio_strip_article(content)
        article_chain = _MarkovChain(clean)
        confidence, chains = self._copyvio_compare_content(article_chain, url)
        is_violation = confidence >= min_confidence
        return _CopyvioCheckResult(is_violation, confidence, url, 0,
                                   article_chain, chains)
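
    # Example usage (hypothetical cached URL), mirroring copyvio_check():
    #     result = page.copyvio_compare("http://example.com/mirror")
    #     assert result.queries == 0  # no search engine queries are made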