A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import namedtuple
from gzip import GzipFile
from Queue import Empty, Queue
from socket import timeout
from StringIO import StringIO
from threading import Lock, Semaphore, Thread
from time import sleep, time
from urllib2 import build_opener, URLError

from earwigbot import exceptions, importer
from earwigbot.wiki.copyvios.markov import (
    EMPTY, EMPTY_INTERSECTION, MarkovChain, MarkovChainIntersection)
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine

oauth = importer.new("oauth2")
tldextract = importer.new("tldextract")

__all__ = ["CopyvioMixIn"]

_WorkingResult = namedtuple("_WorkingResult", ["url", "confidence", "chains"])

class _CopyvioWorkspace(object):
    """Manages a single copyvio check distributed across threads."""

    def __init__(self, article, min_confidence, until, logger, headers,
                 url_timeout=5, max_concurrent_requests=6):
        self.best = _WorkingResult(None, 0.0, (EMPTY, EMPTY_INTERSECTION))
        self.request_semaphore = Semaphore(max_concurrent_requests)

        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._handled_urls = []
        self._is_finished = False
        self._enqueue_lock = Lock()
        self._result_lock = Lock()
        self._workers = {}
        self._worker_args = (self, until, headers, url_timeout)

    def _calculate_confidence(self, delta):
        """Return the confidence of a violation as a float between 0 and 1."""
        return float(delta.size()) / self._article.size()
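
    # Worked example (illustrative numbers, not taken from any real check): if
    # the article's chain has a size of 400 word transitions and the
    # intersection with a source's chain has a size of 120, the confidence is
    # 120.0 / 400 = 0.3, below copyvio_check's default min_confidence of 0.5.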

    def _finish_early(self):
        """Finish handling links prematurely (if we've hit min_confidence)."""
        self._logger.debug("Confidence threshold met; clearing worker queues")
        with self._enqueue_lock:
            for worker in self._workers.itervalues():
                with worker.queue.mutex:
                    worker.queue.queue.clear()  # Empty the underlying deque
                worker.queue.put(None)  # Exit signal, sent outside the mutex
            self._is_finished = True

    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the worker queue.

        *exclude_check* is an optional exclusion function that takes a URL and
        returns ``True`` if we should skip it and ``False`` otherwise.
        """
        for url in urls:
            with self._enqueue_lock:
                if self._is_finished:
                    break
                if url in self._handled_urls:
                    continue
                self._handled_urls.append(url)
                if exclude_check and exclude_check(url):
                    continue

                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                logmsg = "enqueue(): {0} {1} -> {2}"
                if key in self._workers:
                    self._logger.debug(logmsg.format("PUT", key, url))
                    self._workers[key].queue.put(url)
                else:
                    self._logger.debug(logmsg.format("NEW", key, url))
                    worker = _CopyvioWorker(*self._worker_args)
                    worker.queue.put(url)
                    thread = Thread(target=worker.run)
                    thread.name = "cvworker-" + key.encode("utf8")
                    thread.daemon = True
                    thread.start()
                    worker.thread = thread  # Kept so wait() can join the thread
                    self._workers[key] = worker

    def wait(self):
        """Wait for the workers to finish handling the queue."""
        self._logger.debug("Waiting on {0} workers".format(len(self._workers)))
        for worker in self._workers.itervalues():
            worker.queue.put(None)  # Exit signal to workers
        for worker in self._workers.itervalues():
            worker.thread.join()

    def compare(self, url, source):
        """Compare a source to the article, and update the working result."""
        delta = MarkovChainIntersection(self._article, source)
        confidence = self._calculate_confidence(delta)
        self._logger.debug("compare(): {0} -> {1}".format(url, confidence))
        with self._result_lock:
            if confidence > self.best.confidence:
                self.best = _WorkingResult(url, confidence, (source, delta))
            if confidence >= self._min_confidence:
                self._finish_early()

class _CopyvioWorker(object):
    """A multithreaded URL opener/parser instance."""

    def __init__(self, workspace, until, headers, url_timeout):
        self.queue = Queue()
        self._workspace = workspace
        self._until = until
        self._opener = build_opener()
        self._opener.addheaders = headers
        self._url_timeout = url_timeout

    def _open_url(self, url):
        """Open a URL and return its parsed content, or None.

        First, we will decompress the content if the headers contain "gzip" as
        its content encoding. Then, we will return the content stripped using
        an HTML parser if the headers indicate it is HTML, or return the
        content directly if it is plain text. If we don't understand the
        content type, we'll return None.

        If a URLError was raised while opening the URL or an IOError was
        raised while decompressing, None will be returned.
        """
        with self._workspace.request_semaphore:
            try:
                response = self._opener.open(url, timeout=self._url_timeout)
                result = response.read()
            except (URLError, timeout):
                return None

            if response.headers.get("Content-Encoding") == "gzip":
                stream = StringIO(result)
                gzipper = GzipFile(fileobj=stream)
                try:
                    result = gzipper.read()
                except IOError:
                    return None

            ctype_full = response.headers.get("Content-Type", "text/plain")
            ctype = ctype_full.split(";", 1)[0]
            if ctype in ["text/html", "application/xhtml+xml"]:
                return HTMLTextParser(result).strip()
            elif ctype == "text/plain":
                return result.strip()
            else:
                return None

    def run(self):
        """Main entry point for the worker.

        We will keep fetching URLs from the queue and handling them until
        either we run out of time, or we get an exit signal that the queue is
        now empty.
        """
        while True:
            if self._until:
                max_time = self._until - time()
                if max_time <= 0:
                    return
                try:
                    url = self.queue.get(timeout=max_time)
                except Empty:
                    return
            else:
                url = self.queue.get()
            if url is None:  # Exit signal
                return
            text = self._open_url(url.encode("utf8"))
            if text:
                self._workspace.compare(url, MarkovChain(text))

class CopyvioMixIn(object):
    """
    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**

    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
    and :py:meth:`copyvio_compare`. The former checks the page for copyright
    violations using a search engine API, and the latter compares the page
    against a given URL. Credentials for the search engine API are stored in
    the :py:class:`~earwigbot.wiki.site.Site`'s config.
    """

    def __init__(self, site):
        self._search_config = site._search_config
        self._exclusions_db = self._search_config.get("exclusions_db")
        self._addheaders = site._opener.addheaders

    def _get_search_engine(self):
        """Return a function that can be called to do web searches.

        The function takes one argument, a search query, and returns a list of
        URLs, ranked by importance. The underlying logic depends on the
        *engine* argument within our config; for example, if *engine* is
        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

        Raises UnknownSearchEngineError if the 'engine' listed in our config is
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine = self._search_config["engine"]
        credentials = self._search_config["credentials"]

        if engine == "Yahoo! BOSS":
            try:
                oauth.__version__  # Force-load the lazy module
            except ImportError:
                e = ("Yahoo! BOSS requires the 'oauth2' package: "
                     "https://github.com/simplegeo/python-oauth2")
                raise exceptions.UnsupportedSearchEngineError(e)
            opener = build_opener()
            opener.addheaders = self._addheaders
            return YahooBOSSSearchEngine(credentials, opener)

        raise exceptions.UnknownSearchEngineError(engine)

    def copyvio_check(self, min_confidence=0.5, max_queries=15, max_time=-1):
        """Check the page for copyright violations.

        Returns a :class:`.CopyvioCheckResult` object with information on the
        results of the check.

        *min_confidence* is the minimum amount of confidence we must have in
        the similarity between a source text and the article in order for us to
        consider it a suspected violation. This is a number between 0 and 1.

        *max_queries* is self-explanatory; we will never make more than this
        number of queries in a given check.

        *max_time* can be set to prevent copyvio checks from taking longer than
        a set amount of time (generally around a minute), which can be useful
        if checks are called through a web server with timeouts. We will stop
        checking new URLs as soon as this limit is reached.

        Raises :exc:`.CopyvioCheckError` or subclasses
        (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
        errors.
        """
        log = "Starting copyvio check for [[{0}]]"
        self._logger.info(log.format(self.title))
        start_time = time()
        until = (start_time + max_time) if max_time > 0 else None
        searcher = self._get_search_engine()
        parser = ArticleTextParser(self.get())
        article = MarkovChain(parser.strip())
        workspace = _CopyvioWorkspace(article, min_confidence, until,
                                      self._logger, self._addheaders)

        if self._exclusions_db:
            self._exclusions_db.sync(self.site.name)
            exclude = lambda u: self._exclusions_db.check(self.site.name, u)
        else:
            exclude = None

        if article.size() < 20:  # Auto-fail very small articles
            result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
                                        workspace.best.chains)
            self._logger.info(result.get_log_message(self.title))
            return result

        workspace.enqueue(parser.get_links(), exclude)
        chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
        num_queries = 0
        for chunk in chunks:
            if workspace.best.confidence >= min_confidence:
                break
            log = u"[[{0}]] -> querying {1} for {2!r}"
            self._logger.debug(log.format(self.title, searcher.name, chunk))
            workspace.enqueue(searcher.search(chunk), exclude)
            num_queries += 1
            sleep(1)

        workspace.wait()
        result = CopyvioCheckResult(
            workspace.best.confidence >= min_confidence,
            workspace.best.confidence, workspace.best.url, num_queries,
            time() - start_time, article, workspace.best.chains)
        self._logger.info(result.get_log_message(self.title))
        return result

    def copyvio_compare(self, url, min_confidence=0.5, max_time=30):
        """Check the page like :py:meth:`copyvio_check` against a specific URL.

        This is essentially a reduced version of :meth:`copyvio_check` - a
        copyvio comparison is made using Markov chains and the result is
        returned in a :class:`.CopyvioCheckResult` object - but without using a
        search engine, since the suspected "violated" URL is supplied from the
        start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Tool Labs site. After a
        search is done, the resulting URL is stored in a cache for 72 hours so
        future checks against that page will not require another set of
        time-and-money-consuming search engine queries. However, the comparison
        itself (which includes the article's and the source's content) cannot
        be stored for data retention reasons, so a fresh comparison is made
        using this function.

        Since no searching is done, neither :exc:`.UnknownSearchEngineError`
        nor :exc:`.SearchQueryError` will be raised.
        """
        log = "Starting copyvio compare for [[{0}]] against {1}"
        self._logger.info(log.format(self.title, url))
        start_time = time()
        until = (start_time + max_time) if max_time > 0 else None
        article = MarkovChain(ArticleTextParser(self.get()).strip())
        # max_time doubles as the URL timeout here (passed as the workspace's
        # url_timeout argument), since only a single URL is fetched.
        workspace = _CopyvioWorkspace(article, min_confidence, until,
                                      self._logger, self._addheaders, max_time)
        workspace.enqueue([url])
        workspace.wait()
        url, conf, chains = workspace.best
        result = CopyvioCheckResult(conf >= min_confidence, conf, url, 0,
                                    time() - start_time, article, chains)
        self._logger.info(result.get_log_message(self.title))
        return result
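
For reference, here is a minimal usage sketch of the two public methods, assuming a working EarwigBot configuration. The config path, page title, and source URL are placeholders, and the printed result attributes (violation, confidence, url) are assumptions about CopyvioCheckResult rather than something defined in this module.

from earwigbot.bot import Bot

bot = Bot("path/to/project")             # directory containing config.yml
site = bot.wiki.get_site()               # default site from the bot's config
page = site.get_page("Example article")  # Page objects mix in CopyvioMixIn

# Full check: search-engine queries plus external links found in the article.
check = page.copyvio_check(min_confidence=0.5, max_queries=10, max_time=60)
print check.violation, check.confidence, check.url

# Re-check against one known URL, skipping the search engine entirely.
compare = page.copyvio_compare("http://example.com/source", min_confidence=0.5)
print compare.confidence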