A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot

# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import deque
from gzip import GzipFile
from httplib import HTTPException
from logging import getLogger
from math import log
from Queue import Empty, Queue
from socket import error as socket_error
from StringIO import StringIO
from struct import error as struct_error
from threading import Lock, Thread
from time import time
from urllib2 import build_opener, URLError

from earwigbot import importer
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

tldextract = importer.new("tldextract")

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

_is_globalized = False
_global_queues = None
_global_workers = []


def globalize(num_workers=8):
    """Cause all copyvio checks to be done by one global set of workers.

    This is useful when checks are being done through a web interface where
    large numbers of simultaneous requests could be problematic. The global
    workers are spawned when the function is called, run continuously, and
    intelligently handle multiple checks.

    This function is not thread-safe and should only be called when no checks
    are being done. It has no effect if it has already been called.
    """
    global _is_globalized, _global_queues
    if _is_globalized:
        return

    _global_queues = _CopyvioQueues()
    for i in xrange(num_workers):
        worker = _CopyvioWorker("global-{0}".format(i), _global_queues)
        worker.start()
        _global_workers.append(worker)
    _is_globalized = True


def localize():
    """Return to using page-specific workers for copyvio checks.

    This disables changes made by :func:`globalize`, including stopping the
    global worker threads.

    This function is not thread-safe and should only be called when no checks
    are being done.
    """
    global _is_globalized, _global_queues, _global_workers
    if not _is_globalized:
        return

    for i in xrange(len(_global_workers)):
        _global_queues.unassigned.put((StopIteration, None))
    _global_queues = None
    _global_workers = []
    _is_globalized = False
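

# Usage sketch (illustrative only; assumes a long-running service, such as a
# web frontend, that runs checks from many threads):
#
#     globalize(num_workers=8)   # once, at startup: spawn the shared pool
#     ...                        # all checks now reuse the global workers
#     localize()                 # at shutdown: stop the global workers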


class _CopyvioQueues(object):
    """Stores data necessary to maintain the various queues during a check."""

    def __init__(self):
        self.lock = Lock()
        self.sites = {}  # Maps site domain keys to deques of sources
        self.unassigned = Queue()  # (site, queue) pairs awaiting a worker


class _CopyvioWorker(object):
    """A multithreaded URL opener/parser instance."""

    def __init__(self, name, queues, until=None):
        self._name = name
        self._queues = queues
        self._until = until
        self._site = None
        self._queue = None
        self._opener = build_opener()
        self._logger = getLogger("earwigbot.wiki.cvworker." + name)

    def _open_url(self, source):
        """Open a URL and return its parsed content, or None.

        First, we will decompress the content if the headers contain "gzip"
        as its content encoding. Then, we will return the content stripped
        using an HTML parser if the headers indicate it is HTML, or return
        the content directly if it is plain text. If we don't understand the
        content type, we'll return None.

        If a URLError was raised while opening the URL or an IOError was
        raised while decompressing, None will be returned.
        """
        if source.headers:
            self._opener.addheaders = source.headers
        url = source.url.encode("utf8")
        try:
            response = self._opener.open(url, timeout=source.timeout)
        except (URLError, HTTPException, socket_error):
            return None

        try:
            size = int(response.headers.get("Content-Length", 0))
        except ValueError:
            return None

        content_type = response.headers.get("Content-Type", "text/plain")
        handler = get_parser(content_type)
        if not handler:
            return None
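        # Cap the download size: 15 MiB for PDFs, 2 MiB for everything else.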
        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
            return None

        try:
            content = response.read()
        except (URLError, socket_error):
            return None

        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(content)
            gzipper = GzipFile(fileobj=stream)
            try:
                content = gzipper.read()
            except (IOError, struct_error):
                return None

        return handler(content).parse()

    def _acquire_new_site(self):
        """Block for a new unassigned site queue."""
        if self._until:
            timeout = self._until - time()
            if timeout <= 0:
                raise Empty
        else:
            timeout = None

        self._logger.debug("Waiting for new site queue")
        site, queue = self._queues.unassigned.get(timeout=timeout)
        if site is StopIteration:
            raise StopIteration
        self._logger.debug(u"Acquired new site queue: {0}".format(site))
        self._site = site
        self._queue = queue

    def _dequeue(self):
        """Remove a source from one of the queues."""
        if not self._site:
            self._acquire_new_site()

        logmsg = u"Fetching source URL from queue {0}"
        self._logger.debug(logmsg.format(self._site))
        self._queues.lock.acquire()
        try:
            source = self._queue.popleft()
        except IndexError:
            self._logger.debug("Queue is empty")
            del self._queues.sites[self._site]
            self._site = None
            self._queue = None
            self._queues.lock.release()
            return self._dequeue()

        self._logger.debug(u"Got source URL: {0}".format(source.url))
        if source.skipped:
            self._logger.debug("Source has been skipped")
            self._queues.lock.release()
            return self._dequeue()

        source.start_work()
        self._queues.lock.release()
        return source

    def _run(self):
        """Main entry point for the worker thread.

        We will keep fetching URLs from the queues and handling them until
        either we run out of time, or we get an exit signal that the queue is
        now empty.
        """
        while True:
            try:
                source = self._dequeue()
            except Empty:
                self._logger.debug("Exiting: queue timed out")
                return
            except StopIteration:
                self._logger.debug("Exiting: got stop signal")
                return

            text = self._open_url(source)
            chain = MarkovChain(text) if text else None
            source.workspace.compare(source, chain)

    def start(self):
        """Start the copyvio worker in a new thread."""
        thread = Thread(target=self._run, name="cvworker-" + self._name)
        thread.daemon = True
        thread.start()


class CopyvioWorkspace(object):
    """Manages a single copyvio check distributed across threads."""

    def __init__(self, article, min_confidence, max_time, logger, headers,
                 url_timeout=5, num_workers=8, short_circuit=True):
        self.sources = []
        self.finished = False
        self.possible_miss = False

        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._start_time = time()
        self._until = (self._start_time + max_time) if max_time > 0 else None
        self._handled_urls = set()
        self._finish_lock = Lock()
        self._short_circuit = short_circuit
        self._source_args = {"workspace": self, "headers": headers,
                             "timeout": url_timeout}

        if _is_globalized:
            self._queues = _global_queues
        else:
            self._queues = _CopyvioQueues()
            self._num_workers = num_workers
            for i in xrange(num_workers):
                name = "local-{0:04}.{1}".format(id(self) % 10000, i)
                _CopyvioWorker(name, self._queues, self._until).start()

    def _calculate_confidence(self, delta):
        """Return the confidence of a violation as a float between 0 and 1."""
        def conf_with_article_and_delta(article, delta):
            """Calculate confidence using the article and delta chain sizes."""
            # This piecewise function exhibits exponential growth until it
            # reaches the default "suspect" confidence threshold, at which
            # point it transitions to polynomial growth with a limit of 1 as
            # (delta / article) approaches 1.
            # A graph can be viewed here: http://goo.gl/mKPhvr
            ratio = delta / article
            if ratio <= 0.52763:
                return -log(1 - ratio)
            else:
                return (-0.8939 * (ratio ** 2)) + (1.8948 * ratio) - 0.0009
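            # Worked check: at the crossover ratio 0.52763 both pieces give
            # -log(1 - 0.52763) ≈ 0.75 (the "suspect" threshold noted above),
            # and the polynomial reaches exactly 1.0 at ratio = 1, since
            # -0.8939 + 1.8948 - 0.0009 = 1.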

        def conf_with_delta(delta):
            """Calculate confidence using just the delta chain size."""
            # This piecewise function was derived from experimental data using
            # reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9),
            # and (1000, 0.95), with a limit of 1 as delta approaches infinity.
            # A graph can be viewed here: http://goo.gl/lVl7or
            if delta <= 100:
                return delta / (delta + 100)
            elif delta <= 250:
                return (delta - 25) / (delta + 50)
            elif delta <= 500:
                return (10.5 * delta - 750) / (10 * delta)
            else:
                return (delta - 50) / delta
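            # Worked check against the reference points above:
            #   delta = 100  -> 100 / 200   = 0.5
            #   delta = 250  -> 225 / 300   = 0.75
            #   delta = 500  -> 4500 / 5000 = 0.9
            #   delta = 1000 -> 950 / 1000  = 0.95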

        d_size = float(delta.size)
        return abs(max(conf_with_article_and_delta(self._article.size, d_size),
                       conf_with_delta(d_size)))

    def _finish_early(self):
        """Finish handling links prematurely (if we've hit min_confidence)."""
        self._logger.debug("Confidence threshold met; skipping remaining sources")
        with self._queues.lock:
            for source in self.sources:
                source.skip()
            self.finished = True

    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the various worker queues.

        *exclude_check* is an optional exclusion function that takes a URL
        and returns ``True`` if we should skip it and ``False`` otherwise.
        """
        for url in urls:
            with self._queues.lock:
                if url in self._handled_urls:
                    continue
                self._handled_urls.add(url)
                if exclude_check and exclude_check(url):
                    continue

                source = CopyvioSource(url=url, **self._source_args)
                self.sources.append(source)

                if self._short_circuit and self.finished:
                    self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                    source.skip()
                    continue

                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                logmsg = u"enqueue(): {0} {1} -> {2}"
                if key in self._queues.sites:
                    self._logger.debug(logmsg.format("append", key, url))
                    self._queues.sites[key].append(source)
                else:
                    self._logger.debug(logmsg.format("new", key, url))
                    self._queues.sites[key] = queue = deque()
                    queue.append(source)
                    self._queues.unassigned.put((key, queue))
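
    # For example (hypothetical caller; any URL predicate works as the
    # exclusion function):
    #
    #     workspace.enqueue(urls,
    #                       exclude_check=lambda url: "wikipedia.org" in url)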

    def compare(self, source, source_chain):
        """Compare a source to the article; call _finish_early if necessary."""
        if source_chain:
            delta = MarkovChainIntersection(self._article, source_chain)
            conf = self._calculate_confidence(delta)
        else:
            conf = 0.0
        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))

        with self._finish_lock:
            if source_chain:
                source.update(conf, source_chain, delta)
            source.finish_work()
            if not self.finished and conf >= self._min_confidence:
                if self._short_circuit:
                    self._finish_early()
                else:
                    self.finished = True

    def wait(self):
        """Wait for the workers to finish handling the sources."""
        self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
        for source in self.sources:
            source.join(self._until)

        with self._finish_lock:
            pass  # Wait for any remaining comparisons to be finished

        if not _is_globalized:
            for i in xrange(self._num_workers):
                self._queues.unassigned.put((StopIteration, None))

    def get_result(self, num_queries=0):
        """Return a CopyvioCheckResult containing the results of this check."""
        def cmpfunc(s1, s2):
            if s2.confidence != s1.confidence:
                return 1 if s2.confidence > s1.confidence else -1
            return int(s1.skipped) - int(s2.skipped)

        self.sources.sort(cmpfunc)
        return CopyvioCheckResult(self.finished, self.sources, num_queries,
                                  time() - self._start_time, self._article,
                                  self.possible_miss)
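

# End-to-end sketch (illustrative only; in EarwigBot this flow is driven by
# the copyvio-check entry points, and `article_chain`, `urls`, `log`, and
# `headers` are placeholders, with `article_chain` a MarkovChain built from
# the article text and 0.75 a sample confidence threshold):
#
#     workspace = CopyvioWorkspace(article_chain, min_confidence=0.75,
#                                  max_time=30, logger=log, headers=headers)
#     workspace.enqueue(urls)
#     workspace.wait()
#     result = workspace.get_result(num_queries=len(urls))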