A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot

# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import deque
from gzip import GzipFile
from logging import getLogger
from math import log
from Queue import Empty, Queue
from socket import error
from StringIO import StringIO
from threading import Event, Lock, Thread
from time import time
from urllib2 import build_opener, URLError

from earwigbot import importer
from earwigbot.wiki.copyvios.markov import (
    EMPTY, EMPTY_INTERSECTION, MarkovChain, MarkovChainIntersection)
from earwigbot.wiki.copyvios.parsers import HTMLTextParser

tldextract = importer.new("tldextract")

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

_is_globalized = False
_global_queues = None
_global_workers = []


def globalize(num_workers=8):
    """Cause all copyvio checks to be done by one global set of workers.

    This is useful when checks are being done through a web interface where
    large numbers of simultaneous requests could be problematic. The global
    workers are spawned when the function is called, run continuously, and
    intelligently handle multiple checks.

    This function is not thread-safe and should only be called when no checks
    are being done. It has no effect if it has already been called.
    """
    global _is_globalized, _global_queues
    if _is_globalized:
        return
    _global_queues = _CopyvioQueues()
    for i in xrange(num_workers):
        worker = _CopyvioWorker("global-{0}".format(i), _global_queues)
        _global_workers.append(worker)
    _is_globalized = True


def localize():
    """Return to using page-specific workers for copyvio checks.

    This disables changes made by :func:`globalize`, including stopping the
    global worker threads.

    This function is not thread-safe and should only be called when no checks
    are being done.
    """
    global _is_globalized, _global_queues, _global_workers
    if not _is_globalized:
        return
    for i in xrange(len(_global_workers)):
        _global_queues.unassigned.put((StopIteration, None))
    _global_queues = None
    _global_workers = []
    _is_globalized = False
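
# Usage sketch (added commentary, not in the original file): a web interface
# serving many simultaneous checks might call globalize() once at startup and
# localize() at shutdown, assuming this module's import path:
#
#     from earwigbot.wiki.copyvios import workers
#     workers.globalize(num_workers=4)  # spawn the shared worker pool
#     # ... each CopyvioWorkspace created afterwards reuses that pool ...
#     workers.localize()                # stop the global workers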


class _CopyvioSource(object):
    """Represents a single suspected violation source (a URL)."""

    def __init__(self, workspace, url, key, headers=None, timeout=5):
        self.workspace = workspace
        self.url = url
        self.key = key
        self.headers = headers
        self.timeout = timeout
        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)
        # _event1 fires once the source is touched (worked on or cancelled);
        # _event2 is cleared while a comparison is in progress.
        self._event1 = Event()
        self._event2 = Event()
        self._event2.set()

    def touched(self):
        """Return whether start_work() or cancel() has been called."""
        return self._event1.is_set()

    def start_work(self):
        """Mark this source as being worked on right now."""
        self._event2.clear()
        self._event1.set()

    def finish_work(self, confidence, source_chain, delta_chain):
        """Complete the confidence information inside this source."""
        self.confidence = confidence
        self.chains = (source_chain, delta_chain)
        self._event2.set()

    def cancel(self):
        """Deactivate this source without filling in the relevant data."""
        self._event1.set()

    def join(self, until):
        """Block until this violation result is filled out."""
        for event in [self._event1, self._event2]:
            if until:
                timeout = until - time()
                if timeout <= 0:
                    return
                event.wait(timeout)


class _CopyvioQueues(object):
    """Stores data necessary to maintain the various queues during a check."""

    def __init__(self):
        self.lock = Lock()
        self.sites = {}
        self.unassigned = Queue()
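
# Shape sketch (added commentary): after enqueue()-ing two URLs on
# example.com and one on example.org, these structures would hold:
#
#     sites      = {"example.com": deque([src1, src2]),
#                   "example.org": deque([src3])}
#     unassigned = Queue containing ("example.com", <that deque>) and
#                  ("example.org", <that deque>)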


class _CopyvioWorker(object):
    """A multithreaded URL opener/parser instance."""

    def __init__(self, name, queues, until=None):
        self._queues = queues
        self._until = until
        self._thread = thread = Thread(target=self._run)
        self._site = None
        self._queue = None
        self._opener = build_opener()
        self._logger = getLogger("earwigbot.wiki.cvworker." + name)
        thread.name = "cvworker-" + name
        thread.daemon = True
        thread.start()

    def _open_url(self, source):
        """Open a URL and return its parsed content, or None.

        First, we will decompress the content if the headers contain "gzip"
        as its content encoding. Then, we will return the content stripped
        using an HTML parser if the headers indicate it is HTML, or return
        the content directly if it is plain text. If we don't understand the
        content type, we'll return None.

        If a URLError was raised while opening the URL or an IOError was
        raised while decompressing, None will be returned.
        """
        self._opener.addheaders = source.headers
        url = source.url.encode("utf8")
        try:
            response = self._opener.open(url, timeout=source.timeout)
        except (URLError, error):
            return None

        try:
            size = int(response.headers.get("Content-Length", 0))
        except ValueError:
            return None
        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
            return None

        ctype_full = response.headers.get("Content-Type", "text/plain")
        ctype = ctype_full.split(";", 1)[0]
        if ctype in ["text/html", "application/xhtml+xml"]:
            handler = lambda res: HTMLTextParser(res).strip()
        elif ctype == "text/plain":
            handler = lambda res: res.strip()
        else:
            return None

        try:
            content = response.read()
        except (URLError, error):
            return None

        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(content)
            gzipper = GzipFile(fileobj=stream)
            try:
                content = gzipper.read(2 * 1024 ** 2)
            except IOError:
                return None

        return handler(content)

    def _acquire_new_site(self):
        """Block for a new unassigned site queue."""
        if self._until:
            timeout = self._until - time()
            if timeout <= 0:
                raise Empty
        else:
            timeout = None

        self._logger.debug("Waiting for new site queue")
        site, queue = self._queues.unassigned.get(timeout=timeout)
        if site is StopIteration:
            raise StopIteration
        self._logger.debug(u"Acquired new site queue: {0}".format(site))
        self._site = site
        self._queue = queue

    def _dequeue(self):
        """Remove a source from one of the queues."""
        if not self._site:
            self._acquire_new_site()

        logmsg = u"Fetching source URL from queue {0}"
        self._logger.debug(logmsg.format(self._site))
        self._queues.lock.acquire()
        try:
            source = self._queue.popleft()
        except IndexError:
            self._logger.debug("Queue is empty")
            del self._queues.sites[self._site]
            self._site = None
            self._queue = None
            self._queues.lock.release()
            return self._dequeue()

        self._logger.debug(u"Got source URL: {0}".format(source.url))
        if source.touched():
            self._logger.debug("Source has been cancelled")
            self._queues.lock.release()
            return self._dequeue()

        source.start_work()
        self._queues.lock.release()
        return source

    def _run(self):
        """Main entry point for the worker thread.

        We will keep fetching URLs from the queues and handling them until
        either we run out of time, or we get an exit signal that the queue is
        now empty.
        """
        while True:
            try:
                source = self._dequeue()
            except Empty:
                self._logger.debug("Exiting: queue timed out")
                return
            except StopIteration:
                self._logger.debug("Exiting: got stop signal")
                return
            text = self._open_url(source)
            source.workspace.compare(source, MarkovChain(text or ""))
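
# Design note (added commentary): each worker owns at most one site queue at
# a time, draining it fully before asking the unassigned queue for another.
# Because all sources for a given domain share a single deque, at most one
# worker fetches from any one domain at a time.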


class CopyvioWorkspace(object):
    """Manages a single copyvio check distributed across threads."""

    def __init__(self, article, min_confidence, until, logger, headers,
                 url_timeout=5, num_workers=8):
        self.best = _CopyvioSource(self, None, None)
        self.sources = []
        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._until = until
        self._handled_urls = []
        self._is_finished = False
        self._compare_lock = Lock()
        self._source_args = {"workspace": self, "headers": headers,
                             "timeout": url_timeout}

        if _is_globalized:
            self._queues = _global_queues
        else:
            self._queues = _CopyvioQueues()
            self._num_workers = num_workers
            for i in xrange(num_workers):
                name = "local-{0:04}.{1}".format(id(self) % 10000, i)
                worker = _CopyvioWorker(name, self._queues, until)

    def _calculate_confidence(self, delta):
        """Return the confidence of a violation as a float between 0 and 1."""
        def conf_with_article_and_delta(article, delta):
            """Calculate confidence using the article and delta chain sizes."""
            # This piecewise function, C_AΔ(A, Δ), was defined such that
            # confidence exhibits exponential growth until it reaches the
            # default "suspect" confidence threshold, at which point it
            # transitions to polynomial growth with lim (A/Δ)→1 C_AΔ(A, Δ) = 1.
            # A graph can be viewed here:
            # http://benkurtovic.com/static/article-delta_confidence_function.pdf
            ratio = delta / article
            if ratio <= 0.52763:
                return log(1 / (1 - ratio))
            else:
                return (-0.8939 * (ratio ** 2)) + (1.8948 * ratio) - 0.0009

        def conf_with_delta(delta):
            """Calculate confidence using just the delta chain size."""
            # This piecewise function, C_Δ(Δ), was derived from experimental
            # data using reference points at (0, 0), (100, 0.5), (250, 0.75),
            # (500, 0.9), and (1000, 0.95) with lim Δ→+∞ C_Δ(Δ) = 1.
            # A graph can be viewed here:
            # http://benkurtovic.com/static/delta_confidence_function.pdf
            if delta <= 100:
                return delta / (delta + 100)
            elif delta <= 250:
                return (delta - 25) / (delta + 50)
            elif delta <= 500:
                return (10.5 * delta - 750) / (10 * delta)
            else:
                return (delta - 50) / delta

        d_size = float(delta.size)
        return max(conf_with_article_and_delta(self._article.size, d_size),
                   conf_with_delta(d_size))
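
    # Spot-check (added commentary): the reference points quoted above fall
    # out of conf_with_delta() directly:
    #     delta = 100   ->  100 / 200            = 0.50
    #     delta = 250   ->  225 / 300            = 0.75
    #     delta = 500   ->  (5250 - 750) / 5000  = 0.90
    #     delta = 1000  ->  950 / 1000           = 0.95
    # and the two branches of conf_with_article_and_delta() meet at the
    # "suspect" threshold: log(1 / (1 - 0.52763)) ≈ 0.75, matching
    # -0.8939 * 0.52763**2 + 1.8948 * 0.52763 - 0.0009 ≈ 0.75.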

    def _finish_early(self):
        """Finish handling links prematurely (if we've hit min_confidence)."""
        if self._is_finished:
            return
        self._logger.debug("Confidence threshold met; cancelling remaining sources")
        with self._queues.lock:
            for source in self.sources:
                source.cancel()
            self._is_finished = True

    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the various worker queues.

        *exclude_check* is an optional exclusion function that takes a URL
        and returns ``True`` if we should skip it and ``False`` otherwise.
        """
        for url in urls:
            with self._queues.lock:
                if self._is_finished:
                    break
                if url in self._handled_urls:
                    continue
                self._handled_urls.append(url)
                if exclude_check and exclude_check(url):
                    continue

                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                source = _CopyvioSource(url=url, key=key, **self._source_args)
                self.sources.append(source)
                logmsg = u"enqueue(): {0} {1} -> {2}"
                if key in self._queues.sites:
                    self._logger.debug(logmsg.format("append", key, url))
                    self._queues.sites[key].append(source)
                else:
                    self._logger.debug(logmsg.format("new", key, url))
                    self._queues.sites[key] = queue = deque()
                    queue.append(source)
                    self._queues.unassigned.put((key, queue))

    def wait(self):
        """Wait for the workers to finish handling the sources."""
        self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
        for source in self.sources:
            source.join(self._until)
        with self._compare_lock:
            pass  # Wait for any remaining comparisons to be finished
        if not _is_globalized:
            for i in xrange(self._num_workers):
                self._queues.unassigned.put((StopIteration, None))

    def compare(self, source, source_chain):
        """Compare a source to the article, and update the best known one."""
        delta = MarkovChainIntersection(self._article, source_chain)
        conf = self._calculate_confidence(delta)
        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
        with self._compare_lock:
            source.finish_work(conf, source_chain, delta)
            if conf > self.best.confidence:
                self.best = source
                if conf >= self._min_confidence:
                    self._finish_early()
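
A minimal driver sketch (added for illustration; `article_text` and
`candidate_urls` are placeholders supplied by the caller, and the real entry
point lives elsewhere in the package):

    from logging import getLogger
    from time import time
    from earwigbot.wiki.copyvios.markov import MarkovChain

    article_chain = MarkovChain(article_text)  # the article's text as a chain
    workspace = CopyvioWorkspace(
        article_chain, min_confidence=0.75, until=time() + 30,
        logger=getLogger("example"), headers=[("User-Agent", "example-bot")])
    workspace.enqueue(candidate_urls)  # e.g. URLs from a search engine
    workspace.wait()                   # block until workers finish or time out
    print workspace.best.url, workspace.best.confidence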