A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
8.4 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from time import sleep, time
  23. from urllib2 import build_opener
  24. from earwigbot import exceptions, importer
  25. from earwigbot.wiki.copyvios.markov import MarkovChain
  26. from earwigbot.wiki.copyvios.parsers import ArticleTextParser
  27. from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
  28. from earwigbot.wiki.copyvios.workers import (
  29. globalize, localize, CopyvioWorkspace)
# Lazily import oauth2: it is only needed for the Yahoo! BOSS search
# engine, so an ImportError is deferred until the module is actually
# touched (see CopyvioMixIn._get_search_engine).
oauth = importer.new("oauth2")

__all__ = ["CopyvioMixIn", "globalize", "localize"]
class CopyvioMixIn(object):
    """
    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**

    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
    and :py:meth:`copyvio_compare`. The former checks the page for copyright
    violations using a search engine API, and the latter compares the page
    against a given URL. Credentials for the search engine API are stored in
    the :py:class:`~earwigbot.wiki.site.Site`'s config.
    """

    def __init__(self, site):
        self._search_config = site._search_config
        # Optional exclusions database; may be absent (None), in which case
        # no URLs are excluded from checks.
        self._exclusions_db = self._search_config.get("exclusions_db")
        # Reuse the Site opener's HTTP headers so our requests carry the
        # same User-Agent, etc., as the Site's own requests.
        self._addheaders = site._opener.addheaders

    def _get_search_engine(self):
        """Return a function that can be called to do web searches.

        The function takes one argument, a search query, and returns a list of
        URLs, ranked by importance. The underlying logic depends on the
        *engine* argument within our config; for example, if *engine* is
        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

        Raises UnknownSearchEngineError if the 'engine' listed in our config is
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine = self._search_config["engine"]
        credentials = self._search_config["credentials"]

        if engine == "Yahoo! BOSS":
            try:
                oauth.__version__  # Force-load the lazy module
            except ImportError:
                e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
                raise exceptions.UnsupportedSearchEngineError(e)
            # Fresh opener with the Site's headers, passed to the engine for
            # making its API requests.
            opener = build_opener()
            opener.addheaders = self._addheaders
            return YahooBOSSSearchEngine(credentials, opener)

        raise exceptions.UnknownSearchEngineError(engine)

    def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
                      no_searches=False, no_links=False, short_circuit=True):
        """Check the page for copyright violations.

        Returns a :class:`.CopyvioCheckResult` object with information on the
        results of the check.

        *min_confidence* is the minimum amount of confidence we must have in
        the similarity between a source text and the article in order for us to
        consider it a suspected violation. This is a number between 0 and 1.

        *max_queries* is self-explanatory; we will never make more than this
        number of queries in a given check.

        *max_time* can be set to prevent copyvio checks from taking longer than
        a set amount of time (generally around a minute), which can be useful
        if checks are called through a web server with timeouts. We will stop
        checking new URLs as soon as this limit is reached.

        Setting *no_searches* to ``True`` will cause only URLs in the wikitext
        of the page to be checked; no search engine queries will be made.
        Setting *no_links* to ``True`` will cause the opposite to happen: URLs
        in the wikitext will be ignored; search engine queries will be made
        only. Setting both of these to ``True`` is pointless.

        Normally, the checker will short-circuit if it finds a URL that meets
        *min_confidence*. This behavior normally causes it to skip any
        remaining URLs and web queries, but setting *short_circuit* to
        ``False`` will prevent this.

        Raises :exc:`.CopyvioCheckError` or subclasses
        (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
        errors.
        """
        log = u"Starting copyvio check for [[{0}]]"
        self._logger.info(log.format(self.title))
        searcher = self._get_search_engine()
        parser = ArticleTextParser(self.get())
        article = MarkovChain(parser.strip())
        workspace = CopyvioWorkspace(
            article, min_confidence, max_time, self._logger, self._addheaders,
            short_circuit=short_circuit)

        if self._exclusions_db:
            # Refresh the exclusions list for this site, then build a
            # predicate the workspace uses to skip excluded URLs.
            self._exclusions_db.sync(self.site.name)
            exclude = lambda u: self._exclusions_db.check(self.site.name, u)
        else:
            exclude = None

        if article.size < 20:  # Auto-fail very small articles
            result = workspace.get_result()
            self._logger.info(result.get_log_message(self.title))
            return result

        if not no_links:
            workspace.enqueue(parser.get_links(), exclude)
        num_queries = 0
        if not no_searches:
            chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
            for chunk in chunks:
                if short_circuit and workspace.finished:
                    # A worker already found a violation; remaining queries
                    # are skipped, so note that we may have missed sources.
                    workspace.possible_miss = True
                    break
                log = u"[[{0}]] -> querying {1} for {2!r}"
                self._logger.debug(log.format(self.title, searcher.name, chunk))
                workspace.enqueue(searcher.search(chunk), exclude)
                num_queries += 1
                sleep(1)  # Rate-limit: pause between search engine queries

        workspace.wait()  # Block until all worker threads have finished
        result = workspace.get_result(num_queries)
        self._logger.info(result.get_log_message(self.title))
        return result

    def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
        """Check the page like :py:meth:`copyvio_check` against a specific URL.

        This is essentially a reduced version of :meth:`copyvio_check` - a
        copyvio comparison is made using Markov chains and the result is
        returned in a :class:`.CopyvioCheckResult` object - but without using a
        search engine, since the suspected "violated" URL is supplied from the
        start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Tool Labs site. After a
        search is done, the resulting URL is stored in a cache for 72 hours so
        future checks against that page will not require another set of
        time-and-money-consuming search engine queries. However, the comparison
        itself (which includes the article's and the source's content) cannot
        be stored for data retention reasons, so a fresh comparison is made
        using this function.

        Since no searching is done, neither :exc:`.UnknownSearchEngineError`
        nor :exc:`.SearchQueryError` will be raised.
        """
        log = u"Starting copyvio compare for [[{0}]] against {1}"
        self._logger.info(log.format(self.title, url))
        article = MarkovChain(ArticleTextParser(self.get()).strip())
        # Single URL, single worker: the extra positional args give the URL
        # fetch the full *max_time* budget and limit the pool to 1 worker.
        workspace = CopyvioWorkspace(
            article, min_confidence, max_time, self._logger, self._addheaders,
            max_time, 1)
        workspace.enqueue([url])
        workspace.wait()
        result = workspace.get_result()
        self._logger.info(result.get_log_message(self.title))
        return result