A Python robot that edits Wikipedia and interacts with people over IRC: https://en.wikipedia.org/wiki/User:EarwigBot

You can't select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-'), and be at most 35 characters long.

220 lines
9.0 KiB

# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Standard library.
from time import sleep
from urllib.request import build_opener

# Project-local (earwigbot) modules.
from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.search import SEARCH_ENGINES
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize

# Public API of this module; globalize/localize are re-exported from workers.
__all__ = ["CopyvioMixIn", "globalize", "localize"]
class CopyvioMixIn:
    """
    **EarwigBot: Wiki Toolset: Copyright Violation MixIn**

    This is a mixin that provides two public methods, :py:meth:`copyvio_check`
    and :py:meth:`copyvio_compare`. The former checks the page for copyright
    violations using a search engine API, and the latter compares the page
    against a given URL. Credentials for the search engine API are stored in
    the :py:class:`~earwigbot.wiki.site.Site`'s config.
    """

    def __init__(self, site):
        # Search-engine settings (engine name, credentials, etc.) live on the
        # Site; keep a direct reference so the methods below don't have to
        # reach back through it.
        self._search_config = site._search_config
        # Optional database of excluded (mirror/fork) URLs; None when the
        # "exclusions_db" key is not present in the search config.
        self._exclusions_db = self._search_config.get("exclusions_db")
        # Headers attached to every outgoing HTTP request made during checks.
        self._addheaders = [
            ("User-Agent", site.user_agent),
            ("Accept-Encoding", "gzip"),
        ]

    def _get_search_engine(self):
        """Return a function that can be called to do web searches.

        The function takes one argument, a search query, and returns a list of
        URLs, ranked by importance. The underlying logic depends on the
        *engine* argument within our config; for example, if *engine* is
        "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

        Raises UnknownSearchEngineError if the 'engine' listed in our config is
        unknown to us, and UnsupportedSearchEngineError if we are missing a
        required package or module, like oauth2 for "Yahoo! BOSS".
        """
        engine = self._search_config["engine"]
        if engine not in SEARCH_ENGINES:
            raise exceptions.UnknownSearchEngineError(engine)
        klass = SEARCH_ENGINES[engine]
        credentials = self._search_config["credentials"]
        # Build a urllib opener that sends our standard headers with every
        # request the search engine makes.
        opener = build_opener()
        opener.addheaders = self._addheaders
        for dep in klass.requirements():
            try:
                # Import purely to verify the dependency is installed; the
                # attribute access forces evaluation of the module object.
                __import__(dep).__name__
            except (ImportError, AttributeError):
                e = "Missing a required dependency ({}) for the {} engine"
                e = e.format(dep, engine)
                raise exceptions.UnsupportedSearchEngineError(e)
        return klass(credentials, opener)

    def copyvio_check(
        self,
        min_confidence=0.75,
        max_queries=15,
        max_time=-1,
        no_searches=False,
        no_links=False,
        short_circuit=True,
    ):
        """Check the page for copyright violations.

        Returns a :class:`.CopyvioCheckResult` object with information on the
        results of the check.

        *min_confidence* is the minimum amount of confidence we must have in
        the similarity between a source text and the article in order for us to
        consider it a suspected violation. This is a number between 0 and 1.

        *max_queries* is self-explanatory; we will never make more than this
        number of queries in a given check.

        *max_time* can be set to prevent copyvio checks from taking longer than
        a set amount of time (generally around a minute), which can be useful
        if checks are called through a web server with timeouts. We will stop
        checking new URLs as soon as this limit is reached.

        Setting *no_searches* to ``True`` will cause only URLs in the wikitext
        of the page to be checked; no search engine queries will be made.
        Setting *no_links* to ``True`` will cause the opposite to happen: URLs
        in the wikitext will be ignored; search engine queries will be made
        only. Setting both of these to ``True`` is pointless.

        Normally, the checker will short-circuit if it finds a URL that meets
        *min_confidence*. This behavior normally causes it to skip any
        remaining URLs and web queries, but setting *short_circuit* to
        ``False`` will prevent this.

        Raises :exc:`.CopyvioCheckError` or subclasses
        (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
        errors.
        """
        log = "Starting copyvio check for [[{0}]]"
        self._logger.info(log.format(self.title))
        searcher = self._get_search_engine()
        parser = ArticleTextParser(
            self.get(),
            args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang},
        )
        # Markov-chain model of the article text, compared against each source.
        article = MarkovChain(parser.strip())
        parser_args = {}

        if self._exclusions_db:
            # Refresh the exclusions list for this wiki before checking.
            self._exclusions_db.sync(self.site.name)

            def exclude(u):
                # Predicate: should URL *u* be skipped as a known exclusion?
                return self._exclusions_db.check(self.site.name, u)

            parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(self)
        else:
            exclude = None

        workspace = CopyvioWorkspace(
            article,
            min_confidence,
            max_time,
            self._logger,
            self._addheaders,
            short_circuit=short_circuit,
            parser_args=parser_args,
            exclude_check=exclude,
            config=self._search_config,
        )

        if article.size < 20:  # Auto-fail very small articles
            result = workspace.get_result()
            self._logger.info(result.get_log_message(self.title))
            return result

        # Check URLs found directly in the page's wikitext, unless disabled.
        if not no_links:
            workspace.enqueue(parser.get_links())
        num_queries = 0
        if not no_searches:
            chunks = parser.chunk(max_queries)
            for chunk in chunks:
                if short_circuit and workspace.finished:
                    # A worker already found a confident match; remaining
                    # chunks are skipped, so note that a better match might
                    # have been missed.
                    workspace.possible_miss = True
                    break
                log = "[[{0}]] -> querying {1} for {2!r}"
                self._logger.debug(log.format(self.title, searcher.name, chunk))
                workspace.enqueue(searcher.search(chunk))
                num_queries += 1
                sleep(1)  # Rate-limit consecutive search-engine queries.

        # Block until all queued URLs have been fetched and compared (or the
        # time limit / short-circuit ends the check).
        workspace.wait()
        result = workspace.get_result(num_queries)
        self._logger.info(result.get_log_message(self.title))
        return result

    def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
        """Check the page like :py:meth:`copyvio_check` against a specific URL.

        This is essentially a reduced version of :meth:`copyvio_check` - a
        copyvio comparison is made using Markov chains and the result is
        returned in a :class:`.CopyvioCheckResult` object - but without using a
        search engine, since the suspected "violated" URL is supplied from the
        start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Tool Labs site. After a
        search is done, the resulting URL is stored in a cache for 72 hours so
        future checks against that page will not require another set of
        time-and-money-consuming search engine queries. However, the comparison
        itself (which includes the article's and the source's content) cannot
        be stored for data retention reasons, so a fresh comparison is made
        using this function.

        Since no searching is done, neither :exc:`.UnknownSearchEngineError`
        nor :exc:`.SearchQueryError` will be raised.
        """
        log = "Starting copyvio compare for [[{0}]] against {1}"
        self._logger.info(log.format(self.title, url))
        article = MarkovChain(ArticleTextParser(self.get()).strip())
        workspace = CopyvioWorkspace(
            article,
            min_confidence,
            max_time,
            self._logger,
            self._addheaders,
            # NOTE(review): *max_time* is passed twice — once as the overall
            # time limit above and again here as the next positional argument.
            # Presumably this second one fills CopyvioWorkspace's per-URL
            # timeout slot; confirm against the CopyvioWorkspace signature.
            max_time,
            num_workers=1,
            config=self._search_config,
        )
        # Only one URL to fetch and compare; a single worker suffices.
        workspace.enqueue([url])
        workspace.wait()
        result = workspace.get_result()
        self._logger.info(result.get_log_message(self.title))
        return result