A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

219 líneas
7.0 KiB

  1. # Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. from gzip import GzipFile
  21. from io import StringIO
  22. from json import loads
  23. from re import sub as re_sub
  24. from urllib.error import URLError
  25. from urllib.parse import urlencode
  26. from earwigbot import importer
  27. from earwigbot.exceptions import SearchQueryError
  28. lxml = importer.new("lxml")
  29. __all__ = [
  30. "BingSearchEngine",
  31. "GoogleSearchEngine",
  32. "YandexSearchEngine",
  33. "SEARCH_ENGINES",
  34. ]
  35. class _BaseSearchEngine:
  36. """Base class for a simple search engine interface."""
  37. name = "Base"
  38. def __init__(self, cred, opener):
  39. """Store credentials (*cred*) and *opener* for searching later on."""
  40. self.cred = cred
  41. self.opener = opener
  42. self.count = 5
  43. def __repr__(self):
  44. """Return the canonical string representation of the search engine."""
  45. return f"{self.__class__.__name__}()"
  46. def __str__(self):
  47. """Return a nice string representation of the search engine."""
  48. return f"<{self.__class__.__name__}>"
  49. def _open(self, *args):
  50. """Open a URL (like urlopen) and try to return its contents."""
  51. try:
  52. response = self.opener.open(*args)
  53. result = response.read()
  54. except (OSError, URLError) as exc:
  55. err = SearchQueryError(f"{self.name} Error: {exc}")
  56. err.cause = exc
  57. raise err
  58. if response.headers.get("Content-Encoding") == "gzip":
  59. stream = StringIO(result)
  60. gzipper = GzipFile(fileobj=stream)
  61. result = gzipper.read()
  62. code = response.getcode()
  63. if code != 200:
  64. err = "{0} Error: got response code '{1}':\n{2}'"
  65. raise SearchQueryError(err.format(self.name, code, result))
  66. return result
  67. @staticmethod
  68. def requirements():
  69. """Return a list of packages required by this search engine."""
  70. return []
  71. def search(self, query):
  72. """Use this engine to search for *query*.
  73. Not implemented in this base class; overridden in subclasses.
  74. """
  75. raise NotImplementedError()
  76. class BingSearchEngine(_BaseSearchEngine):
  77. """A search engine interface with Bing Search (via Azure Marketplace)."""
  78. name = "Bing"
  79. def __init__(self, cred, opener):
  80. super().__init__(cred, opener)
  81. key = self.cred["key"]
  82. auth = (key + ":" + key).encode("base64").replace("\n", "")
  83. self.opener.addheaders.append(("Authorization", "Basic " + auth))
  84. def search(self, query):
  85. """Do a Bing web search for *query*.
  86. Returns a list of URLs ranked by relevance (as determined by Bing).
  87. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
  88. """
  89. service = "SearchWeb" if self.cred["type"] == "searchweb" else "Search"
  90. url = f"https://api.datamarket.azure.com/Bing/{service}/Web?"
  91. params = {
  92. "$format": "json",
  93. "$top": str(self.count),
  94. "Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'",
  95. "Market": "'en-US'",
  96. "Adult": "'Off'",
  97. "Options": "'DisableLocationDetection'",
  98. "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'",
  99. }
  100. result = self._open(url + urlencode(params))
  101. try:
  102. res = loads(result)
  103. except ValueError:
  104. err = "Bing Error: JSON could not be decoded"
  105. raise SearchQueryError(err)
  106. try:
  107. results = res["d"]["results"]
  108. except KeyError:
  109. return []
  110. return [result["Url"] for result in results]
  111. class GoogleSearchEngine(_BaseSearchEngine):
  112. """A search engine interface with Google Search."""
  113. name = "Google"
  114. def search(self, query):
  115. """Do a Google web search for *query*.
  116. Returns a list of URLs ranked by relevance (as determined by Google).
  117. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
  118. """
  119. domain = self.cred.get("proxy", "www.googleapis.com")
  120. url = f"https://{domain}/customsearch/v1?"
  121. params = {
  122. "cx": self.cred["id"],
  123. "key": self.cred["key"],
  124. "q": '"' + query.replace('"', "").encode("utf8") + '"',
  125. "alt": "json",
  126. "num": str(self.count),
  127. "safe": "off",
  128. "fields": "items(link)",
  129. }
  130. result = self._open(url + urlencode(params))
  131. try:
  132. res = loads(result)
  133. except ValueError:
  134. err = "Google Error: JSON could not be decoded"
  135. raise SearchQueryError(err)
  136. try:
  137. return [item["link"] for item in res["items"]]
  138. except KeyError:
  139. return []
  140. class YandexSearchEngine(_BaseSearchEngine):
  141. """A search engine interface with Yandex Search."""
  142. name = "Yandex"
  143. @staticmethod
  144. def requirements():
  145. return ["lxml.etree"]
  146. def search(self, query):
  147. """Do a Yandex web search for *query*.
  148. Returns a list of URLs ranked by relevance (as determined by Yandex).
  149. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
  150. """
  151. domain = self.cred.get("proxy", "yandex.com")
  152. url = f"https://{domain}/search/xml?"
  153. query = re_sub(r"[^a-zA-Z0-9 ]", "", query).encode("utf8")
  154. params = {
  155. "user": self.cred["user"],
  156. "key": self.cred["key"],
  157. "query": '"' + query + '"',
  158. "l10n": "en",
  159. "filter": "none",
  160. "maxpassages": "1",
  161. "groupby": f"mode=flat.groups-on-page={self.count}",
  162. }
  163. result = self._open(url + urlencode(params))
  164. try:
  165. data = lxml.etree.fromstring(result)
  166. return [elem.text for elem in data.xpath(".//url")]
  167. except lxml.etree.Error as exc:
  168. raise SearchQueryError("Yandex XML parse error: " + str(exc))
  169. SEARCH_ENGINES = {
  170. "Bing": BingSearchEngine,
  171. "Google": GoogleSearchEngine,
  172. "Yandex": YandexSearchEngine,
  173. }