From b3ef06c0f50de9e5b7e2098447e8f0838b962090 Mon Sep 17 00:00:00 2001
From: TheresNoTime
Date: Thu, 23 Mar 2023 17:01:31 +0000
Subject: [PATCH] Add DuckDuckGo search engine, add `extra_deps`

---
Review notes (ignored by `git am`):
- duckduckgo_search is loaded through the lazy importer, like lxml, because
  it is only listed under the optional `extra_deps` in setup.py.
- DDGSearchEngine.search() tolerates a None result from ddg() and skips
  individual results without an "href" instead of discarding all of them.

 earwigbot/wiki/copyvios/search.py | 23 +++++++++++++++++++++--
 setup.py                          |  1 +
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py
index dedffd8..c98ad91 100644
--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -27,13 +27,14 @@ from socket import error
 from io import StringIO
 from urllib.parse import quote, urlencode
 from urllib.error import URLError
 
 from earwigbot import importer
 from earwigbot.exceptions import SearchQueryError
 
 lxml = importer.new("lxml")
+duckduckgo_search = importer.new("duckduckgo_search")
 
-__all__ = ["BingSearchEngine", "GoogleSearchEngine", "YandexSearchEngine", "SEARCH_ENGINES"]
+__all__ = ["BingSearchEngine", "GoogleSearchEngine", "YandexSearchEngine", "DDGSearchEngine", "SEARCH_ENGINES"]
 
 class _BaseSearchEngine:
     """Base class for a simple search engine interface."""
@@ -203,9 +204,27 @@ class YandexSearchEngine(_BaseSearchEngine):
         except lxml.etree.Error as exc:
             raise SearchQueryError("Yandex XML parse error: " + str(exc))
 
+class DDGSearchEngine(_BaseSearchEngine):
+    """A search engine interface with DuckDuckGo."""
+    name = "DDG"
+
+    def search(self, query):
+        """Do a DuckDuckGo web search for *query*.
+
+        Returns a list of URLs ranked by relevance (as determined by
+        DuckDuckGo), or an empty list if there are no usable results.
+        """
+        # Guard against ddg() handing back None instead of a list.
+        results = duckduckgo_search.ddg(
+            query, safesearch='Off', time='y', max_results=200) or []
+
+        # Skip entries without a URL rather than dropping every result.
+        return [item["href"] for item in results if item.get("href")]
+
 
 SEARCH_ENGINES = {
     "Bing": BingSearchEngine,
     "Google": GoogleSearchEngine,
-    "Yandex": YandexSearchEngine
+    "Yandex": YandexSearchEngine,
+    "DDG": DDGSearchEngine
 }
diff --git a/setup.py b/setup.py
index a928353..2145f30 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@ extra_deps = {
         "nltk >= 3.6.1",  # Parsing sentences to split article content
         "pdfminer >= 20191125",  # Extracting text from PDF files
         "tldextract >= 3.1.0",  # Getting domains for the multithreaded workers
+        "duckduckgo-search == 2.8.5",  # DuckDuckGo search engine
     ],
     "time": [
         "pytz >= 2021.1",  # Handling timezones for the !time IRC command