From f5c0de7c945409b306125a81265692e23f511291 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 20 Aug 2024 01:30:03 +0000 Subject: [PATCH] Add degree support --- earwigbot/wiki/copyvios/__init__.py | 12 ++++++------ earwigbot/wiki/copyvios/markov.py | 4 ++-- earwigbot/wiki/copyvios/search.py | 3 +++ earwigbot/wiki/copyvios/workers.py | 5 +++-- earwigbot/wiki/site.py | 1 + 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 9b1c616..af8cc6f 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -81,7 +81,7 @@ class CopyvioMixIn(object): return klass(credentials, opener) def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, - no_searches=False, no_links=False, short_circuit=True): + no_searches=False, no_links=False, short_circuit=True, degree=5): """Check the page for copyright violations. Returns a :class:`.CopyvioCheckResult` object with information on the @@ -121,7 +121,7 @@ class CopyvioMixIn(object): "nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang }) - article = MarkovChain(parser.strip()) + article = MarkovChain(parser.strip(), degree=degree) parser_args = {} if self._exclusions_db: @@ -135,7 +135,7 @@ class CopyvioMixIn(object): workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude, - config=self._search_config) + config=self._search_config, degree=degree) if article.size < 20: # Auto-fail very small articles result = workspace.get_result() @@ -162,7 +162,7 @@ class CopyvioMixIn(object): self._logger.info(result.get_log_message(self.title)) return result - def copyvio_compare(self, url, min_confidence=0.75, max_time=30): + def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): """Check the page like :py:meth:`copyvio_check` against a specific URL. 
This is essentially a reduced version of :meth:`copyvio_check` - a @@ -185,10 +185,10 @@ class CopyvioMixIn(object): """ log = u"Starting copyvio compare for [[{0}]] against {1}" self._logger.info(log.format(self.title, url)) - article = MarkovChain(ArticleTextParser(self.get()).strip()) + article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, num_workers=1, config=self._search_config) + max_time, num_workers=1, config=self._search_config, degree=degree) workspace.enqueue([url]) workspace.wait() result = workspace.get_result() diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 9a4717d..e3db1a7 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -29,10 +29,10 @@ class MarkovChain(object): """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 - degree = 5 # 2 for bigrams, 3 for trigrams, etc. - def __init__(self, text): + def __init__(self, text, degree=5): self.text = text + self.degree = degree # 2 for bigrams, 3 for trigrams, etc. 
self.chain = self._build() self.size = self._get_size() diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index d05ec82..b5c8e03 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -28,6 +28,8 @@ from StringIO import StringIO from urllib import quote, urlencode from urllib2 import URLError +import ssl + from earwigbot import importer from earwigbot.exceptions import SearchQueryError @@ -58,6 +60,7 @@ class _BaseSearchEngine(object): def _open(self, *args): """Open a URL (like urlopen) and try to return its contents.""" try: + ssl._create_default_https_context = ssl._create_unverified_context response = self.opener.open(*args) result = response.read() except (URLError, error) as exc: diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 5282f24..9ec33eb 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -307,7 +307,7 @@ class _CopyvioWorker(object): source.skip() source.finish_work() else: - chain = MarkovChain(text) if text else None + chain = MarkovChain(text, degree=source.workspace._degree) if text else None source.workspace.compare(source, chain) return True @@ -338,7 +338,7 @@ class CopyvioWorkspace(object): def __init__(self, article, min_confidence, max_time, logger, headers, url_timeout=5, num_workers=8, short_circuit=True, - parser_args=None, exclude_check=None, config=None): + parser_args=None, exclude_check=None, config=None, degree=5): self.sources = [] self.finished = False self.possible_miss = False @@ -355,6 +355,7 @@ class CopyvioWorkspace(object): "workspace": self, "headers": headers, "timeout": url_timeout, "parser_args": parser_args, "search_config": config} self._exclude_check = exclude_check + self._degree = degree if _is_globalized: self._queues = _global_queues diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 94c5a0e..07f39b4 100644 --- a/earwigbot/wiki/site.py +++ 
b/earwigbot/wiki/site.py @@ -157,6 +157,7 @@ class Site(object): user_agent = constants.USER_AGENT # Set default UA self._oauth = oauth self._session = requests.Session() + self._session.verify = False # XXX: disables TLS certificate verification for this session self._session.cookies = self._cookiejar self._session.headers["User-Agent"] = user_agent if oauth: