Make the Markov chain n-gram degree configurable for copyvio checks.

A new `degree` argument (default 5, the previous class-level constant) is
accepted by copyvio_check() and copyvio_compare(), stored on the
CopyvioWorkspace as `_degree`, and used for every MarkovChain that is built:
the article chain in both entry points and each source chain in the worker.
The article and source chains must be built with the same degree for the
comparison to be meaningful, so copyvio_compare() forwards `degree` to the
article chain rather than a literal 5.

diff --git a/src/earwigbot/wiki/copyvios/__init__.py b/src/earwigbot/wiki/copyvios/__init__.py
index 08ba328..b9e1e2e 100644
--- a/src/earwigbot/wiki/copyvios/__init__.py
+++ b/src/earwigbot/wiki/copyvios/__init__.py
@@ -88,6 +88,7 @@ class CopyvioMixIn:
         no_searches=False,
         no_links=False,
         short_circuit=True,
+        degree=5,
     ):
         """Check the page for copyright violations.
 
@@ -128,7 +129,7 @@ class CopyvioMixIn:
             self.get(),
             args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang},
         )
-        article = MarkovChain(parser.strip())
+        article = MarkovChain(parser.strip(), degree=degree)
         parser_args = {}
 
         if self._exclusions_db:
@@ -151,6 +152,7 @@ class CopyvioMixIn:
             parser_args=parser_args,
             exclude_check=exclude,
             config=self._search_config,
+            degree=degree,
         )
 
         if article.size < 20:  # Auto-fail very small articles
@@ -178,7 +180,7 @@ class CopyvioMixIn:
         self._logger.info(result.get_log_message(self.title))
         return result
 
-    def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
+    def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
         """Check the page like :py:meth:`copyvio_check` against a specific URL.
 
         This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -201,7 +203,7 @@ class CopyvioMixIn:
         """
         log = "Starting copyvio compare for [[{0}]] against {1}"
         self._logger.info(log.format(self.title, url))
-        article = MarkovChain(ArticleTextParser(self.get()).strip())
+        article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
         workspace = CopyvioWorkspace(
             article,
             min_confidence,
@@ -211,6 +213,7 @@ class CopyvioMixIn:
             max_time,
             num_workers=1,
             config=self._search_config,
+            degree=degree,
         )
         workspace.enqueue([url])
         workspace.wait()
diff --git a/src/earwigbot/wiki/copyvios/markov.py b/src/earwigbot/wiki/copyvios/markov.py
index d994045..86bf497 100644
--- a/src/earwigbot/wiki/copyvios/markov.py
+++ b/src/earwigbot/wiki/copyvios/markov.py
@@ -28,10 +28,10 @@ class MarkovChain:
 
     START = -1
     END = -2
-    degree = 5  # 2 for bigrams, 3 for trigrams, etc.
 
-    def __init__(self, text):
+    def __init__(self, text, degree=5):
         self.text = text
+        self.degree = degree  # 2 for bigrams, 3 for trigrams, etc.
         self.chain = self._build()
         self.size = self._get_size()
 
diff --git a/src/earwigbot/wiki/copyvios/workers.py b/src/earwigbot/wiki/copyvios/workers.py
index 680b265..789821c 100644
--- a/src/earwigbot/wiki/copyvios/workers.py
+++ b/src/earwigbot/wiki/copyvios/workers.py
@@ -310,7 +310,7 @@ class _CopyvioWorker:
             source.skip()
             source.finish_work()
         else:
-            chain = MarkovChain(text) if text else None
+            chain = MarkovChain(text, degree=source.workspace._degree) if text else None
             source.workspace.compare(source, chain)
         return True
 
@@ -352,6 +352,7 @@ class CopyvioWorkspace:
         parser_args=None,
         exclude_check=None,
         config=None,
+        degree=5,
     ):
         self.sources = []
         self.finished = False
@@ -373,6 +374,7 @@ class CopyvioWorkspace:
             "search_config": config,
         }
         self._exclude_check = exclude_check
+        self._degree = degree
 
         if _is_globalized:
             self._queues = _global_queues