Browse Source

Add degree support

tags/v0.4
Ben Kurtovic 2 months ago
parent
commit
b67121881f
3 changed files with 11 additions and 6 deletions
  1. src/earwigbot/wiki/copyvios/__init__.py (+6, -3)
  2. src/earwigbot/wiki/copyvios/markov.py (+2, -2)
  3. src/earwigbot/wiki/copyvios/workers.py (+3, -1)

+ 6
- 3
src/earwigbot/wiki/copyvios/__init__.py View File

@@ -88,6 +88,7 @@ class CopyvioMixIn:
  no_searches=False,
  no_links=False,
  short_circuit=True,
+ degree=5,
  ):
  """Check the page for copyright violations.


@@ -128,7 +129,7 @@ class CopyvioMixIn:
  self.get(),
  args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang},
  )
- article = MarkovChain(parser.strip())
+ article = MarkovChain(parser.strip(), degree=degree)
  parser_args = {}


  if self._exclusions_db:
@@ -151,6 +152,7 @@ class CopyvioMixIn:
  parser_args=parser_args,
  exclude_check=exclude,
  config=self._search_config,
+ degree=degree,
  )


  if article.size < 20: # Auto-fail very small articles
@@ -178,7 +180,7 @@ class CopyvioMixIn:
  self._logger.info(result.get_log_message(self.title))
  return result


- def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
+ def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
  """Check the page like :py:meth:`copyvio_check` against a specific URL.


  This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -201,7 +203,7 @@ class CopyvioMixIn:
  """
  log = "Starting copyvio compare for [[{0}]] against {1}"
  self._logger.info(log.format(self.title, url))
- article = MarkovChain(ArticleTextParser(self.get()).strip())
+ article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5)
  workspace = CopyvioWorkspace(
  article,
  min_confidence,
@@ -211,6 +213,7 @@ class CopyvioMixIn:
  max_time,
  num_workers=1,
  config=self._search_config,
+ degree=degree,
  )
  workspace.enqueue([url])
  workspace.wait()


+ 2
- 2
src/earwigbot/wiki/copyvios/markov.py View File

@@ -28,10 +28,10 @@ class MarkovChain:


  START = -1
  END = -2
- degree = 5 # 2 for bigrams, 3 for trigrams, etc.


- def __init__(self, text):
+ def __init__(self, text, degree=5):
  self.text = text
+ self.degree = degree # 2 for bigrams, 3 for trigrams, etc.
  self.chain = self._build()
  self.size = self._get_size()




+ 3
- 1
src/earwigbot/wiki/copyvios/workers.py View File

@@ -310,7 +310,7 @@ class _CopyvioWorker:
  source.skip()
  source.finish_work()
  else:
- chain = MarkovChain(text) if text else None
+ chain = MarkovChain(text, degree=source.workspace._degree) if text else None
  source.workspace.compare(source, chain)
  return True


@@ -352,6 +352,7 @@ class CopyvioWorkspace:
  parser_args=None,
  exclude_check=None,
  config=None,
+ degree=5,
  ):
  self.sources = []
  self.finished = False
@@ -373,6 +374,7 @@ class CopyvioWorkspace:
  "search_config": config,
  }
  self._exclude_check = exclude_check
+ self._degree = degree


  if _is_globalized:
  self._queues = _global_queues


Loading…
Cancel
Save