瀏覽代碼

Add degree support

tags/v0.4
Ben Kurtovic 2 月之前
父節點
當前提交
b67121881f
共有 3 個文件被更改,包括 11 次插入和 6 次刪除
  1. +6
    -3
      src/earwigbot/wiki/copyvios/__init__.py
  2. +2
    -2
      src/earwigbot/wiki/copyvios/markov.py
  3. +3
    -1
      src/earwigbot/wiki/copyvios/workers.py

+ 6
- 3
src/earwigbot/wiki/copyvios/__init__.py 查看文件

@@ -88,6 +88,7 @@ class CopyvioMixIn:
no_searches=False,
no_links=False,
short_circuit=True,
degree=5,
):
"""Check the page for copyright violations.

@@ -128,7 +129,7 @@ class CopyvioMixIn:
self.get(),
args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang},
)
article = MarkovChain(parser.strip())
article = MarkovChain(parser.strip(), degree=degree)
parser_args = {}

if self._exclusions_db:
@@ -151,6 +152,7 @@ class CopyvioMixIn:
parser_args=parser_args,
exclude_check=exclude,
config=self._search_config,
degree=degree,
)

if article.size < 20: # Auto-fail very small articles
@@ -178,7 +180,7 @@ class CopyvioMixIn:
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.

This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -201,7 +203,7 @@ class CopyvioMixIn:
"""
log = "Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
article = MarkovChain(ArticleTextParser(self.get()).strip())
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5)
workspace = CopyvioWorkspace(
article,
min_confidence,
@@ -211,6 +213,7 @@ class CopyvioMixIn:
max_time,
num_workers=1,
config=self._search_config,
degree=degree,
)
workspace.enqueue([url])
workspace.wait()


+ 2
- 2
src/earwigbot/wiki/copyvios/markov.py 查看文件

@@ -28,10 +28,10 @@ class MarkovChain:

START = -1
END = -2
degree = 5 # 2 for bigrams, 3 for trigrams, etc.

def __init__(self, text):
def __init__(self, text, degree=5):
self.text = text
self.degree = degree # 2 for bigrams, 3 for trigrams, etc.
self.chain = self._build()
self.size = self._get_size()



+ 3
- 1
src/earwigbot/wiki/copyvios/workers.py 查看文件

@@ -310,7 +310,7 @@ class _CopyvioWorker:
source.skip()
source.finish_work()
else:
chain = MarkovChain(text) if text else None
chain = MarkovChain(text, degree=source.workspace._degree) if text else None
source.workspace.compare(source, chain)
return True

@@ -352,6 +352,7 @@ class CopyvioWorkspace:
parser_args=None,
exclude_check=None,
config=None,
degree=5,
):
self.sources = []
self.finished = False
@@ -373,6 +374,7 @@ class CopyvioWorkspace:
"search_config": config,
}
self._exclude_check = exclude_check
self._degree = degree

if _is_globalized:
self._queues = _global_queues


Loading…
取消
儲存