Browse Source

Add degree support

legacy-python2
Ben Kurtovic 4 months ago
committed by Ben Kurtovic
parent
commit
f5c0de7c94
5 changed files with 15 additions and 10 deletions
  1. +6
    -6
      earwigbot/wiki/copyvios/__init__.py
  2. +2
    -2
      earwigbot/wiki/copyvios/markov.py
  3. +3
    -0
      earwigbot/wiki/copyvios/search.py
  4. +3
    -2
      earwigbot/wiki/copyvios/workers.py
  5. +1
    -0
      earwigbot/wiki/site.py

+ 6
- 6
earwigbot/wiki/copyvios/__init__.py View File

@@ -81,7 +81,7 @@ class CopyvioMixIn(object):
return klass(credentials, opener)

def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
no_searches=False, no_links=False, short_circuit=True):
no_searches=False, no_links=False, short_circuit=True, degree=5):
"""Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -121,7 +121,7 @@ class CopyvioMixIn(object):
"nltk_dir": self._search_config["nltk_dir"],
"lang": self._site.lang
})
article = MarkovChain(parser.strip())
article = MarkovChain(parser.strip(), degree=degree)
parser_args = {}

if self._exclusions_db:
@@ -135,7 +135,7 @@ class CopyvioMixIn(object):
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude,
config=self._search_config)
config=self._search_config, degree=degree)

if article.size < 20: # Auto-fail very small articles
result = workspace.get_result()
@@ -162,7 +162,7 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.

This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -185,10 +185,10 @@ class CopyvioMixIn(object):
"""
log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
article = MarkovChain(ArticleTextParser(self.get()).strip())
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, num_workers=1, config=self._search_config)
max_time, num_workers=1, config=self._search_config, degree=degree)
workspace.enqueue([url])
workspace.wait()
result = workspace.get_result()


+ 2
- 2
earwigbot/wiki/copyvios/markov.py View File

@@ -29,10 +29,10 @@ class MarkovChain(object):
"""Implements a basic ngram Markov chain of words."""
START = -1
END = -2
degree = 5 # 2 for bigrams, 3 for trigrams, etc.

def __init__(self, text):
def __init__(self, text, degree=5):
self.text = text
self.degree = degree # 2 for bigrams, 3 for trigrams, etc.
self.chain = self._build()
self.size = self._get_size()



+ 3
- 0
earwigbot/wiki/copyvios/search.py View File

@@ -28,6 +28,8 @@ from StringIO import StringIO
from urllib import quote, urlencode
from urllib2 import URLError

import ssl

from earwigbot import importer
from earwigbot.exceptions import SearchQueryError

@@ -58,6 +60,7 @@ class _BaseSearchEngine(object):
def _open(self, *args):
"""Open a URL (like urlopen) and try to return its contents."""
try:
ssl._create_default_https_context = ssl._create_unverified_context
response = self.opener.open(*args)
result = response.read()
except (URLError, error) as exc:


+ 3
- 2
earwigbot/wiki/copyvios/workers.py View File

@@ -307,7 +307,7 @@ class _CopyvioWorker(object):
source.skip()
source.finish_work()
else:
chain = MarkovChain(text) if text else None
chain = MarkovChain(text, degree=source.workspace._degree) if text else None
source.workspace.compare(source, chain)
return True

@@ -338,7 +338,7 @@ class CopyvioWorkspace(object):

def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True,
parser_args=None, exclude_check=None, config=None):
parser_args=None, exclude_check=None, config=None, degree=5):
self.sources = []
self.finished = False
self.possible_miss = False
@@ -355,6 +355,7 @@ class CopyvioWorkspace(object):
"workspace": self, "headers": headers, "timeout": url_timeout,
"parser_args": parser_args, "search_config": config}
self._exclude_check = exclude_check
self._degree = degree

if _is_globalized:
self._queues = _global_queues


+ 1
- 0
earwigbot/wiki/site.py View File

@@ -157,6 +157,7 @@ class Site(object):
user_agent = constants.USER_AGENT # Set default UA
self._oauth = oauth
self._session = requests.Session()
self._session.verify = False # XXX
self._session.cookies = self._cookiejar
self._session.headers["User-Agent"] = user_agent
if oauth:


Loading…
Cancel
Save