Browse Source

Add degree support

legacy-python2
Ben Kurtovic 3 months ago
committed by Ben Kurtovic
parent
commit
f5c0de7c94
5 changed files with 15 additions and 10 deletions
  1. +6
    -6
      earwigbot/wiki/copyvios/__init__.py
  2. +2
    -2
      earwigbot/wiki/copyvios/markov.py
  3. +3
    -0
      earwigbot/wiki/copyvios/search.py
  4. +3
    -2
      earwigbot/wiki/copyvios/workers.py
  5. +1
    -0
      earwigbot/wiki/site.py

+ 6
- 6
earwigbot/wiki/copyvios/__init__.py View File

@@ -81,7 +81,7 @@ class CopyvioMixIn(object):
return klass(credentials, opener) return klass(credentials, opener)


def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
no_searches=False, no_links=False, short_circuit=True):
no_searches=False, no_links=False, short_circuit=True, degree=5):
"""Check the page for copyright violations. """Check the page for copyright violations.


Returns a :class:`.CopyvioCheckResult` object with information on the Returns a :class:`.CopyvioCheckResult` object with information on the
@@ -121,7 +121,7 @@ class CopyvioMixIn(object):
"nltk_dir": self._search_config["nltk_dir"], "nltk_dir": self._search_config["nltk_dir"],
"lang": self._site.lang "lang": self._site.lang
}) })
article = MarkovChain(parser.strip())
article = MarkovChain(parser.strip(), degree=degree)
parser_args = {} parser_args = {}


if self._exclusions_db: if self._exclusions_db:
@@ -135,7 +135,7 @@ class CopyvioMixIn(object):
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders, article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude, short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude,
config=self._search_config)
config=self._search_config, degree=degree)


if article.size < 20: # Auto-fail very small articles if article.size < 20: # Auto-fail very small articles
result = workspace.get_result() result = workspace.get_result()
@@ -162,7 +162,7 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL. """Check the page like :py:meth:`copyvio_check` against a specific URL.


This is essentially a reduced version of :meth:`copyvio_check` - a This is essentially a reduced version of :meth:`copyvio_check` - a
@@ -185,10 +185,10 @@ class CopyvioMixIn(object):
""" """
log = u"Starting copyvio compare for [[{0}]] against {1}" log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url)) self._logger.info(log.format(self.title, url))
article = MarkovChain(ArticleTextParser(self.get()).strip())
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders, article, min_confidence, max_time, self._logger, self._addheaders,
max_time, num_workers=1, config=self._search_config)
max_time, num_workers=1, config=self._search_config, degree=degree)
workspace.enqueue([url]) workspace.enqueue([url])
workspace.wait() workspace.wait()
result = workspace.get_result() result = workspace.get_result()


+ 2
- 2
earwigbot/wiki/copyvios/markov.py View File

@@ -29,10 +29,10 @@ class MarkovChain(object):
"""Implements a basic ngram Markov chain of words.""" """Implements a basic ngram Markov chain of words."""
START = -1 START = -1
END = -2 END = -2
degree = 5 # 2 for bigrams, 3 for trigrams, etc.


def __init__(self, text):
def __init__(self, text, degree=5):
self.text = text self.text = text
self.degree = degree # 2 for bigrams, 3 for trigrams, etc.
self.chain = self._build() self.chain = self._build()
self.size = self._get_size() self.size = self._get_size()




+ 3
- 0
earwigbot/wiki/copyvios/search.py View File

@@ -28,6 +28,8 @@ from StringIO import StringIO
from urllib import quote, urlencode from urllib import quote, urlencode
from urllib2 import URLError from urllib2 import URLError


import ssl

from earwigbot import importer from earwigbot import importer
from earwigbot.exceptions import SearchQueryError from earwigbot.exceptions import SearchQueryError


@@ -58,6 +60,7 @@ class _BaseSearchEngine(object):
def _open(self, *args): def _open(self, *args):
"""Open a URL (like urlopen) and try to return its contents.""" """Open a URL (like urlopen) and try to return its contents."""
try: try:
ssl._create_default_https_context = ssl._create_unverified_context
response = self.opener.open(*args) response = self.opener.open(*args)
result = response.read() result = response.read()
except (URLError, error) as exc: except (URLError, error) as exc:


+ 3
- 2
earwigbot/wiki/copyvios/workers.py View File

@@ -307,7 +307,7 @@ class _CopyvioWorker(object):
source.skip() source.skip()
source.finish_work() source.finish_work()
else: else:
chain = MarkovChain(text) if text else None
chain = MarkovChain(text, degree=source.workspace._degree) if text else None
source.workspace.compare(source, chain) source.workspace.compare(source, chain)
return True return True


@@ -338,7 +338,7 @@ class CopyvioWorkspace(object):


def __init__(self, article, min_confidence, max_time, logger, headers, def __init__(self, article, min_confidence, max_time, logger, headers,
url_timeout=5, num_workers=8, short_circuit=True, url_timeout=5, num_workers=8, short_circuit=True,
parser_args=None, exclude_check=None, config=None):
parser_args=None, exclude_check=None, config=None, degree=5):
self.sources = [] self.sources = []
self.finished = False self.finished = False
self.possible_miss = False self.possible_miss = False
@@ -355,6 +355,7 @@ class CopyvioWorkspace(object):
"workspace": self, "headers": headers, "timeout": url_timeout, "workspace": self, "headers": headers, "timeout": url_timeout,
"parser_args": parser_args, "search_config": config} "parser_args": parser_args, "search_config": config}
self._exclude_check = exclude_check self._exclude_check = exclude_check
self._degree = degree


if _is_globalized: if _is_globalized:
self._queues = _global_queues self._queues = _global_queues


+ 1
- 0
earwigbot/wiki/site.py View File

@@ -157,6 +157,7 @@ class Site(object):
user_agent = constants.USER_AGENT # Set default UA user_agent = constants.USER_AGENT # Set default UA
self._oauth = oauth self._oauth = oauth
self._session = requests.Session() self._session = requests.Session()
self._session.verify = False # XXX
self._session.cookies = self._cookiejar self._session.cookies = self._cookiejar
self._session.headers["User-Agent"] = user_agent self._session.headers["User-Agent"] = user_agent
if oauth: if oauth:


Loading…
Cancel
Save