@@ -81,7 +81,7 @@ class CopyvioMixIn(object): | |||||
return klass(credentials, opener) | return klass(credentials, opener) | ||||
def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, | def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, | ||||
no_searches=False, no_links=False, short_circuit=True): | |||||
no_searches=False, no_links=False, short_circuit=True, degree=5): | |||||
"""Check the page for copyright violations. | """Check the page for copyright violations. | ||||
Returns a :class:`.CopyvioCheckResult` object with information on the | Returns a :class:`.CopyvioCheckResult` object with information on the | ||||
@@ -121,7 +121,7 @@ class CopyvioMixIn(object): | |||||
"nltk_dir": self._search_config["nltk_dir"], | "nltk_dir": self._search_config["nltk_dir"], | ||||
"lang": self._site.lang | "lang": self._site.lang | ||||
}) | }) | ||||
article = MarkovChain(parser.strip()) | |||||
article = MarkovChain(parser.strip(), degree=degree) | |||||
parser_args = {} | parser_args = {} | ||||
if self._exclusions_db: | if self._exclusions_db: | ||||
@@ -135,7 +135,7 @@ class CopyvioMixIn(object): | |||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, min_confidence, max_time, self._logger, self._addheaders, | article, min_confidence, max_time, self._logger, self._addheaders, | ||||
short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude, | short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude, | ||||
config=self._search_config) | |||||
config=self._search_config, degree=degree) | |||||
if article.size < 20: # Auto-fail very small articles | if article.size < 20: # Auto-fail very small articles | ||||
result = workspace.get_result() | result = workspace.get_result() | ||||
@@ -162,7 +162,7 @@ class CopyvioMixIn(object): | |||||
self._logger.info(result.get_log_message(self.title)) | self._logger.info(result.get_log_message(self.title)) | ||||
return result | return result | ||||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30): | |||||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | """Check the page like :py:meth:`copyvio_check` against a specific URL. | ||||
This is essentially a reduced version of :meth:`copyvio_check` - a | This is essentially a reduced version of :meth:`copyvio_check` - a | ||||
@@ -185,10 +185,10 @@ class CopyvioMixIn(object): | |||||
""" | """ | ||||
log = u"Starting copyvio compare for [[{0}]] against {1}" | log = u"Starting copyvio compare for [[{0}]] against {1}" | ||||
self._logger.info(log.format(self.title, url)) | self._logger.info(log.format(self.title, url)) | ||||
article = MarkovChain(ArticleTextParser(self.get()).strip()) | |||||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, min_confidence, max_time, self._logger, self._addheaders, | article, min_confidence, max_time, self._logger, self._addheaders, | ||||
max_time, num_workers=1, config=self._search_config) | |||||
max_time, num_workers=1, config=self._search_config, degree=degree) | |||||
workspace.enqueue([url]) | workspace.enqueue([url]) | ||||
workspace.wait() | workspace.wait() | ||||
result = workspace.get_result() | result = workspace.get_result() | ||||
@@ -29,10 +29,10 @@ class MarkovChain(object): | |||||
"""Implements a basic ngram Markov chain of words.""" | """Implements a basic ngram Markov chain of words.""" | ||||
START = -1 | START = -1 | ||||
END = -2 | END = -2 | ||||
degree = 5 # 2 for bigrams, 3 for trigrams, etc. | |||||
def __init__(self, text): | |||||
def __init__(self, text, degree=5): | |||||
self.text = text | self.text = text | ||||
self.degree = degree # 2 for bigrams, 3 for trigrams, etc. | |||||
self.chain = self._build() | self.chain = self._build() | ||||
self.size = self._get_size() | self.size = self._get_size() | ||||
@@ -28,6 +28,8 @@ from StringIO import StringIO | |||||
from urllib import quote, urlencode | from urllib import quote, urlencode | ||||
from urllib2 import URLError | from urllib2 import URLError | ||||
import ssl | |||||
from earwigbot import importer | from earwigbot import importer | ||||
from earwigbot.exceptions import SearchQueryError | from earwigbot.exceptions import SearchQueryError | ||||
@@ -58,6 +60,7 @@ class _BaseSearchEngine(object): | |||||
def _open(self, *args): | def _open(self, *args): | ||||
"""Open a URL (like urlopen) and try to return its contents.""" | """Open a URL (like urlopen) and try to return its contents.""" | ||||
try: | try: | ||||
ssl._create_default_https_context = ssl._create_unverified_context | |||||
response = self.opener.open(*args) | response = self.opener.open(*args) | ||||
result = response.read() | result = response.read() | ||||
except (URLError, error) as exc: | except (URLError, error) as exc: | ||||
@@ -307,7 +307,7 @@ class _CopyvioWorker(object): | |||||
source.skip() | source.skip() | ||||
source.finish_work() | source.finish_work() | ||||
else: | else: | ||||
chain = MarkovChain(text) if text else None | |||||
chain = MarkovChain(text, degree=source.workspace._degree) if text else None | |||||
source.workspace.compare(source, chain) | source.workspace.compare(source, chain) | ||||
return True | return True | ||||
@@ -338,7 +338,7 @@ class CopyvioWorkspace(object): | |||||
def __init__(self, article, min_confidence, max_time, logger, headers, | def __init__(self, article, min_confidence, max_time, logger, headers, | ||||
url_timeout=5, num_workers=8, short_circuit=True, | url_timeout=5, num_workers=8, short_circuit=True, | ||||
parser_args=None, exclude_check=None, config=None): | |||||
parser_args=None, exclude_check=None, config=None, degree=5): | |||||
self.sources = [] | self.sources = [] | ||||
self.finished = False | self.finished = False | ||||
self.possible_miss = False | self.possible_miss = False | ||||
@@ -355,6 +355,7 @@ class CopyvioWorkspace(object): | |||||
"workspace": self, "headers": headers, "timeout": url_timeout, | "workspace": self, "headers": headers, "timeout": url_timeout, | ||||
"parser_args": parser_args, "search_config": config} | "parser_args": parser_args, "search_config": config} | ||||
self._exclude_check = exclude_check | self._exclude_check = exclude_check | ||||
self._degree = degree | |||||
if _is_globalized: | if _is_globalized: | ||||
self._queues = _global_queues | self._queues = _global_queues | ||||
@@ -157,6 +157,7 @@ class Site(object): | |||||
user_agent = constants.USER_AGENT # Set default UA | user_agent = constants.USER_AGENT # Set default UA | ||||
self._oauth = oauth | self._oauth = oauth | ||||
self._session = requests.Session() | self._session = requests.Session() | ||||
self._session.verify = False # XXX | |||||
self._session.cookies = self._cookiejar | self._session.cookies = self._cookiejar | ||||
self._session.headers["User-Agent"] = user_agent | self._session.headers["User-Agent"] = user_agent | ||||
if oauth: | if oauth: | ||||