@@ -180,8 +180,8 @@ class CopyvioMixIn: | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): | |||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||
This is essentially a reduced version of :meth:`copyvio_check` - a | |||
copyvio comparison is made using Markov chains and the result is | |||
@@ -201,9 +201,11 @@ class CopyvioMixIn: | |||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | |||
nor :exc:`.SearchQueryError` will be raised. | |||
""" | |||
if not isinstance(urls, list): | |||
urls = [urls] | |||
log = "Starting copyvio compare for [[{0}]] against {1}" | |||
self._logger.info(log.format(self.title, url)) | |||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5) | |||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||
workspace = CopyvioWorkspace( | |||
article, | |||
min_confidence, | |||
@@ -211,11 +213,12 @@ class CopyvioMixIn: | |||
self._logger, | |||
self._addheaders, | |||
max_time, | |||
num_workers=1, | |||
num_workers=min(len(urls), 8), | |||
short_circuit=False, | |||
config=self._search_config, | |||
degree=degree, | |||
) | |||
workspace.enqueue([url]) | |||
workspace.enqueue(urls) | |||
workspace.wait() | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
@@ -93,5 +93,35 @@ class MarkovChainIntersection(MarkovChain): | |||
return res.format(self.size, self.mc1, self.mc2) | |||
class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple chains.

    The union maps every phrase that occurs in any of the input chains to
    the sum of that phrase's counts across all of the input chains.
    """

    def __init__(self, chains):
        """Build the union of *chains*, an iterable of chain objects.

        The iterable is copied into a list first, so a generator may be
        passed. The parent initializer is not invoked; the attributes are
        set directly from the input chains instead.
        """
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably it is
        # inherited from MarkovChain and derives the size from self.chain.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        union = {}
        for chain in self.chains:
            # items() instead of iteritems(): the latter only exists on
            # Python 2 dicts, while items() works on both major versions.
            for phrase, count in chain.chain.items():
                union[phrase] = union.get(phrase, 0) + count
        return union

    def __repr__(self):
        """Return the canonical string representation of the union."""
        res = "MarkovChainUnion(chains={!r})"
        return res.format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        res = "<MarkovChainUnion of size {} ({})>"
        return res.format(self.size, "| ".join(str(chain) for chain in self.chains))
EMPTY = MarkovChain("") | |||
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) |
@@ -273,7 +273,7 @@ class _HTMLParser(_BaseTextParser): | |||
for element in soup.find_all(tag): | |||
element.extract() | |||
return "\n".join(soup.stripped_strings) | |||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | |||
def _open(self, url, **kwargs): | |||
"""Try to read a URL. Return None if it couldn't be read.""" | |||
@@ -21,6 +21,8 @@ | |||
from threading import Event | |||
from time import time | |||
import urlparse | |||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | |||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||
@@ -84,6 +86,11 @@ class CopyvioSource: | |||
res = "<CopyvioSource ({0} with {1} conf)>" | |||
return res.format(self.url, self.confidence) | |||
@property
def domain(self):
    """Return the network location of this source's URL, or None if empty."""
    netloc = urlparse.urlparse(self.url).netloc
    if not netloc:
        return None
    return netloc
def start_work(self): | |||
"""Mark this source as being worked on right now.""" | |||
self._event2.clear() | |||
@@ -137,14 +144,25 @@ class CopyvioCheckResult: | |||
""" | |||
def __init__(
    self,
    violation,
    sources,
    queries,
    check_time,
    article_chain,
    possible_miss,
    included_sources=None,
    unified_confidence=None,
):
    """Store the outcome of a check on the new result object.

    *sources* must be a list; this is enforced with an assertion.
    *check_time* is stored as the ``time`` attribute. A missing or empty
    *included_sources* is replaced with a fresh empty list, and
    *unified_confidence* is stored unchanged (``None`` when not given).
    """
    assert isinstance(sources, list)
    if not included_sources:
        # Fall back to a new list so that no default is shared across results.
        included_sources = []

    self.violation = violation
    self.sources = sources
    self.queries = queries
    self.time = check_time
    self.article_chain = article_chain
    self.possible_miss = possible_miss
    self.included_sources = included_sources
    self.unified_confidence = unified_confidence
def __repr__(self): | |||
"""Return the canonical string representation of the result.""" | |||
@@ -164,7 +182,13 @@ class CopyvioCheckResult: | |||
@property
def confidence(self):
    """The unified confidence when one is set; otherwise the confidence of
    the best source, or 0.0 if there is no best source."""
    if self.unified_confidence is not None:
        return self.unified_confidence
    if self.best:
        return self.best.confidence
    return 0.0
@property | |||
def url(self): | |||
@@ -37,7 +37,11 @@ from urllib.request import Request, build_opener | |||
from earwigbot import importer | |||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.markov import ( | |||
MarkovChain, | |||
MarkovChainIntersection, | |||
MarkovChainUnion, | |||
) | |||
from earwigbot.wiki.copyvios.parsers import get_parser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
@@ -45,6 +49,8 @@ tldextract = importer.new("tldextract") | |||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||
INCLUDE_THRESHOLD = 0.15 | |||
_MAX_REDIRECTS = 3 | |||
_MAX_RAW_SIZE = 20 * 1024**2 | |||
@@ -501,15 +507,26 @@ class CopyvioWorkspace: | |||
def get_result(self, num_queries=0): | |||
"""Return a CopyvioCheckResult containing the results of this check.""" | |||
self.sources.sort( | |||
key=lambda s: ( | |||
s.confidence, | |||
not s.excluded, | |||
not s.skipped, | |||
s.chains[0].size, | |||
), | |||
reverse=True, | |||
) | |||
def cmpfunc(s1, s2): | |||
if s2.confidence != s1.confidence: | |||
return 1 if s2.confidence > s1.confidence else -1 | |||
if s2.excluded != s1.excluded: | |||
return 1 if s1.excluded else -1 | |||
return int(s1.skipped) - int(s2.skipped) | |||
included_sources = [ | |||
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD | |||
] | |||
if included_sources: | |||
unified = MarkovChainUnion(source.chains[0] for source in included_sources) | |||
delta = MarkovChainIntersection(self._article, unified) | |||
unified_confidence = self._calculate_confidence(delta) | |||
else: | |||
unified_confidence = None | |||
self.sources.sort(cmpfunc) | |||
return CopyvioCheckResult( | |||
self.finished, | |||
self.sources, | |||
@@ -517,4 +534,6 @@ class CopyvioWorkspace: | |||
time.time() - self._start_time, | |||
self._article, | |||
self.possible_miss, | |||
included_sources, | |||
unified_confidence, | |||
) |