@@ -162,8 +162,8 @@ class CopyvioMixIn(object): | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): | |||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||
This is essentially a reduced version of :meth:`copyvio_check` - a | |||
copyvio comparison is made using Markov chains and the result is | |||
@@ -183,13 +183,16 @@ class CopyvioMixIn(object): | |||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | |||
nor :exc:`.SearchQueryError` will be raised. | |||
""" | |||
if not isinstance(urls, list): | |||
urls = [urls] | |||
log = u"Starting copyvio compare for [[{0}]] against {1}" | |||
self._logger.info(log.format(self.title, url)) | |||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||
workspace = CopyvioWorkspace( | |||
article, min_confidence, max_time, self._logger, self._addheaders, | |||
max_time, num_workers=1, config=self._search_config, degree=degree) | |||
workspace.enqueue([url]) | |||
max_time, num_workers=min(len(urls), 8), short_circuit=False, | |||
config=self._search_config, degree=degree) | |||
workspace.enqueue(urls) | |||
workspace.wait() | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
@@ -94,5 +94,35 @@ class MarkovChainIntersection(MarkovChain): | |||
return res.format(self.size, self.mc1, self.mc2) | |||
class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple chains.

    The union maps every phrase found in any input chain to the *sum* of its
    counts across all input chains. The parent constructor is not invoked;
    ``chains``, ``chain`` and ``size`` are set directly here.
    """

    def __init__(self, chains):
        """Build the union of *chains*, an iterable of chain objects.

        Each element must expose a ``chain`` mapping of phrase -> count.
        """
        # Materialize the iterable up front: it may be a one-shot generator,
        # and the inputs are needed again by _build(), __repr__ and __str__.
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably inherited
        # from MarkovChain.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        union = {}
        for chain in self.chains:
            for phrase, count in chain.chain.iteritems():
                # Counts for a phrase shared by several chains are added.
                union[phrase] = union.get(phrase, 0) + count
        return union

    def __repr__(self):
        """Return the canonical string representation of the union."""
        res = "MarkovChainUnion(chains={!r})"
        return res.format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        res = "<MarkovChainUnion of size {} ({})>"
        members = " | ".join(str(chain) for chain in self.chains)
        return res.format(self.size, members)
EMPTY = MarkovChain("") | |||
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) |
@@ -265,7 +265,7 @@ class _HTMLParser(_BaseTextParser): | |||
for element in soup.find_all(tag): | |||
element.extract() | |||
return "\n".join(soup.stripped_strings) | |||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | |||
def _open(self, url, **kwargs): | |||
"""Try to read a URL. Return None if it couldn't be read.""" | |||
@@ -22,6 +22,7 @@ | |||
from threading import Event | |||
from time import time | |||
import urlparse | |||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | |||
@@ -77,6 +78,11 @@ class CopyvioSource(object): | |||
res = "<CopyvioSource ({0} with {1} conf)>" | |||
return res.format(self.url, self.confidence) | |||
@property
def domain(self):
    """The network location (netloc) parsed from the source URL, or None.

    None is returned whenever parsing ``self.url`` yields an empty netloc.
    """
    netloc = urlparse.urlparse(self.url).netloc
    if not netloc:
        return None
    return netloc
def start_work(self): | |||
"""Mark this source as being worked on right now.""" | |||
self._event2.clear() | |||
@@ -130,13 +136,16 @@ class CopyvioCheckResult(object): | |||
""" | |||
def __init__(self, violation, sources, queries, check_time, article_chain,
             possible_miss, included_sources=None, unified_confidence=None):
    """Store the outcome of a check on the new result object.

    *sources* must be a list (enforced with an assertion). *check_time* is
    exposed as the ``time`` attribute; every other argument is stored under
    its own name. A missing or empty *included_sources* becomes a fresh
    empty list, and *unified_confidence* may be left as None.
    """
    assert isinstance(sources, list)

    self.violation = violation
    self.sources = sources
    self.queries = queries
    self.time = check_time
    self.article_chain = article_chain
    self.possible_miss = possible_miss
    self.unified_confidence = unified_confidence
    # Falsy values (None, empty list) are normalized to a new empty list.
    self.included_sources = included_sources or []
def __repr__(self): | |||
"""Return the canonical string representation of the result.""" | |||
@@ -157,7 +166,11 @@ class CopyvioCheckResult(object): | |||
@property
def confidence(self):
    """The overall confidence of this result.

    The unified confidence takes precedence whenever it is set (anything
    other than None, including 0.0). Otherwise this is the confidence of
    the best source, or 0.0 if there is no best source.
    """
    if self.unified_confidence is not None:
        return self.unified_confidence
    if self.best:
        return self.best.confidence
    return 0.0
@property | |||
def url(self): | |||
@@ -39,7 +39,7 @@ import urlparse | |||
from earwigbot import importer | |||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection, MarkovChainUnion | |||
from earwigbot.wiki.copyvios.parsers import get_parser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
@@ -47,6 +47,8 @@ tldextract = importer.new("tldextract") | |||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||
INCLUDE_THRESHOLD = 0.15 | |||
_MAX_REDIRECTS = 3 | |||
_MAX_RAW_SIZE = 20 * 1024 ** 2 | |||
@@ -476,14 +478,21 @@ class CopyvioWorkspace(object): | |||
def get_result(self, num_queries=0):
    """Return a CopyvioCheckResult containing the results of this check.

    ``self.sources`` is sorted in place, best first. Sources whose
    confidence is at least INCLUDE_THRESHOLD are "included": the union of
    their first chains is intersected with the article chain and scored
    with ``_calculate_confidence`` to produce a unified confidence. When no
    source qualifies, the unified confidence is None.
    """
    def rank(source):
        # Compared in descending order: higher confidence first, then
        # non-excluded, then non-skipped, then the larger first chain.
        return (
            source.confidence,
            not source.excluded,
            not source.skipped,
            source.chains[0].size,
        )

    self.sources.sort(key=rank, reverse=True)

    included_sources = []
    for source in self.sources:
        if source.confidence >= INCLUDE_THRESHOLD:
            included_sources.append(source)

    unified_confidence = None
    if included_sources:
        unified = MarkovChainUnion(src.chains[0] for src in included_sources)
        delta = MarkovChainIntersection(self._article, unified)
        unified_confidence = self._calculate_confidence(delta)

    return CopyvioCheckResult(
        self.finished, self.sources, num_queries,
        time.time() - self._start_time, self._article,
        self.possible_miss, included_sources, unified_confidence)