@@ -162,8 +162,8 @@ class CopyvioMixIn(object): | |||||
self._logger.info(result.get_log_message(self.title)) | self._logger.info(result.get_log_message(self.title)) | ||||
return result | return result | ||||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||||
This is essentially a reduced version of :meth:`copyvio_check` - a | This is essentially a reduced version of :meth:`copyvio_check` - a | ||||
copyivo comparison is made using Markov chains and the result is | copyivo comparison is made using Markov chains and the result is | ||||
@@ -183,13 +183,16 @@ class CopyvioMixIn(object): | |||||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | Since no searching is done, neither :exc:`.UnknownSearchEngineError` | ||||
nor :exc:`.SearchQueryError` will be raised. | nor :exc:`.SearchQueryError` will be raised. | ||||
""" | """ | ||||
if not isinstance(urls, list): | |||||
urls = [urls] | |||||
log = u"Starting copyvio compare for [[{0}]] against {1}" | log = u"Starting copyvio compare for [[{0}]] against {1}" | ||||
self._logger.info(log.format(self.title, url)) | |||||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | ||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, min_confidence, max_time, self._logger, self._addheaders, | article, min_confidence, max_time, self._logger, self._addheaders, | ||||
max_time, num_workers=1, config=self._search_config, degree=degree) | |||||
workspace.enqueue([url]) | |||||
max_time, num_workers=min(len(urls), 8), short_circuit=False, | |||||
config=self._search_config, degree=degree) | |||||
workspace.enqueue(urls) | |||||
workspace.wait() | workspace.wait() | ||||
result = workspace.get_result() | result = workspace.get_result() | ||||
self._logger.info(result.get_log_message(self.title)) | self._logger.info(result.get_log_message(self.title)) | ||||
@@ -94,5 +94,35 @@ class MarkovChainIntersection(MarkovChain): | |||||
return res.format(self.size, self.mc1, self.mc2) | return res.format(self.size, self.mc1, self.mc2) | ||||
class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple Markov chains.

    The union's chain maps every phrase that occurs in any of the input
    chains to the sum of that phrase's counts over all of the inputs.
    """

    def __init__(self, chains):
        # Materialize the input first: it may be a one-shot iterable (such as
        # a generator), and it is walked again by __repr__() and __str__().
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; it is expected to come
        # from the MarkovChain base class.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        merged = {}
        for member in self.chains:
            for phrase, count in member.chain.iteritems():
                try:
                    merged[phrase] += count
                except KeyError:
                    # First time this phrase is seen in any input chain.
                    merged[phrase] = count
        return merged

    def __repr__(self):
        """Return the canonical string representation of the union."""
        return "MarkovChainUnion(chains={!r})".format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        members = "| ".join(str(member) for member in self.chains)
        return "<MarkovChainUnion of size {} ({})>".format(self.size, members)
EMPTY = MarkovChain("") | EMPTY = MarkovChain("") | ||||
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) | EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) |
@@ -265,7 +265,7 @@ class _HTMLParser(_BaseTextParser): | |||||
for element in soup.find_all(tag): | for element in soup.find_all(tag): | ||||
element.extract() | element.extract() | ||||
return "\n".join(soup.stripped_strings) | |||||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | |||||
def _open(self, url, **kwargs): | def _open(self, url, **kwargs): | ||||
"""Try to read a URL. Return None if it couldn't be read.""" | """Try to read a URL. Return None if it couldn't be read.""" | ||||
@@ -22,6 +22,7 @@ | |||||
from threading import Event | from threading import Event | ||||
from time import time | from time import time | ||||
import urlparse | |||||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | ||||
@@ -77,6 +78,11 @@ class CopyvioSource(object): | |||||
res = "<CopyvioSource ({0} with {1} conf)>" | res = "<CopyvioSource ({0} with {1} conf)>" | ||||
return res.format(self.url, self.confidence) | return res.format(self.url, self.confidence) | ||||
@property
def domain(self):
    """The network-location part of the source URL, or None if it is empty."""
    netloc = urlparse.urlparse(self.url).netloc
    # An empty netloc is reported as None rather than as "".
    return netloc or None
def start_work(self): | def start_work(self): | ||||
"""Mark this source as being worked on right now.""" | """Mark this source as being worked on right now.""" | ||||
self._event2.clear() | self._event2.clear() | ||||
@@ -130,13 +136,16 @@ class CopyvioCheckResult(object): | |||||
""" | """ | ||||
def __init__(self, violation, sources, queries, check_time, article_chain,
             possible_miss, included_sources=None, unified_confidence=None):
    """Store the outcome of a copyvio check on this result object.

    Every argument is saved under the attribute of the same name, except
    *check_time*, which is saved as ``self.time``.

    *sources* must be a list. *included_sources* defaults to an empty list
    when it is omitted (or otherwise falsy). *unified_confidence* may be
    None; when it is not None, the ``confidence`` property returns it in
    preference to the best source's confidence.

    Raises :exc:`TypeError` if *sources* is not a list.
    """
    # Validate explicitly instead of with ``assert`` so that the check is
    # still performed when Python runs with -O.
    if not isinstance(sources, list):
        raise TypeError("sources must be a list, not {0}".format(
            type(sources).__name__))
    self.violation = violation
    self.sources = sources
    self.queries = queries
    self.time = check_time
    self.article_chain = article_chain
    self.possible_miss = possible_miss
    self.included_sources = included_sources if included_sources else []
    self.unified_confidence = unified_confidence
def __repr__(self): | def __repr__(self): | ||||
"""Return the canonical string representation of the result.""" | """Return the canonical string representation of the result.""" | ||||
@@ -157,7 +166,11 @@ class CopyvioCheckResult(object): | |||||
@property
def confidence(self):
    """The overall confidence of this result.

    This is the unified confidence if one was given (any value other than
    None, including 0), otherwise the confidence of the best source, or
    0.0 if there is no best source.
    """
    if self.unified_confidence is not None:
        return self.unified_confidence
    # Look the best source up once and reuse it for both the check and the
    # attribute access.
    best = self.best
    return best.confidence if best else 0.0
@property | @property | ||||
def url(self): | def url(self): | ||||
@@ -39,7 +39,7 @@ import urlparse | |||||
from earwigbot import importer | from earwigbot import importer | ||||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | ||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection, MarkovChainUnion | |||||
from earwigbot.wiki.copyvios.parsers import get_parser | from earwigbot.wiki.copyvios.parsers import get_parser | ||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | ||||
@@ -47,6 +47,8 @@ tldextract = importer.new("tldextract") | |||||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | __all__ = ["globalize", "localize", "CopyvioWorkspace"] | ||||
INCLUDE_THRESHOLD = 0.15 | |||||
_MAX_REDIRECTS = 3 | _MAX_REDIRECTS = 3 | ||||
_MAX_RAW_SIZE = 20 * 1024 ** 2 | _MAX_RAW_SIZE = 20 * 1024 ** 2 | ||||
@@ -476,14 +478,21 @@ class CopyvioWorkspace(object): | |||||
def get_result(self, num_queries=0):
    """Return a CopyvioCheckResult containing the results of this check.

    ``self.sources`` is sorted in place, best first. Sources whose
    confidence is at least ``INCLUDE_THRESHOLD`` are additionally combined
    into a single union chain, which is compared against the article to
    produce a unified confidence; that value is None when no source
    reaches the threshold. *num_queries* is passed through to the result.
    """
    def ranking(source):
        # Used with reverse=True: higher confidence first, then sources
        # that are not excluded, then sources that were not skipped, then
        # sources whose first chain is larger.
        return (
            source.confidence,
            not source.excluded,
            not source.skipped,
            source.chains[0].size,
        )

    self.sources.sort(key=ranking, reverse=True)

    included_sources = [
        candidate for candidate in self.sources
        if candidate.confidence >= INCLUDE_THRESHOLD
    ]

    unified_confidence = None
    if included_sources:
        # chains[0] is presumably each source's own text chain -- TODO confirm
        # against CopyvioSource.
        union = MarkovChainUnion(
            candidate.chains[0] for candidate in included_sources)
        overlap = MarkovChainIntersection(self._article, union)
        unified_confidence = self._calculate_confidence(overlap)

    return CopyvioCheckResult(
        self.finished, self.sources, num_queries,
        time.time() - self._start_time, self._article,
        self.possible_miss, included_sources, unified_confidence)