From 35519e9870667ebf9b28c8ae9982382618bbb411 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2024 21:58:29 -0400 Subject: [PATCH] Support multiple URLs --- src/earwigbot/wiki/copyvios/__init__.py | 15 ++++++++------ src/earwigbot/wiki/copyvios/markov.py | 30 ++++++++++++++++++++++++++++ src/earwigbot/wiki/copyvios/parsers.py | 2 +- src/earwigbot/wiki/copyvios/result.py | 28 ++++++++++++++++++++++++-- src/earwigbot/wiki/copyvios/workers.py | 35 +++++++++++++++++++++++++-------- 5 files changed, 93 insertions(+), 17 deletions(-) diff --git a/src/earwigbot/wiki/copyvios/__init__.py b/src/earwigbot/wiki/copyvios/__init__.py index b9e1e2e..dd8eca8 100644 --- a/src/earwigbot/wiki/copyvios/__init__.py +++ b/src/earwigbot/wiki/copyvios/__init__.py @@ -180,8 +180,8 @@ class CopyvioMixIn: self._logger.info(result.get_log_message(self.title)) return result - def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): - """Check the page like :py:meth:`copyvio_check` against a specific URL. + def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): + """Check the page like :py:meth:`copyvio_check` against specific URLs. This is essentially a reduced version of :meth:`copyvio_check` - a copyivo comparison is made using Markov chains and the result is @@ -201,9 +201,11 @@ class CopyvioMixIn: Since no searching is done, neither :exc:`.UnknownSearchEngineError` nor :exc:`.SearchQueryError` will be raised. 
""" + if not isinstance(urls, list): + urls = [urls] log = "Starting copyvio compare for [[{0}]] against {1}" - self._logger.info(log.format(self.title, url)) - article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5) + self._logger.info(log.format(self.title, ", ".join(urls))) + article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) workspace = CopyvioWorkspace( article, min_confidence, @@ -211,11 +213,12 @@ class CopyvioMixIn: self._logger, self._addheaders, max_time, - num_workers=1, + num_workers=min(len(urls), 8), + short_circuit=False, config=self._search_config, degree=degree, ) - workspace.enqueue([url]) + workspace.enqueue(urls) workspace.wait() result = workspace.get_result() self._logger.info(result.get_log_message(self.title)) diff --git a/src/earwigbot/wiki/copyvios/markov.py b/src/earwigbot/wiki/copyvios/markov.py index 86bf497..7b3c486 100644 --- a/src/earwigbot/wiki/copyvios/markov.py +++ b/src/earwigbot/wiki/copyvios/markov.py @@ -93,5 +93,35 @@ class MarkovChainIntersection(MarkovChain): return res.format(self.size, self.mc1, self.mc2) +class MarkovChainUnion(MarkovChain): + """Implements the union of multiple chains.""" + + def __init__(self, chains): + self.chains = list(chains) + self.chain = self._build() + self.size = self._get_size() + + def _build(self): + """Build and return the Markov chain from the input chains.""" + union = {} + for chain in self.chains: + for phrase, count in chain.chain.items(): + if phrase in union: + union[phrase] += count + else: + union[phrase] = count + return union + + def __repr__(self): + """Return the canonical string representation of the union.""" + res = "MarkovChainUnion(chains={!r})" + return res.format(self.chains) + + def __str__(self): + """Return a nice string representation of the union.""" + res = "<MarkovChainUnion of size {0} ({1})>" + return res.format(self.size, "| ".join(str(chain) for chain in self.chains)) + + EMPTY = MarkovChain("") EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, 
EMPTY) diff --git a/src/earwigbot/wiki/copyvios/parsers.py b/src/earwigbot/wiki/copyvios/parsers.py index 9d0351a..4d3720c 100644 --- a/src/earwigbot/wiki/copyvios/parsers.py +++ b/src/earwigbot/wiki/copyvios/parsers.py @@ -273,7 +273,7 @@ class _HTMLParser(_BaseTextParser): for element in soup.find_all(tag): element.extract() - return "\n".join(soup.stripped_strings) + return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) def _open(self, url, **kwargs): """Try to read a URL. Return None if it couldn't be read.""" diff --git a/src/earwigbot/wiki/copyvios/result.py b/src/earwigbot/wiki/copyvios/result.py index a2c668d..fb6624f 100644 --- a/src/earwigbot/wiki/copyvios/result.py +++ b/src/earwigbot/wiki/copyvios/result.py @@ -21,6 +21,8 @@ from threading import Event from time import time +from urllib.parse import urlparse + from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION __all__ = ["CopyvioSource", "CopyvioCheckResult"] @@ -84,6 +86,11 @@ class CopyvioSource: res = "" return res.format(self.url, self.confidence) + @property + def domain(self): + """The source URL's domain name, or None.""" + return urlparse(self.url).netloc or None + def start_work(self): """Mark this source as being worked on right now.""" self._event2.clear() @@ -137,14 +144,25 @@ class CopyvioCheckResult: """ def __init__( - self, violation, sources, queries, check_time, article_chain, possible_miss + self, + violation, + sources, + queries, + check_time, + article_chain, + possible_miss, + included_sources=None, + unified_confidence=None, ): + assert isinstance(sources, list) self.violation = violation self.sources = sources self.queries = queries self.time = check_time self.article_chain = article_chain self.possible_miss = possible_miss + self.included_sources = included_sources if included_sources else [] + self.unified_confidence = unified_confidence def __repr__(self): """Return the canonical string representation of the result.""" @@ -164,7 +182,13 @@ class 
CopyvioCheckResult: @property def confidence(self): """The confidence of the best source, or 0 if no sources exist.""" - return self.best.confidence if self.best else 0.0 + return ( + self.unified_confidence + if self.unified_confidence is not None + else self.best.confidence + if self.best + else 0.0 + ) @property def url(self): diff --git a/src/earwigbot/wiki/copyvios/workers.py b/src/earwigbot/wiki/copyvios/workers.py index 789821c..2850277 100644 --- a/src/earwigbot/wiki/copyvios/workers.py +++ b/src/earwigbot/wiki/copyvios/workers.py @@ -37,7 +37,11 @@ from urllib.request import Request, build_opener from earwigbot import importer from earwigbot.exceptions import ParserExclusionError, ParserRedirectError -from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.markov import ( + MarkovChain, + MarkovChainIntersection, + MarkovChainUnion, +) from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -45,6 +49,8 @@ tldextract = importer.new("tldextract") __all__ = ["globalize", "localize", "CopyvioWorkspace"] +INCLUDE_THRESHOLD = 0.15 + _MAX_REDIRECTS = 3 _MAX_RAW_SIZE = 20 * 1024**2 @@ -501,15 +507,26 @@ class CopyvioWorkspace: def get_result(self, num_queries=0): """Return a CopyvioCheckResult containing the results of this check.""" + self.sources.sort( + key=lambda s: ( + s.confidence, + not s.excluded, + not s.skipped, + s.chains[0].size, + ), + reverse=True, + ) - def cmpfunc(s1, s2): - if s2.confidence != s1.confidence: - return 1 if s2.confidence > s1.confidence else -1 - if s2.excluded != s1.excluded: - return 1 if s1.excluded else -1 - return int(s1.skipped) - int(s2.skipped) + included_sources = [ + source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD + ] + if included_sources: + unified = MarkovChainUnion(source.chains[0] for source in included_sources) + delta = MarkovChainIntersection(self._article, 
unified) + unified_confidence = self._calculate_confidence(delta) + else: + unified_confidence = None - self.sources.sort(cmpfunc) return CopyvioCheckResult( self.finished, self.sources, @@ -517,4 +534,6 @@ class CopyvioWorkspace: time.time() - self._start_time, self._article, self.possible_miss, + included_sources, + unified_confidence, )