Browse Source

Support multiple URLs

tags/v0.4
Ben Kurtovic 3 months ago
parent
commit
35519e9870
5 changed files with 93 additions and 17 deletions
  1. +9
    -6
      src/earwigbot/wiki/copyvios/__init__.py
  2. +30
    -0
      src/earwigbot/wiki/copyvios/markov.py
  3. +1
    -1
      src/earwigbot/wiki/copyvios/parsers.py
  4. +26
    -2
      src/earwigbot/wiki/copyvios/result.py
  5. +27
    -8
      src/earwigbot/wiki/copyvios/workers.py

+ 9
- 6
src/earwigbot/wiki/copyvios/__init__.py View File

@@ -180,8 +180,8 @@ class CopyvioMixIn:
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against specific URLs.


This is essentially a reduced version of :meth:`copyvio_check` - a This is essentially a reduced version of :meth:`copyvio_check` - a
copyvio comparison is made using Markov chains and the result is copyvio comparison is made using Markov chains and the result is
@@ -201,9 +201,11 @@ class CopyvioMixIn:
Since no searching is done, neither :exc:`.UnknownSearchEngineError` Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised. nor :exc:`.SearchQueryError` will be raised.
""" """
if not isinstance(urls, list):
urls = [urls]
log = "Starting copyvio compare for [[{0}]] against {1}" log = "Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5)
self._logger.info(log.format(self.title, ", ".join(urls)))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, article,
min_confidence, min_confidence,
@@ -211,11 +213,12 @@ class CopyvioMixIn:
self._logger, self._logger,
self._addheaders, self._addheaders,
max_time, max_time,
num_workers=1,
num_workers=min(len(urls), 8),
short_circuit=False,
config=self._search_config, config=self._search_config,
degree=degree, degree=degree,
) )
workspace.enqueue([url])
workspace.enqueue(urls)
workspace.wait() workspace.wait()
result = workspace.get_result() result = workspace.get_result()
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))


+ 30
- 0
src/earwigbot/wiki/copyvios/markov.py View File

@@ -93,5 +93,35 @@ class MarkovChainIntersection(MarkovChain):
return res.format(self.size, self.mc1, self.mc2) return res.format(self.size, self.mc1, self.mc2)




class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple Markov chains.

    Every phrase that occurs in at least one input chain is mapped to the
    sum of that phrase's counts across all of the input chains.
    """

    def __init__(self, chains):
        """Build the union of *chains*.

        :param chains: an iterable of chain objects; each must expose a
            ``chain`` mapping of phrase -> count.
        """
        # Materialize the iterable: callers may pass a generator, and the
        # chains are needed again later by __repr__() and __str__().
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably it is
        # inherited from MarkovChain and reads self.chain.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        union = {}
        for chain in self.chains:
            # items() works on both Python 2 and 3, whereas iteritems()
            # raises AttributeError on Python 3 dictionaries.
            for phrase, count in chain.chain.items():
                union[phrase] = union.get(phrase, 0) + count
        return union

    def __repr__(self):
        """Return the canonical string representation of the union."""
        res = "MarkovChainUnion(chains={!r})"
        return res.format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        res = "<MarkovChainUnion of size {} ({})>"
        return res.format(self.size, "| ".join(str(chain) for chain in self.chains))


EMPTY = MarkovChain("") EMPTY = MarkovChain("")
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY)

+ 1
- 1
src/earwigbot/wiki/copyvios/parsers.py View File

@@ -273,7 +273,7 @@ class _HTMLParser(_BaseTextParser):
for element in soup.find_all(tag): for element in soup.find_all(tag):
element.extract() element.extract()


return "\n".join(soup.stripped_strings)
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings)


def _open(self, url, **kwargs): def _open(self, url, **kwargs):
"""Try to read a URL. Return None if it couldn't be read.""" """Try to read a URL. Return None if it couldn't be read."""


+ 26
- 2
src/earwigbot/wiki/copyvios/result.py View File

@@ -21,6 +21,8 @@
from threading import Event from threading import Event
from time import time from time import time


import urlparse

from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION


__all__ = ["CopyvioSource", "CopyvioCheckResult"] __all__ = ["CopyvioSource", "CopyvioCheckResult"]
@@ -84,6 +86,11 @@ class CopyvioSource:
res = "<CopyvioSource ({0} with {1} conf)>" res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence) return res.format(self.url, self.confidence)


@property
def domain(self):
    """Return the network location of the source URL, or None if empty."""
    netloc = urlparse.urlparse(self.url).netloc
    if netloc:
        return netloc
    # An empty network-location component is reported as None.
    return None

def start_work(self): def start_work(self):
"""Mark this source as being worked on right now.""" """Mark this source as being worked on right now."""
self._event2.clear() self._event2.clear()
@@ -137,14 +144,25 @@ class CopyvioCheckResult:
""" """


def __init__( def __init__(
self, violation, sources, queries, check_time, article_chain, possible_miss
self,
violation,
sources,
queries,
check_time,
article_chain,
possible_miss,
included_sources=None,
unified_confidence=None,
): ):
assert isinstance(sources, list)
self.violation = violation self.violation = violation
self.sources = sources self.sources = sources
self.queries = queries self.queries = queries
self.time = check_time self.time = check_time
self.article_chain = article_chain self.article_chain = article_chain
self.possible_miss = possible_miss self.possible_miss = possible_miss
self.included_sources = included_sources if included_sources else []
self.unified_confidence = unified_confidence


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the result.""" """Return the canonical string representation of the result."""
@@ -164,7 +182,13 @@ class CopyvioCheckResult:
@property @property
def confidence(self): def confidence(self):
"""The confidence of the best source, or 0 if no sources exist.""" """The confidence of the best source, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0
return (
self.unified_confidence
if self.unified_confidence is not None
else self.best.confidence
if self.best
else 0.0
)


@property @property
def url(self): def url(self):


+ 27
- 8
src/earwigbot/wiki/copyvios/workers.py View File

@@ -37,7 +37,11 @@ from urllib.request import Request, build_opener


from earwigbot import importer from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.markov import (
MarkovChain,
MarkovChainIntersection,
MarkovChainUnion,
)
from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


@@ -45,6 +49,8 @@ tldextract = importer.new("tldextract")


__all__ = ["globalize", "localize", "CopyvioWorkspace"] __all__ = ["globalize", "localize", "CopyvioWorkspace"]


INCLUDE_THRESHOLD = 0.15

_MAX_REDIRECTS = 3 _MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024**2 _MAX_RAW_SIZE = 20 * 1024**2


@@ -501,15 +507,26 @@ class CopyvioWorkspace:


def get_result(self, num_queries=0): def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check.""" """Return a CopyvioCheckResult containing the results of this check."""
self.sources.sort(
key=lambda s: (
s.confidence,
not s.excluded,
not s.skipped,
s.chains[0].size,
),
reverse=True,
)


def cmpfunc(s1, s2):
if s2.confidence != s1.confidence:
return 1 if s2.confidence > s1.confidence else -1
if s2.excluded != s1.excluded:
return 1 if s1.excluded else -1
return int(s1.skipped) - int(s2.skipped)
included_sources = [
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD
]
if included_sources:
unified = MarkovChainUnion(source.chains[0] for source in included_sources)
delta = MarkovChainIntersection(self._article, unified)
unified_confidence = self._calculate_confidence(delta)
else:
unified_confidence = None


self.sources.sort(cmpfunc)
return CopyvioCheckResult( return CopyvioCheckResult(
self.finished, self.finished,
self.sources, self.sources,
@@ -517,4 +534,6 @@ class CopyvioWorkspace:
time.time() - self._start_time, time.time() - self._start_time,
self._article, self._article,
self.possible_miss, self.possible_miss,
included_sources,
unified_confidence,
) )

Loading…
Cancel
Save