Browse Source

Support multiple URLs

tags/v0.4
Ben Kurtovic 4 months ago
parent
commit
35519e9870
5 changed files with 93 additions and 17 deletions
  1. +9
    -6
      src/earwigbot/wiki/copyvios/__init__.py
  2. +30
    -0
      src/earwigbot/wiki/copyvios/markov.py
  3. +1
    -1
      src/earwigbot/wiki/copyvios/parsers.py
  4. +26
    -2
      src/earwigbot/wiki/copyvios/result.py
  5. +27
    -8
      src/earwigbot/wiki/copyvios/workers.py

+ 9
- 6
src/earwigbot/wiki/copyvios/__init__.py View File

@@ -180,8 +180,8 @@ class CopyvioMixIn:
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against specific URLs.

This is essentially a reduced version of :meth:`copyvio_check` - a
copyvio comparison is made using Markov chains and the result is
@@ -201,9 +201,11 @@ class CopyvioMixIn:
Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised.
"""
if not isinstance(urls, list):
urls = [urls]
log = "Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5)
self._logger.info(log.format(self.title, ", ".join(urls)))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace(
article,
min_confidence,
@@ -211,11 +213,12 @@ class CopyvioMixIn:
self._logger,
self._addheaders,
max_time,
num_workers=1,
num_workers=min(len(urls), 8),
short_circuit=False,
config=self._search_config,
degree=degree,
)
workspace.enqueue([url])
workspace.enqueue(urls)
workspace.wait()
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))


+ 30
- 0
src/earwigbot/wiki/copyvios/markov.py View File

@@ -93,5 +93,35 @@ class MarkovChainIntersection(MarkovChain):
return res.format(self.size, self.mc1, self.mc2)


class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple Markov chains.

    Each phrase in the union maps to the sum of that phrase's counts across
    all of the input chains.
    """

    def __init__(self, chains):
        """Build the union of *chains*, an iterable of chain objects.

        Each input must expose a ``chain`` mapping of phrase -> count.
        The parent initializer is not invoked here; the union is assembled
        from the already-built input chains.
        """
        # Materialize the iterable: callers may pass a generator, and the
        # chains are iterated again by _build(), __repr__() and __str__().
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably inherited
        # from MarkovChain -- verify against the base class.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        union = {}
        for chain in self.chains:
            # .items() (rather than the Python 2-only .iteritems()) keeps
            # this working on both Python 2 and Python 3 mappings.
            for phrase, count in chain.chain.items():
                union[phrase] = union.get(phrase, 0) + count
        return union

    def __repr__(self):
        """Return the canonical string representation of the union."""
        res = "MarkovChainUnion(chains={!r})"
        return res.format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        res = "<MarkovChainUnion of size {} ({})>"
        return res.format(self.size, "| ".join(str(chain) for chain in self.chains))


EMPTY = MarkovChain("")
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY)

+ 1
- 1
src/earwigbot/wiki/copyvios/parsers.py View File

@@ -273,7 +273,7 @@ class _HTMLParser(_BaseTextParser):
for element in soup.find_all(tag):
element.extract()

return "\n".join(soup.stripped_strings)
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings)

def _open(self, url, **kwargs):
"""Try to read a URL. Return None if it couldn't be read."""


+ 26
- 2
src/earwigbot/wiki/copyvios/result.py View File

@@ -21,6 +21,8 @@
from threading import Event
from time import time

import urlparse

from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION

__all__ = ["CopyvioSource", "CopyvioCheckResult"]
@@ -84,6 +86,11 @@ class CopyvioSource:
res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence)

@property
def domain(self):
    """The source URL's domain name, or None.

    Returns the network-location component of ``self.url``; an empty
    component (e.g. the URL has no ``scheme://host`` part) yields None.
    """
    # Resolve the parser at call time so this works whether the
    # interpreter provides urllib.parse (Python 3) or urlparse (Python 2).
    try:
        from urllib.parse import urlparse as _parse_url
    except ImportError:
        from urlparse import urlparse as _parse_url

    return _parse_url(self.url).netloc or None

def start_work(self):
"""Mark this source as being worked on right now."""
self._event2.clear()
@@ -137,14 +144,25 @@ class CopyvioCheckResult:
"""

def __init__(
self, violation, sources, queries, check_time, article_chain, possible_miss
self,
violation,
sources,
queries,
check_time,
article_chain,
possible_miss,
included_sources=None,
unified_confidence=None,
):
assert isinstance(sources, list)
self.violation = violation
self.sources = sources
self.queries = queries
self.time = check_time
self.article_chain = article_chain
self.possible_miss = possible_miss
self.included_sources = included_sources if included_sources else []
self.unified_confidence = unified_confidence

def __repr__(self):
"""Return the canonical string representation of the result."""
@@ -164,7 +182,13 @@ class CopyvioCheckResult:
@property
def confidence(self):
"""The confidence of the best source, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0
return (
self.unified_confidence
if self.unified_confidence is not None
else self.best.confidence
if self.best
else 0.0
)

@property
def url(self):


+ 27
- 8
src/earwigbot/wiki/copyvios/workers.py View File

@@ -37,7 +37,11 @@ from urllib.request import Request, build_opener

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.markov import (
MarkovChain,
MarkovChainIntersection,
MarkovChainUnion,
)
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

@@ -45,6 +49,8 @@ tldextract = importer.new("tldextract")

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

INCLUDE_THRESHOLD = 0.15

_MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024**2

@@ -501,15 +507,26 @@ class CopyvioWorkspace:

def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check."""
self.sources.sort(
key=lambda s: (
s.confidence,
not s.excluded,
not s.skipped,
s.chains[0].size,
),
reverse=True,
)

def cmpfunc(s1, s2):
if s2.confidence != s1.confidence:
return 1 if s2.confidence > s1.confidence else -1
if s2.excluded != s1.excluded:
return 1 if s1.excluded else -1
return int(s1.skipped) - int(s2.skipped)
included_sources = [
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD
]
if included_sources:
unified = MarkovChainUnion(source.chains[0] for source in included_sources)
delta = MarkovChainIntersection(self._article, unified)
unified_confidence = self._calculate_confidence(delta)
else:
unified_confidence = None

self.sources.sort(cmpfunc)
return CopyvioCheckResult(
self.finished,
self.sources,
@@ -517,4 +534,6 @@ class CopyvioWorkspace:
time.time() - self._start_time,
self._article,
self.possible_miss,
included_sources,
unified_confidence,
)

Loading…
Cancel
Save