Browse Source

Support multiple URLs

legacy-python2
Ben Kurtovic 3 weeks ago
parent
commit
aac7ebaedc
5 changed files with 73 additions and 18 deletions
  1. +8
    -5
      earwigbot/wiki/copyvios/__init__.py
  2. +30
    -0
      earwigbot/wiki/copyvios/markov.py
  3. +1
    -1
      earwigbot/wiki/copyvios/parsers.py
  4. +15
    -2
      earwigbot/wiki/copyvios/result.py
  5. +19
    -10
      earwigbot/wiki/copyvios/workers.py

+ 8
- 5
earwigbot/wiki/copyvios/__init__.py View File

@@ -162,8 +162,8 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against specific URLs.

This is essentially a reduced version of :meth:`copyvio_check` - a
copyvio comparison is made using Markov chains and the result is
@@ -183,13 +183,16 @@ class CopyvioMixIn(object):
Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised.
"""
if not isinstance(urls, list):
urls = [urls]
log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
self._logger.info(log.format(self.title, ", ".join(urls)))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
max_time, num_workers=1, config=self._search_config, degree=degree)
workspace.enqueue([url])
max_time, num_workers=min(len(urls), 8), short_circuit=False,
config=self._search_config, degree=degree)
workspace.enqueue(urls)
workspace.wait()
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))


+ 30
- 0
earwigbot/wiki/copyvios/markov.py View File

@@ -94,5 +94,35 @@ class MarkovChainIntersection(MarkovChain):
return res.format(self.size, self.mc1, self.mc2)


class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple chains."""

    def __init__(self, chains):
        # list() accepts any iterable (including a generator) and keeps the
        # inputs around so __repr__/__str__ can walk them again later.
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably it is
        # inherited from MarkovChain -- confirm against the base class.
        self.size = self._get_size()

    def _build(self):
        """Merge all input chains into a single dict and return it.

        Each input's ``chain`` mapping is iterated as (phrase, count) pairs;
        a phrase that occurs in several inputs ends up with the sum of its
        counts.
        """
        merged = {}
        for source in self.chains:
            for phrase, count in source.chain.iteritems():
                merged[phrase] = merged.get(phrase, 0) + count
        return merged

    def __repr__(self):
        """Return the canonical string representation of the union."""
        return "MarkovChainUnion(chains={!r})".format(self.chains)

    def __str__(self):
        """Return a human-readable summary: the size and each input chain."""
        members = "| ".join(map(str, self.chains))
        return "<MarkovChainUnion of size {} ({})>".format(self.size, members)


# Module-level chain built from empty text.
EMPTY = MarkovChain("")
# Intersection of the empty chain with itself.
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY)

+ 1
- 1
earwigbot/wiki/copyvios/parsers.py View File

@@ -265,7 +265,7 @@ class _HTMLParser(_BaseTextParser):
for element in soup.find_all(tag):
element.extract()

return "\n".join(soup.stripped_strings)
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings)

def _open(self, url, **kwargs):
"""Try to read a URL. Return None if it couldn't be read."""


+ 15
- 2
earwigbot/wiki/copyvios/result.py View File

@@ -22,6 +22,7 @@

from threading import Event
from time import time
import urlparse

from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION

@@ -77,6 +78,11 @@ class CopyvioSource(object):
res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence)

@property
def domain(self):
    """The network-location part of the source URL, or None if it is empty."""
    netloc = urlparse.urlparse(self.url).netloc
    return netloc if netloc else None

def start_work(self):
"""Mark this source as being worked on right now."""
self._event2.clear()
@@ -130,13 +136,16 @@ class CopyvioCheckResult(object):
"""

def __init__(self, violation, sources, queries, check_time, article_chain,
possible_miss):
possible_miss, included_sources=None, unified_confidence=None):
assert isinstance(sources, list)
self.violation = violation
self.sources = sources
self.queries = queries
self.time = check_time
self.article_chain = article_chain
self.possible_miss = possible_miss
self.included_sources = included_sources if included_sources else []
self.unified_confidence = unified_confidence

def __repr__(self):
"""Return the canonical string representation of the result."""
@@ -157,7 +166,11 @@ class CopyvioCheckResult(object):
@property
def confidence(self):
"""The unified confidence if one was computed, else the best source's confidence, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0
return (
self.unified_confidence if self.unified_confidence is not None else
self.best.confidence if self.best else
0.0
)

@property
def url(self):


+ 19
- 10
earwigbot/wiki/copyvios/workers.py View File

@@ -39,7 +39,7 @@ import urlparse

from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection, MarkovChainUnion
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

@@ -47,6 +47,8 @@ tldextract = importer.new("tldextract")

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

# Minimum confidence a source needs for get_result() to include it in the
# unified chain used to compute the unified confidence.
INCLUDE_THRESHOLD = 0.15

_MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024 ** 2

@@ -476,14 +478,21 @@ class CopyvioWorkspace(object):

def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check."""
def cmpfunc(s1, s2):
if s2.confidence != s1.confidence:
return 1 if s2.confidence > s1.confidence else -1
if s2.excluded != s1.excluded:
return 1 if s1.excluded else -1
return int(s1.skipped) - int(s2.skipped)

self.sources.sort(cmpfunc)
self.sources.sort(
key=lambda s: (s.confidence, not s.excluded, not s.skipped, s.chains[0].size),
reverse=True,
)

included_sources = [
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD
]
if included_sources:
unified = MarkovChainUnion(source.chains[0] for source in included_sources)
delta = MarkovChainIntersection(self._article, unified)
unified_confidence = self._calculate_confidence(delta)
else:
unified_confidence = None

return CopyvioCheckResult(self.finished, self.sources, num_queries,
time.time() - self._start_time, self._article,
self.possible_miss)
self.possible_miss, included_sources, unified_confidence)

Loading…
Cancel
Save