Browse Source

Support multiple URLs

legacy-python2
Ben Kurtovic 4 months ago
parent
commit
aac7ebaedc
5 changed files with 73 additions and 18 deletions
  1. +8
    -5
      earwigbot/wiki/copyvios/__init__.py
  2. +30
    -0
      earwigbot/wiki/copyvios/markov.py
  3. +1
    -1
      earwigbot/wiki/copyvios/parsers.py
  4. +15
    -2
      earwigbot/wiki/copyvios/result.py
  5. +19
    -10
      earwigbot/wiki/copyvios/workers.py

+ 8
- 5
earwigbot/wiki/copyvios/__init__.py View File

@@ -162,8 +162,8 @@ class CopyvioMixIn(object):
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))
return result return result


def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against specific URLs.


This is essentially a reduced version of :meth:`copyvio_check` - a This is essentially a reduced version of :meth:`copyvio_check` - a
copyvio comparison is made using Markov chains and the result is copyvio comparison is made using Markov chains and the result is
@@ -183,13 +183,16 @@ class CopyvioMixIn(object):
Since no searching is done, neither :exc:`.UnknownSearchEngineError` Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised. nor :exc:`.SearchQueryError` will be raised.
""" """
if not isinstance(urls, list):
urls = [urls]
log = u"Starting copyvio compare for [[{0}]] against {1}" log = u"Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, url))
self._logger.info(log.format(self.title, ", ".join(urls)))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders, article, min_confidence, max_time, self._logger, self._addheaders,
max_time, num_workers=1, config=self._search_config, degree=degree)
workspace.enqueue([url])
max_time, num_workers=min(len(urls), 8), short_circuit=False,
config=self._search_config, degree=degree)
workspace.enqueue(urls)
workspace.wait() workspace.wait()
result = workspace.get_result() result = workspace.get_result()
self._logger.info(result.get_log_message(self.title)) self._logger.info(result.get_log_message(self.title))


+ 30
- 0
earwigbot/wiki/copyvios/markov.py View File

@@ -94,5 +94,35 @@ class MarkovChainIntersection(MarkovChain):
return res.format(self.size, self.mc1, self.mc2) return res.format(self.size, self.mc1, self.mc2)




class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple Markov chains.

    The ``chain`` mapping of every input chain is merged into a single
    mapping, summing the values of keys that occur in more than one input.
    """

    def __init__(self, chains):
        # Materialize the input first: callers may pass a one-shot iterable
        # (e.g. a generator expression), and the chains are walked again by
        # _build(), __repr__() and __str__().
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably it is
        # inherited from MarkovChain -- confirm against the base class.
        self.size = self._get_size()

    def _build(self):
        """Build and return the merged mapping from all input chains."""
        merged = {}
        for member in self.chains:
            for phrase, count in member.chain.iteritems():
                # Start from zero the first time a phrase is seen, then
                # accumulate across every chain that contains it.
                merged[phrase] = merged.get(phrase, 0) + count
        return merged

    def __repr__(self):
        """Return the canonical string representation of the union."""
        return "MarkovChainUnion(chains={!r})".format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        members = "| ".join(str(member) for member in self.chains)
        return "<MarkovChainUnion of size {} ({})>".format(self.size, members)


EMPTY = MarkovChain("") EMPTY = MarkovChain("")
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY)

+ 1
- 1
earwigbot/wiki/copyvios/parsers.py View File

@@ -265,7 +265,7 @@ class _HTMLParser(_BaseTextParser):
for element in soup.find_all(tag): for element in soup.find_all(tag):
element.extract() element.extract()


return "\n".join(soup.stripped_strings)
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings)


def _open(self, url, **kwargs): def _open(self, url, **kwargs):
"""Try to read a URL. Return None if it couldn't be read.""" """Try to read a URL. Return None if it couldn't be read."""


+ 15
- 2
earwigbot/wiki/copyvios/result.py View File

@@ -22,6 +22,7 @@


from threading import Event from threading import Event
from time import time from time import time
import urlparse


from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION


@@ -77,6 +78,11 @@ class CopyvioSource(object):
res = "<CopyvioSource ({0} with {1} conf)>" res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence) return res.format(self.url, self.confidence)


@property
def domain(self):
    """The source URL's domain name (its netloc component), or None.

    None is returned when parsing ``self.url`` yields an empty netloc.
    """
    netloc = urlparse.urlparse(self.url).netloc
    return netloc if netloc else None

def start_work(self): def start_work(self):
"""Mark this source as being worked on right now.""" """Mark this source as being worked on right now."""
self._event2.clear() self._event2.clear()
@@ -130,13 +136,16 @@ class CopyvioCheckResult(object):
""" """


def __init__(self, violation, sources, queries, check_time, article_chain, def __init__(self, violation, sources, queries, check_time, article_chain,
possible_miss):
possible_miss, included_sources=None, unified_confidence=None):
assert isinstance(sources, list)
self.violation = violation self.violation = violation
self.sources = sources self.sources = sources
self.queries = queries self.queries = queries
self.time = check_time self.time = check_time
self.article_chain = article_chain self.article_chain = article_chain
self.possible_miss = possible_miss self.possible_miss = possible_miss
self.included_sources = included_sources if included_sources else []
self.unified_confidence = unified_confidence


def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the result.""" """Return the canonical string representation of the result."""
@@ -157,7 +166,11 @@ class CopyvioCheckResult(object):
@property @property
def confidence(self): def confidence(self):
"""The confidence of the best source, or 0 if no sources exist.""" """The confidence of the best source, or 0 if no sources exist."""
return self.best.confidence if self.best else 0.0
return (
self.unified_confidence if self.unified_confidence is not None else
self.best.confidence if self.best else
0.0
)


@property @property
def url(self): def url(self):


+ 19
- 10
earwigbot/wiki/copyvios/workers.py View File

@@ -39,7 +39,7 @@ import urlparse


from earwigbot import importer from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection, MarkovChainUnion
from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


@@ -47,6 +47,8 @@ tldextract = importer.new("tldextract")


__all__ = ["globalize", "localize", "CopyvioWorkspace"] __all__ = ["globalize", "localize", "CopyvioWorkspace"]


INCLUDE_THRESHOLD = 0.15

_MAX_REDIRECTS = 3 _MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024 ** 2 _MAX_RAW_SIZE = 20 * 1024 ** 2


@@ -476,14 +478,21 @@ class CopyvioWorkspace(object):


def get_result(self, num_queries=0): def get_result(self, num_queries=0):
"""Return a CopyvioCheckResult containing the results of this check.""" """Return a CopyvioCheckResult containing the results of this check."""
def cmpfunc(s1, s2):
if s2.confidence != s1.confidence:
return 1 if s2.confidence > s1.confidence else -1
if s2.excluded != s1.excluded:
return 1 if s1.excluded else -1
return int(s1.skipped) - int(s2.skipped)

self.sources.sort(cmpfunc)
self.sources.sort(
key=lambda s: (s.confidence, not s.excluded, not s.skipped, s.chains[0].size),
reverse=True,
)

included_sources = [
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD
]
if included_sources:
unified = MarkovChainUnion(source.chains[0] for source in included_sources)
delta = MarkovChainIntersection(self._article, unified)
unified_confidence = self._calculate_confidence(delta)
else:
unified_confidence = None

return CopyvioCheckResult(self.finished, self.sources, num_queries, return CopyvioCheckResult(self.finished, self.sources, num_queries,
time.time() - self._start_time, self._article, time.time() - self._start_time, self._article,
self.possible_miss)
self.possible_miss, included_sources, unified_confidence)

Loading…
Cancel
Save