# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from threading import Event
from time import time

from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION

__all__ = ["CopyvioSource", "CopyvioCheckResult"]


class CopyvioSource(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Source**

    A class that represents a single possible source of a copyright violation,
    i.e., a URL.

    *Attributes:*

    - :py:attr:`url`:        the URL of the source
    - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
    - :py:attr:`chains`:     a 2-tuple of the source chain and the delta chain
    - :py:attr:`skipped`:    whether this URL was skipped during the check
    """

    def __init__(self, workspace, url, headers=None, timeout=5):
        """Create a new, not-yet-processed source for *url*.

        *workspace*, *headers* and *timeout* are only stored on the instance
        here; this class does not interpret them itself.
        """
        self.workspace = workspace
        self.url = url
        self.headers = headers
        self.timeout = timeout

        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)
        self.skipped = False

        # _event1 is set once the source has been started or skipped.
        # _event2 is cleared while work is in progress and set otherwise, so
        # it starts out set and join() does not block on it for sources that
        # were skipped.
        self._event1 = Event()
        self._event2 = Event()
        self._event2.set()

    def __repr__(self):
        """Return the canonical string representation of the source."""
        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
        return res.format(self.url, self.confidence, self.skipped)

    def __str__(self):
        """Return a nice string representation of the source.

        Skipped sources are labelled as such instead of showing a confidence
        value, since their confidence was never filled in.
        """
        if self.skipped:
            return "<CopyvioSource ({0}, skipped)>".format(self.url)
        res = "<CopyvioSource ({0} with {1} conf)>"
        return res.format(self.url, self.confidence)

    def start_work(self):
        """Mark this source as being worked on right now."""
        # Clear the "finished" event before announcing the start, so a waiter
        # released by _event1 will then block on _event2 until finish_work().
        self._event2.clear()
        self._event1.set()

    def update(self, confidence, source_chain, delta_chain):
        """Fill out the confidence and chain information inside this source."""
        self.confidence = confidence
        self.chains = (source_chain, delta_chain)

    def finish_work(self):
        """Mark this source as finished."""
        self._event2.set()

    def skip(self):
        """Deactivate this source without filling in the relevant data.

        This does nothing if the source has already been started (or already
        skipped), i.e. if the first event is already set.
        """
        if self._event1.is_set():
            return
        self.skipped = True
        self._event1.set()

    def join(self, until):
        """Block until this violation result is filled out.

        *until* is an absolute deadline compared against :py:func:`time.time`;
        once it has passed, this method returns without waiting any further.
        If *until* is false (e.g. ``None``), wait with no timeout.
        """
        # Wait for the source to be started/skipped first, then for the work
        # to finish; the remaining time is recomputed before each wait.
        for event in [self._event1, self._event2]:
            if until:
                timeout = until - time()
                if timeout <= 0:
                    return
                event.wait(timeout)
            else:
                event.wait()


class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`:     ``True`` if this is a violation, else ``False``
    - :py:attr:`sources`:       a list of CopyvioSources, sorted by confidence
    - :py:attr:`best`:          the best matching CopyvioSource, or ``None``
    - :py:attr:`confidence`:    the best matching source's confidence, or 0
    - :py:attr:`url`:           the best matching source's URL, or ``None``
    - :py:attr:`queries`:       the number of queries used to reach a result
    - :py:attr:`time`:          the amount of time the check took to complete
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`possible_miss`: whether some URLs might have been missed
    """

    def __init__(self, violation, sources, queries, check_time, article_chain,
                 possible_miss):
        """Store the outcome of a check; *check_time* becomes ``self.time``."""
        self.violation = violation
        self.sources = sources
        self.queries = queries
        self.time = check_time
        self.article_chain = article_chain
        self.possible_miss = possible_miss

    def __repr__(self):
        """Return the canonical string representation of the result."""
        res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
        return res.format(self.violation, self.sources, self.queries,
                          self.time)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with best {1})>"
        return res.format(self.violation, self.best)

    @property
    def best(self):
        """The best known source, or None if no sources exist."""
        # The first entry is taken as the best one; this relies on the list
        # already being ordered by the caller.
        return self.sources[0] if self.sources else None

    @property
    def confidence(self):
        """The confidence of the best source, or 0 if no sources exist."""
        return self.best.confidence if self.best else 0.0

    @property
    def url(self):
        """The URL of the best source, or None if no sources exist."""
        return self.best.url if self.best else None

    def get_log_message(self, title):
        """Build a relevant log message for this copyvio check result.

        *title* is inserted into the message as a wikilink (``[[title]]``).
        When there are no sources, a shorter message without best-source
        details is returned.
        """
        if not self.sources:
            log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
            return log.format(title, self.queries, self.time)
        log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)"
        is_vio = "Violation detected" if self.violation else "No violation"
        return log.format(is_vio, title, self.url, self.confidence,
                          len(self.sources), self.queries, self.time)