@@ -26,7 +26,6 @@ from urllib2 import build_opener
 from earwigbot import exceptions, importer
 from earwigbot.wiki.copyvios.markov import MarkovChain
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser
-from earwigbot.wiki.copyvios.result import CopyvioCheckResult
 from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 from earwigbot.wiki.copyvios.workers import (
     globalize, localize, CopyvioWorkspace)
@@ -109,12 +108,10 @@ class CopyvioMixIn(object):
         """
         log = u"Starting copyvio check for [[{0}]]"
         self._logger.info(log.format(self.title))
-        start_time = time()
-        until = (start_time + max_time) if max_time > 0 else None
         searcher = self._get_search_engine()
         parser = ArticleTextParser(self.get())
         article = MarkovChain(parser.strip())
-        workspace = CopyvioWorkspace(article, min_confidence, until,
+        workspace = CopyvioWorkspace(article, min_confidence, max_time,
                                      self._logger, self._addheaders)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
@@ -123,8 +120,7 @@
             exclude = None
 
         if article.size < 20:  # Auto-fail very small articles
-            result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
-                                        workspace.best.chains)
+            result = workspace.get_result()
             self._logger.info(result.get_log_message(self.title))
             return result
 
@@ -134,19 +130,15 @@
         if not no_searches:
             chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
             for chunk in chunks:
-                if workspace.best.confidence >= min_confidence:
+                if workspace.finished:
                     break
                 log = u"[[{0}]] -> querying {1} for {2!r}"
                 self._logger.debug(log.format(self.title, searcher.name, chunk))
                 workspace.enqueue(searcher.search(chunk), exclude)
                 num_queries += 1
                 sleep(1)
 
         workspace.wait()
-
-        result = CopyvioCheckResult(
-            workspace.best.confidence >= min_confidence,
-            workspace.best.confidence, workspace.best.url, num_queries,
-            time() - start_time, article, workspace.best.chains)
+        result = workspace.get_result(num_queries)
         self._logger.info(result.get_log_message(self.title))
         return result
@@ -173,17 +165,12 @@ class CopyvioMixIn(object):
         """
         log = u"Starting copyvio compare for [[{0}]] against {1}"
         self._logger.info(log.format(self.title, url))
-        start_time = time()
-        until = (start_time + max_time) if max_time > 0 else None
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
-            article, min_confidence, until, self._logger, self._addheaders,
+            article, min_confidence, max_time, self._logger, self._addheaders,
             max_time, 1)
         workspace.enqueue([url])
         workspace.wait()
-        best = workspace.best
-        result = CopyvioCheckResult(
-            best.confidence >= min_confidence, best.confidence, best.url, 0,
-            time() - start_time, article, best.chains)
+        result = workspace.get_result()
         self._logger.info(result.get_log_message(self.title))
         return result
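
Taken together, the hunks above move all timing and best-source bookkeeping out of CopyvioMixIn: both copyvio_check() and copyvio_compare() now build a CopyvioWorkspace, feed it URLs, and simply return workspace.get_result(). A hedged, caller-side sketch of how the refactored result is meant to be consumed follows; the helper function is illustrative and not part of the patch, and it only assumes a Page object that mixes in CopyvioMixIn plus the result attributes shown in result.py below.

def report_copyvio(page):
    # Illustrative helper, not from the patch. `page` is any earwigbot Page,
    # which mixes in CopyvioMixIn and therefore exposes copyvio_check().
    result = page.copyvio_check(min_confidence=0.75, max_time=30)
    if result.violation:
        # .url and .confidence are now read-only views onto .best, the
        # highest-confidence CopyvioSource in the sorted .sources list.
        print u"Violation of {0} ({1})".format(result.url, result.confidence)
    for source in result.sources:
        print source  # e.g. <CopyvioSource (http://... with 0.8 conf)>
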
@@ -30,7 +30,7 @@ from earwigbot import exceptions
 
 __all__ = ["ExclusionsDB"]
 
-default_sources = {
+DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
         "User:EarwigBot/Copyvios/Exclusions"
     ],
@@ -74,7 +74,7 @@ class ExclusionsDB(object):
         """
         query = "INSERT INTO sources VALUES (?, ?);"
         sources = []
-        for sitename, pages in default_sources.iteritems():
+        for sitename, pages in DEFAULT_SOURCES.iteritems():
            for page in pages:
                sources.append((sitename, page))
 
@@ -95,8 +95,9 @@ class ExclusionsDB(object):
             r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
         ]
         for regex in regexes:
-            find = re.findall(regex, data, re.I|re.M)
-            [urls.add(url.lower().strip()) for url in find if url.strip()]
+            for url in re.findall(regex, data, re.I|re.M):
+                if url.strip():
+                    urls.add(url.lower().strip())
         return urls
 
     def _update(self, sitename):
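
The side-effect list comprehension in _load() becomes an explicit loop with the same normalization (lowercase, strip, drop empty captures). A standalone sketch of that extraction step, using only the one regex shown above and a made-up wikitext snippet (the sample lines are invented for illustration):

# -*- coding: utf-8 -*-
# Illustration only: same loop shape as _load(); the sample wikitext is fabricated.
import re

sample = u"""
* Site: [http://example.com/mirror]
* Site: <nowiki>http://Example.org/reuse</nowiki>
* Site:
"""
regexes = [
    r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$",
]
urls = set()
for regex in regexes:
    for url in re.findall(regex, sample, re.I|re.M):
        if url.strip():  # the third sample line captures "" and is dropped here
            urls.add(url.lower().strip())
print sorted(urls)  # [u'example.com/mirror', u'example.org/reuse']
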
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
 import errno
 from os import path
 
 import mwparserfromhell
@@ -28,7 +28,12 @@ from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
 __all__ = ["CopyvioSource", "CopyvioCheckResult"]
 
 class CopyvioSource(object):
-    """Represents a single suspected violation source (a URL)."""
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Source**
+
+    A class that represents a single possible source of a copyright violation,
+    i.e., a URL.
+    """
 
     def __init__(self, workspace, url, key, headers=None, timeout=5):
         self.workspace = workspace
@@ -38,14 +43,23 @@ class CopyvioSource(object):
         self.timeout = timeout
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
+        self.skipped = False
 
         self._event1 = Event()
         self._event2 = Event()
         self._event2.set()
 
-    def touched(self):
-        """Return whether one of start_work() and cancel() have been called."""
-        return self._event1.is_set()
+    def __repr__(self):
+        """Return the canonical string representation of the source."""
+        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
+        return res.format(self.url, self.confidence, self.skipped)
+
+    def __str__(self):
+        """Return a nice string representation of the source."""
+        if self.skipped:
+            return "<CopyvioSource ({0}, skipped)>".format(self.url)
+        res = "<CopyvioSource ({0} with {1} conf)>"
+        return res.format(self.url, self.confidence)
 
     def start_work(self):
         """Mark this source as being worked on right now."""
@@ -58,8 +72,11 @@ class CopyvioSource(object):
         self.chains = (source_chain, delta_chain)
         self._event2.set()
 
-    def cancel(self):
+    def skip(self):
         """Deactivate this source without filling in the relevant data."""
+        if self._event1.is_set():
+            return
+        self.skipped = True
         self._event1.set()
 
     def join(self, until):
@@ -70,6 +87,8 @@
                 if timeout <= 0:
                     return
                 event.wait(timeout)
+            else:
+                event.wait()
 
 
 class CopyvioCheckResult(object):
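
skip() and join() above coordinate through two threading.Event objects: _event1 is set as soon as a worker starts on the source or the source is skipped, and _event2 is set in __init__ and again by finish_work() once the chains are stored; join() takes an absolute deadline (`until`) and waits only for whatever time remains. A condensed, standalone sketch of that wait-with-deadline pattern follows; the Job class is invented for illustration and is not an earwigbot class.

# Standalone sketch of the two-event, deadline-aware join used by CopyvioSource.
from threading import Event, Thread
from time import sleep, time

class Job(object):
    def __init__(self):
        self._started = Event()   # plays the role of _event1 (started/skipped)
        self._finished = Event()  # plays the role of _event2 (results stored)

    def run(self):
        self._started.set()
        sleep(0.1)                # stand-in for downloading and comparing
        self._finished.set()

    def join(self, until):
        """Wait for both events, giving up once the deadline passes."""
        for event in (self._started, self._finished):
            if until:
                timeout = until - time()
                if timeout <= 0:
                    return
                event.wait(timeout)
            else:
                event.wait()

job = Job()
Thread(target=job.run).start()
job.join(time() + 5.0)  # `until` is an absolute timestamp, like workspace._until
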
@@ -81,40 +100,51 @@ class CopyvioCheckResult(object):
     *Attributes:*
 
     - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
-    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
-    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`sources`: a list of CopyvioSources, sorted by confidence
     - :py:attr:`queries`: the number of queries used to reach a result
     - :py:attr:`time`: the amount of time the check took to complete
     - :py:attr:`article_chain`: the MarkovChain of the article text
-    - :py:attr:`source_chain`: the MarkovChain of the violated page text
-    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
     """
 
-    def __init__(self, violation, confidence, url, queries, time, article,
-                 chains):
+    def __init__(self, violation, sources, queries, check_time, article_chain):
         self.violation = violation
-        self.confidence = confidence
-        self.url = url
+        self.sources = sources
         self.queries = queries
-        self.time = time
-        self.article_chain = article
-        self.source_chain = chains[0]
-        self.delta_chain = chains[1]
+        self.time = check_time
+        self.article_chain = article_chain
 
     def __repr__(self):
         """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
-        return res.format(self.violation, self.confidence, self.url,
-                          self.queries)
+        res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
+        return res.format(self.violation, self.sources, self.queries,
+                          self.time)
 
     def __str__(self):
         """Return a nice string representation of the result."""
-        res = "<CopyvioCheckResult ({0} with {1} conf)>"
-        return res.format(self.violation, self.confidence)
+        res = "<CopyvioCheckResult ({0} with best {1})>"
+        return res.format(self.violation, self.best)
 
+    @property
+    def best(self):
+        """The best known source, or None if no sources exist."""
+        return self.sources[0] if self.sources else None
+
+    @property
+    def confidence(self):
+        """The confidence of the best source, or 0 if no sources exist."""
+        return self.best.confidence if self.best else 0.0
+
+    @property
+    def url(self):
+        """The url of the best source, or None if no sources exist."""
+        return self.best.url if self.best else None
+
     def get_log_message(self, title):
         """Build a relevant log message for this copyvio check result."""
-        log = u"{0} for [[{1}]] (confidence: {2}; URL: {3}; {4} queries; {5} seconds)"
+        if not self.sources:
+            log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
+            return log.format(title, self.queries, self.time)
+        log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} queries; {5} seconds)"
         is_vio = "Violation detected" if self.violation else "No violation"
-        return log.format(is_vio, title, self.confidence, self.url,
+        return log.format(is_vio, title, self.url, self.confidence,
                           self.queries, self.time)
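
The result object no longer stores a single confidence/url pair; it keeps the whole sorted source list and derives best, confidence, and url from it on demand. A small usage sketch of that delegation follows; the stand-in source class and the sample numbers are invented, and only CopyvioCheckResult itself comes from the module above.

# Usage sketch for the refactored CopyvioCheckResult; FakeSource mimics just the
# two attributes the properties read, and the sample values are arbitrary.
from earwigbot.wiki.copyvios.result import CopyvioCheckResult

class FakeSource(object):
    def __init__(self, url, confidence):
        self.url = url
        self.confidence = confidence

sources = [FakeSource("http://b.example/page", 0.84),
           FakeSource("http://a.example/page", 0.31)]  # already sorted best-first

result = CopyvioCheckResult(violation=True, sources=sources, queries=7,
                            check_time=12.5, article_chain=None)
print result.best.url    # http://b.example/page
print result.confidence  # 0.84  (proxied to result.best)
print result.url         # http://b.example/page
print result.get_log_message(u"Example title")
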
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import HTMLTextParser
-from earwigbot.wiki.copyvios.result import CopyvioSource
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
 
 tldextract = importer.new("tldextract")
 
@@ -120,7 +120,8 @@ class _CopyvioWorker(object):
         If a URLError was raised while opening the URL or an IOError was raised
         while decompressing, None will be returned.
         """
-        self._opener.addheaders = source.headers
+        if source.headers:
+            self._opener.addheaders = source.headers
         url = source.url.encode("utf8")
         try:
             response = self._opener.open(url, timeout=source.timeout)
@@ -194,8 +195,8 @@ class _CopyvioWorker(object):
             return self._dequeue()
 
         self._logger.debug(u"Got source URL: {0}".format(source.url))
-        if source.touched():
-            self._logger.debug("Source has been cancelled")
+        if source.skipped:
+            self._logger.debug("Source has been skipped")
             self._queues.lock.release()
             return self._dequeue()
 
@@ -232,18 +233,18 @@
 class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""
 
-    def __init__(self, article, min_confidence, until, logger, headers,
+    def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8):
-        self.best = CopyvioSource(self, None, None)
         self.sources = []
+        self.finished = False
 
         self._article = article
         self._logger = logger.getChild("copyvios")
         self._min_confidence = min_confidence
-        self._until = until
+        self._start_time = time()
+        self._until = (self._start_time + max_time) if max_time > 0 else None
         self._handled_urls = []
-        self._is_finished = False
-        self._compare_lock = Lock()
+        self._finish_lock = Lock()
         self._source_args = {"workspace": self, "headers": headers,
                              "timeout": url_timeout}
 
@@ -254,7 +255,7 @@ class CopyvioWorkspace(object):
             self._num_workers = num_workers
             for i in xrange(num_workers):
                 name = "local-{0:04}.{1}".format(id(self) % 10000, i)
-                _CopyvioWorker(name, self._queues, until).start()
+                _CopyvioWorker(name, self._queues, self._until).start()
 
     def _calculate_confidence(self, delta):
         """Return the confidence of a violation as a float between 0 and 1."""
@@ -294,13 +295,11 @@ class CopyvioWorkspace(object):
 
     def _finish_early(self):
         """Finish handling links prematurely (if we've hit min_confidence)."""
-        if self._is_finished:
-            return
-        self._logger.debug("Confidence threshold met; cancelling remaining sources")
+        self._logger.debug("Confidence threshold met; skipping remaining sources")
         with self._queues.lock:
             for source in self.sources:
-                source.cancel()
-            self._is_finished = True
+                source.skip()
+            self.finished = True
 
     def enqueue(self, urls, exclude_check=None):
         """Put a list of URLs into the various worker queues.
@@ -310,9 +309,9 @@ class CopyvioWorkspace(object):
         """
         for url in urls:
             with self._queues.lock:
-                if self._is_finished:
+                if self.finished:
                     break
-                if not url or url in self._handled_urls:
+                if url in self._handled_urls:
                     continue
                 self._handled_urls.append(url)
                 if exclude_check and exclude_check(url):
@@ -336,26 +335,29 @@ class CopyvioWorkspace(object):
                     queue.append(source)
                     self._queues.unassigned.put((key, queue))
 
+    def compare(self, source, source_chain):
+        """Compare a source to the article; call _finish_early if necessary."""
+        delta = MarkovChainIntersection(self._article, source_chain)
+        conf = self._calculate_confidence(delta)
+        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
+        with self._finish_lock:
+            source.finish_work(conf, source_chain, delta)
+            if not self.finished and conf >= self._min_confidence:
+                self._finish_early()
+
     def wait(self):
         """Wait for the workers to finish handling the sources."""
         self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
         for source in self.sources:
             source.join(self._until)
-        with self._compare_lock:
+        with self._finish_lock:
             pass  # Wait for any remaining comparisons to be finished
         if not _is_globalized:
             for i in xrange(self._num_workers):
                 self._queues.unassigned.put((StopIteration, None))
 
-    def compare(self, source, source_chain):
-        """Compare a source to the article, and update the best known one."""
-        delta = MarkovChainIntersection(self._article, source_chain)
-        conf = self._calculate_confidence(delta)
-        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
-        with self._compare_lock:
-            source.finish_work(conf, source_chain, delta)
-            if conf > self.best.confidence:
-                self.best = source
-            if conf >= self._min_confidence:
-                self._finish_early()
+    def get_result(self, num_queries=0):
+        """Return a CopyvioCheckResult containing the results of this check."""
+        self.sources.sort(key=lambda source: source.confidence, reverse=True)
+        return CopyvioCheckResult(self.finished, self.sources, num_queries,
+                                  time() - self._start_time, self._article)
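
get_result() is now the single place a CopyvioCheckResult gets built: sources are sorted best-first, and the workspace's finished flag, which _finish_early() sets once some comparison reaches min_confidence, is passed through as the violation flag. A rough, self-contained sketch of that bookkeeping follows; FakeWorkspace and FakeSource are invented stand-ins rather than earwigbot classes, and the tuple returned below takes the place of the real CopyvioCheckResult.

# Rough standalone sketch of the get_result() bookkeeping.
from time import time

class FakeSource(object):
    def __init__(self, url, confidence):
        self.url = url
        self.confidence = confidence

class FakeWorkspace(object):
    def __init__(self, min_confidence):
        self.sources = []
        self.finished = False
        self._min_confidence = min_confidence
        self._start_time = time()

    def compare(self, url, confidence):
        # Record one "comparison"; flip finished when the threshold is met,
        # mirroring CopyvioWorkspace.compare() calling _finish_early().
        self.sources.append(FakeSource(url, confidence))
        if not self.finished and confidence >= self._min_confidence:
            self.finished = True

    def get_result(self, num_queries=0):
        self.sources.sort(key=lambda source: source.confidence, reverse=True)
        return (self.finished, self.sources, num_queries,
                time() - self._start_time)

ws = FakeWorkspace(min_confidence=0.75)
ws.compare("http://a.example", 0.40)
ws.compare("http://b.example", 0.91)   # crosses the threshold
violation, sources, queries, elapsed = ws.get_result(num_queries=2)
print violation, sources[0].url        # True http://b.example
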