@@ -26,7 +26,6 @@ from urllib2 import build_opener
 from earwigbot import exceptions, importer
 from earwigbot.wiki.copyvios.markov import MarkovChain
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser
-from earwigbot.wiki.copyvios.result import CopyvioCheckResult
 from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 from earwigbot.wiki.copyvios.workers import (
     globalize, localize, CopyvioWorkspace)
@@ -109,12 +108,10 @@ class CopyvioMixIn(object):
         """
         log = u"Starting copyvio check for [[{0}]]"
         self._logger.info(log.format(self.title))
-        start_time = time()
-        until = (start_time + max_time) if max_time > 0 else None
         searcher = self._get_search_engine()
         parser = ArticleTextParser(self.get())
         article = MarkovChain(parser.strip())
-        workspace = CopyvioWorkspace(article, min_confidence, until,
+        workspace = CopyvioWorkspace(article, min_confidence, max_time,
                                      self._logger, self._addheaders)
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
@@ -123,8 +120,7 @@ class CopyvioMixIn(object):
             exclude = None

         if article.size < 20:  # Auto-fail very small articles
-            result = CopyvioCheckResult(False, 0.0, None, 0, 0, article,
-                                        workspace.best.chains)
+            result = workspace.get_result()
             self._logger.info(result.get_log_message(self.title))
             return result
@@ -134,19 +130,15 @@ class CopyvioMixIn(object):
         if not no_searches:
             chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
             for chunk in chunks:
-                if workspace.best.confidence >= min_confidence:
+                if workspace.finished:
                     break
                 log = u"[[{0}]] -> querying {1} for {2!r}"
                 self._logger.debug(log.format(self.title, searcher.name, chunk))
                 workspace.enqueue(searcher.search(chunk), exclude)
                 num_queries += 1
                 sleep(1)

         workspace.wait()
-        result = CopyvioCheckResult(
-            workspace.best.confidence >= min_confidence,
-            workspace.best.confidence, workspace.best.url, num_queries,
-            time() - start_time, article, workspace.best.chains)
+        result = workspace.get_result(num_queries)
         self._logger.info(result.get_log_message(self.title))
         return result
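
With result construction delegated to the workspace, callers of copyvio_check() only see the returned CopyvioCheckResult. A minimal usage sketch (hypothetical thresholds; `page` stands for a Page object obtained from a configured Site):

    # Hedged sketch: keyword values are illustrative, not the library defaults.
    result = page.copyvio_check(min_confidence=0.75, max_queries=15, max_time=150)
    if result.violation:
        # url and confidence are now read through the best-ranked source
        print result.url, result.confidence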
@@ -173,17 +165,12 @@ class CopyvioMixIn(object):
         """
         log = u"Starting copyvio compare for [[{0}]] against {1}"
         self._logger.info(log.format(self.title, url))
-        start_time = time()
-        until = (start_time + max_time) if max_time > 0 else None
         article = MarkovChain(ArticleTextParser(self.get()).strip())
         workspace = CopyvioWorkspace(
-            article, min_confidence, until, self._logger, self._addheaders,
+            article, min_confidence, max_time, self._logger, self._addheaders,
             max_time, 1)
         workspace.enqueue([url])
         workspace.wait()
-        best = workspace.best
-        result = CopyvioCheckResult(
-            best.confidence >= min_confidence, best.confidence, best.url, 0,
-            time() - start_time, article, best.chains)
+        result = workspace.get_result()
         self._logger.info(result.get_log_message(self.title))
         return result
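
The single-URL comparison path now goes through the same workspace machinery. A sketch of calling it against one suspect URL (the URL and thresholds are placeholders):

    # Hedged sketch: `page` is the same kind of Page object as above.
    result = page.copyvio_compare(u"http://example.com/mirror", min_confidence=0.75, max_time=30)
    print result        # e.g. <CopyvioCheckResult (True with best <CopyvioSource (...)>)>
    print result.best   # the single CopyvioSource the page was compared against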
@@ -30,7 +30,7 @@ from earwigbot import exceptions
 __all__ = ["ExclusionsDB"]

-default_sources = {
+DEFAULT_SOURCES = {
     "all": [  # Applies to all, but located on enwiki
         "User:EarwigBot/Copyvios/Exclusions"
     ],
@@ -74,7 +74,7 @@ class ExclusionsDB(object):
         """
         query = "INSERT INTO sources VALUES (?, ?);"
         sources = []
-        for sitename, pages in default_sources.iteritems():
+        for sitename, pages in DEFAULT_SOURCES.iteritems():
             for page in pages:
                 sources.append((sitename, page))
@@ -95,8 +95,9 @@ class ExclusionsDB(object):
             r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
         ]
         for regex in regexes:
-            find = re.findall(regex, data, re.I|re.M)
-            [urls.add(url.lower().strip()) for url in find if url.strip()]
+            for url in re.findall(regex, data, re.I|re.M):
+                if url.strip():
+                    urls.add(url.lower().strip())
         return urls

     def _update(self, sitename):
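
The rewritten loop keeps the same extraction behaviour as the old side-effect-only list comprehension, just stated as a plain loop. A standalone sketch of the pattern, using the Site: regex from above on a made-up exclusion line (the helper name is illustrative, not part of the module):

    import re

    SITE_REGEX = r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"

    def extract_urls(data):
        # Same shape as the loop above: collect lowercased, stripped matches.
        urls = set()
        for url in re.findall(SITE_REGEX, data, re.I | re.M):
            if url.strip():
                urls.add(url.lower().strip())
        return urls

    print extract_urls(u"* Site: [http://Example.com/mirror]")
    # set([u'example.com/mirror'])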
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

-import errno
 from os import path

 import mwparserfromhell
@@ -28,7 +28,12 @@ from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
 __all__ = ["CopyvioSource", "CopyvioCheckResult"]

 class CopyvioSource(object):
-    """Represents a single suspected violation source (a URL)."""
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Source**
+
+    A class that represents a single possible source of a copyright violation,
+    i.e., a URL.
+    """

     def __init__(self, workspace, url, key, headers=None, timeout=5):
         self.workspace = workspace
@@ -38,14 +43,23 @@ class CopyvioSource(object):
         self.timeout = timeout
         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)
+        self.skipped = False

         self._event1 = Event()
         self._event2 = Event()
         self._event2.set()

-    def touched(self):
-        """Return whether one of start_work() and cancel() have been called."""
-        return self._event1.is_set()
+    def __repr__(self):
+        """Return the canonical string representation of the source."""
+        res = "CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r})"
+        return res.format(self.url, self.confidence, self.skipped)
+
+    def __str__(self):
+        """Return a nice string representation of the source."""
+        if self.skipped:
+            return "<CopyvioSource ({0}, skipped)>".format(self.url)
+        res = "<CopyvioSource ({0} with {1} conf)>"
+        return res.format(self.url, self.confidence)

     def start_work(self):
         """Mark this source as being worked on right now."""
@@ -58,8 +72,11 @@ class CopyvioSource(object):
         self.chains = (source_chain, delta_chain)
         self._event2.set()

-    def cancel(self):
+    def skip(self):
         """Deactivate this source without filling in the relevant data."""
+        if self._event1.is_set():
+            return
+        self.skipped = True
         self._event1.set()

     def join(self, until):
@@ -70,6 +87,8 @@ class CopyvioSource(object):
             if timeout <= 0:
                 return
             event.wait(timeout)
+        else:
+            event.wait()


 class CopyvioCheckResult(object):
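
Together with the unconditional event.wait() above, the new skipped flag gives CopyvioSource a simple lifecycle: a source is either worked on (start_work()/finish_work()) or skipped, and join() blocks until one of those has happened. A minimal sketch of the skip path (a real source is created by CopyvioWorkspace, never by hand; workspace=None is a stand-in):

    from earwigbot.wiki.copyvios.result import CopyvioSource

    source = CopyvioSource(workspace=None, url=u"http://example.com/", key=u"example.com")
    source.skip()          # deactivate without fetching or comparing anything
    print source.skipped   # True
    print source           # <CopyvioSource (http://example.com/, skipped)>
    source.join(None)      # returns immediately: skip() set the first event,
                           # and __init__ already set the second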
@@ -81,40 +100,51 @@ class CopyvioCheckResult(object):
     *Attributes:*

     - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
-    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
-    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`sources`: a list of CopyvioSources, sorted by confidence
     - :py:attr:`queries`: the number of queries used to reach a result
     - :py:attr:`time`: the amount of time the check took to complete
     - :py:attr:`article_chain`: the MarkovChain of the article text
-    - :py:attr:`source_chain`: the MarkovChain of the violated page text
-    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
     """

-    def __init__(self, violation, confidence, url, queries, time, article,
-                 chains):
+    def __init__(self, violation, sources, queries, check_time, article_chain):
         self.violation = violation
-        self.confidence = confidence
-        self.url = url
+        self.sources = sources
         self.queries = queries
-        self.time = time
-        self.article_chain = article
-        self.source_chain = chains[0]
-        self.delta_chain = chains[1]
+        self.time = check_time
+        self.article_chain = article_chain

     def __repr__(self):
         """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
-        return res.format(self.violation, self.confidence, self.url,
-                          self.queries)
+        res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
+        return res.format(self.violation, self.sources, self.queries,
+                          self.time)

     def __str__(self):
         """Return a nice string representation of the result."""
-        res = "<CopyvioCheckResult ({0} with {1} conf)>"
-        return res.format(self.violation, self.confidence)
+        res = "<CopyvioCheckResult ({0} with best {1})>"
+        return res.format(self.violation, self.best)
+
+    @property
+    def best(self):
+        """The best known source, or None if no sources exist."""
+        return self.sources[0] if self.sources else None
+
+    @property
+    def confidence(self):
+        """The confidence of the best source, or 0 if no sources exist."""
+        return self.best.confidence if self.best else 0.0
+
+    @property
+    def url(self):
+        """The url of the best source, or None if no sources exist."""
+        return self.best.url if self.best else None

     def get_log_message(self, title):
         """Build a relevant log message for this copyvio check result."""
-        log = u"{0} for [[{1}]] (confidence: {2}; URL: {3}; {4} queries; {5} seconds)"
+        if not self.sources:
+            log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
+            return log.format(title, self.queries, self.time)
+        log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} queries; {5} seconds)"
         is_vio = "Violation detected" if self.violation else "No violation"
-        return log.format(is_vio, title, self.confidence, self.url,
+        return log.format(is_vio, title, self.url, self.confidence,
                           self.queries, self.time)
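
With confidence and url now derived from the best-ranked source, a result built with no sources at all degrades gracefully. A quick sketch against the constructor shown above (the chain text and timing values are arbitrary):

    from earwigbot.wiki.copyvios.markov import MarkovChain
    from earwigbot.wiki.copyvios.result import CopyvioCheckResult

    article = MarkovChain(u"some stripped article text")
    empty = CopyvioCheckResult(False, [], 0, 2.5, article)
    print empty.best, empty.confidence, empty.url   # None 0.0 None
    print empty.get_log_message(u"Example")
    # No violation for [[Example]] (no sources; 0 queries; 2.5 seconds)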
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import HTMLTextParser
-from earwigbot.wiki.copyvios.result import CopyvioSource
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

 tldextract = importer.new("tldextract")
@@ -120,7 +120,8 @@ class _CopyvioWorker(object):
         If a URLError was raised while opening the URL or an IOError was raised
         while decompressing, None will be returned.
         """
-        self._opener.addheaders = source.headers
+        if source.headers:
+            self._opener.addheaders = source.headers
         url = source.url.encode("utf8")
         try:
             response = self._opener.open(url, timeout=source.timeout)
@@ -194,8 +195,8 @@ class _CopyvioWorker(object):
             return self._dequeue()

         self._logger.debug(u"Got source URL: {0}".format(source.url))
-        if source.touched():
-            self._logger.debug("Source has been cancelled")
+        if source.skipped:
+            self._logger.debug("Source has been skipped")
             self._queues.lock.release()
             return self._dequeue()
@@ -232,18 +233,18 @@ class _CopyvioWorker(object):
 class CopyvioWorkspace(object):
     """Manages a single copyvio check distributed across threads."""

-    def __init__(self, article, min_confidence, until, logger, headers,
+    def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8):
-        self.best = CopyvioSource(self, None, None)
         self.sources = []
+        self.finished = False

         self._article = article
         self._logger = logger.getChild("copyvios")
         self._min_confidence = min_confidence
-        self._until = until
+        self._start_time = time()
+        self._until = (self._start_time + max_time) if max_time > 0 else None
         self._handled_urls = []
-        self._is_finished = False
-        self._compare_lock = Lock()
+        self._finish_lock = Lock()
         self._source_args = {"workspace": self, "headers": headers,
                              "timeout": url_timeout}
@@ -254,7 +255,7 @@ class CopyvioWorkspace(object):
             self._num_workers = num_workers
             for i in xrange(num_workers):
                 name = "local-{0:04}.{1}".format(id(self) % 10000, i)
-                _CopyvioWorker(name, self._queues, until).start()
+                _CopyvioWorker(name, self._queues, self._until).start()

     def _calculate_confidence(self, delta):
         """Return the confidence of a violation as a float between 0 and 1."""
@@ -294,13 +295,11 @@ class CopyvioWorkspace(object):

     def _finish_early(self):
         """Finish handling links prematurely (if we've hit min_confidence)."""
-        if self._is_finished:
-            return
-        self._logger.debug("Confidence threshold met; cancelling remaining sources")
+        self._logger.debug("Confidence threshold met; skipping remaining sources")
         with self._queues.lock:
             for source in self.sources:
-                source.cancel()
-        self._is_finished = True
+                source.skip()
+        self.finished = True

     def enqueue(self, urls, exclude_check=None):
         """Put a list of URLs into the various worker queues.
@@ -310,9 +309,9 @@ class CopyvioWorkspace(object):
         """
         for url in urls:
             with self._queues.lock:
-                if self._is_finished:
+                if self.finished:
                     break
-                if not url or url in self._handled_urls:
+                if url in self._handled_urls:
                     continue
                 self._handled_urls.append(url)
                 if exclude_check and exclude_check(url):
@@ -336,26 +335,29 @@ class CopyvioWorkspace(object):
                     queue.append(source)
                     self._queues.unassigned.put((key, queue))

+    def compare(self, source, source_chain):
+        """Compare a source to the article; call _finish_early if necessary."""
+        delta = MarkovChainIntersection(self._article, source_chain)
+        conf = self._calculate_confidence(delta)
+        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
+        with self._finish_lock:
+            source.finish_work(conf, source_chain, delta)
+            if not self.finished and conf >= self._min_confidence:
+                self._finish_early()
+
     def wait(self):
         """Wait for the workers to finish handling the sources."""
         self._logger.debug("Waiting on {0} sources".format(len(self.sources)))
         for source in self.sources:
             source.join(self._until)
-        with self._compare_lock:
+        with self._finish_lock:
             pass  # Wait for any remaining comparisons to be finished
         if not _is_globalized:
             for i in xrange(self._num_workers):
                 self._queues.unassigned.put((StopIteration, None))

-    def compare(self, source, source_chain):
-        """Compare a source to the article, and update the best known one."""
-        delta = MarkovChainIntersection(self._article, source_chain)
-        conf = self._calculate_confidence(delta)
-        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
-        with self._compare_lock:
-            source.finish_work(conf, source_chain, delta)
-            if conf > self.best.confidence:
-                self.best = source
-            if conf >= self._min_confidence:
-                self._finish_early()
+    def get_result(self, num_queries=0):
+        """Return a CopyvioCheckResult containing the results of this check."""
+        self.sources.sort(key=lambda source: source.confidence, reverse=True)
+        return CopyvioCheckResult(self.finished, self.sources, num_queries,
+                                  time() - self._start_time, self._article)
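
get_result() ties the refactor together: the workspace ranks its sources by confidence and hands them to the new CopyvioCheckResult, whose best/confidence/url properties then read from the front of that list. A small sketch of the ranking step on stand-in objects (FakeSource is hypothetical and only carries the attributes the sort uses):

    class FakeSource(object):
        def __init__(self, url, confidence):
            self.url = url
            self.confidence = confidence

    sources = [FakeSource(u"http://example.com/a", 0.2),
               FakeSource(u"http://example.com/b", 0.9),
               FakeSource(u"http://example.com/c", 0.5)]
    sources.sort(key=lambda source: source.confidence, reverse=True)
    print [s.url for s in sources]
    # [u'http://example.com/b', u'http://example.com/c', u'http://example.com/a']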