From d81c63731b41f8929d9c8560b07fd46386b93600 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 4 Sep 2014 22:08:18 -0500
Subject: [PATCH] Update cache table schema quite a bit.

---
 tasks/afc_copyvios.py         | 34 ++++++++++++++++++----------------
 tasks/schema/afc_copyvios.sql | 43 +++++++++++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/tasks/afc_copyvios.py b/tasks/afc_copyvios.py
index 2944872..749edcb 100644
--- a/tasks/afc_copyvios.py
+++ b/tasks/afc_copyvios.py
@@ -40,7 +40,7 @@ class AFCCopyvios(Task):
         cfg = self.config.tasks.get(self.name, {})
         self.template = cfg.get("template", "AfC suspected copyvio")
         self.ignore_list = cfg.get("ignoreList", [])
-        self.min_confidence = cfg.get("minConfidence", 0.5)
+        self.min_confidence = cfg.get("minConfidence", 0.75)
         self.max_queries = cfg.get("maxQueries", 10)
         self.max_time = cfg.get("maxTime", 150)
         self.cache_results = cfg.get("cacheResults", False)
@@ -187,29 +187,31 @@ class AFCCopyvios(Task):
     def cache_result(self, page, result):
         """Store the check's result in a cache table temporarily.
 
-        The cache contains the page's ID, a hash of its content, the URL of the
-        best match, the time of caching, and the number of queries used. It
-        will replace any existing cache entries for that page.
+        The cache contains some data associated with the hash of the page's
+        contents. This data includes the number of queries used, the time to
+        detect a violation, and a list of sources, which store their respective
+        URLs, confidence values, and skipped states.
 
-        The cache is intended for EarwigBot's complementary Toolserver web
+        The cache is intended for EarwigBot's complementary Tool Labs web
         interface, in which copyvio checks can be done separately from the
         bot. The cache saves time and money by saving the result of the web
         search but neither the result of the comparison nor any actual text
         (which could violate data retention policy). Cache entries are
         (intended to be) retained for three days; this task does not remove
-        old entries (that is handled by the Toolserver component).
+        old entries (that is handled by the Tool Labs component).
 
         This will only be called if ``cache_results == True`` in the task's
        config, which is ``False`` by default.
         """
-        query = """INSERT INTO cache
-            VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, 0)
-            ON DUPLICATE KEY UPDATE
-            cache_url = ?, cache_time = CURRENT_TIMESTAMP,
-            cache_queries = ?, cache_process_time = 0"""
-        mode = "1:1:"
-        shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
-        args = (page.pageid, shahash, result.url, result.queries, result.url,
-                result.queries)
+        query1 = "DELETE FROM cache WHERE cache_id = ?"
+        query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
+        query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
+        cache_id = sha256("1:1:" + page.get().encode("utf8")).digest()
+        data = [(cache_id, source.url, source.confidence, source.skipped)
+                for source in result.sources]
         with self.conn.cursor() as cursor:
-            cursor.execute(query, args)
+            cursor.execute("START TRANSACTION")
+            cursor.execute(query1, (cache_id,))
+            cursor.execute(query2, (cache_id, result.queries, result.time))
+            cursor.executemany(query3, data)
+            cursor.execute("COMMIT")
diff --git a/tasks/schema/afc_copyvios.sql b/tasks/schema/afc_copyvios.sql
index 3ae3038..04a7845 100644
--- a/tasks/schema/afc_copyvios.sql
+++ b/tasks/schema/afc_copyvios.sql
@@ -5,8 +5,8 @@
 -- Server version 5.1.59
 CREATE DATABASE `u_earwig_afc_copyvios`
-    DEFAULT CHARACTER SET utf8
-    DEFAULT COLLATE utf8_unicode_ci;
+  DEFAULT CHARACTER SET utf8
+  DEFAULT COLLATE utf8_unicode_ci;
 
 --
 -- Table structure for table `cache`
 --
@@ -14,14 +14,29 @@ CREATE DATABASE `u_earwig_afc_copyvios`
 DROP TABLE IF EXISTS `cache`;
 CREATE TABLE `cache` (
-  `cache_id` int(10) unsigned NOT NULL,
-  `cache_hash` char(64) COLLATE utf8_unicode_ci NOT NULL,
-  `cache_url` varchar(512) COLLATE utf8_unicode_ci DEFAULT NULL,
-  `cache_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
-  `cache_queries` int(4) DEFAULT NULL,
-  `cache_process_time` float DEFAULT NULL,
-  PRIMARY KEY (`cache_id`, `cache_hash`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
+  `cache_id` BINARY(32) NOT NULL,
+  `cache_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  `cache_queries` INT(4) NOT NULL DEFAULT 0,
+  `cache_process_time` FLOAT NOT NULL DEFAULT 0,
+  PRIMARY KEY (`cache_id`)
+) ENGINE=InnoDB;
+
+--
+-- Table structure for table `cache_data`
+--
+
+DROP TABLE IF EXISTS `cache_data`;
+CREATE TABLE `cache_data` (
+  `cdata_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+  `cdata_cache_id` BINARY(32) NOT NULL,
+  `cdata_url` VARCHAR(512) NOT NULL,
+  `cdata_confidence` FLOAT NOT NULL DEFAULT 0,
+  `cdata_skipped` BOOLEAN NOT NULL DEFAULT FALSE,
+  PRIMARY KEY (`cdata_id`),
+  FOREIGN KEY (`cdata_cache_id`)
+    REFERENCES `cache` (`cache_id`)
+    ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=InnoDB;
 
 --
 -- Table structure for table `processed`
 --
@@ -29,8 +44,8 @@ CREATE TABLE `cache` (
 DROP TABLE IF EXISTS `processed`;
 CREATE TABLE `processed` (
-  `page_id` int(10) unsigned NOT NULL,
-  PRIMARY KEY (`page_id`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
+  `page_id` INT(10) UNSIGNED NOT NULL,
+  PRIMARY KEY (`page_id`)
+) ENGINE=InnoDB;
 
--- Dump completed on 2012-07-20 20:21:00
+-- Dump completed on 2014-08-04 20:00:00
 
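
For reference, entries written by cache_result() can be read back with a single join on the new
schema. The sketch below is illustrative only and not part of this patch: the helper names
(get_cached_result, purge_old_entries) and the connection setup are hypothetical, and it assumes a
DB-API-style connection whose cursors accept the same "?" placeholders used above (as oursql does).
The purge helper merely illustrates the three-day retention described in the docstring, which is
handled by the Tool Labs component rather than by this task; cache_data rows are removed with their
parent row via ON DELETE CASCADE.

    from hashlib import sha256

    def get_cached_result(conn, page_text):
        # Hypothetical lookup, as the Tool Labs interface might perform it:
        # hash the page text the same way cache_result() does, then join the
        # cache row with its cache_data rows.
        cache_id = sha256("1:1:" + page_text.encode("utf8")).digest()
        query = """SELECT cache_queries, cache_process_time,
                          cdata_url, cdata_confidence, cdata_skipped
                   FROM cache
                   LEFT JOIN cache_data ON cdata_cache_id = cache_id
                   WHERE cache_id = ?"""
        with conn.cursor() as cursor:
            cursor.execute(query, (cache_id,))
            rows = cursor.fetchall()
        if not rows:
            return None  # not cached, or already purged
        queries, process_time = rows[0][0], rows[0][1]
        sources = [(url, confidence, bool(skipped))
                   for _, _, url, confidence, skipped in rows
                   if url is not None]
        return queries, process_time, sources

    def purge_old_entries(conn, days=3):
        # Hypothetical retention job (run by the Tool Labs component, not by
        # this task): delete cache rows older than the retention window; the
        # foreign key's ON DELETE CASCADE removes their cache_data rows too.
        query = ("DELETE FROM cache "
                 "WHERE cache_time < DATE_SUB(NOW(), INTERVAL ? DAY)")
        with conn.cursor() as cursor:
            cursor.execute(query, (days,))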