浏览代码

Update cache table schema quite a bit.

pull/15/head
Ben Kurtovic 9 年前
父节点
当前提交
d81c63731b
共有 2 个文件被更改,包括 47 次插入和 29 次删除
  1. +18
    -16
      tasks/afc_copyvios.py
  2. +29
    -13
      tasks/schema/afc_copyvios.sql

+ 18
- 16
tasks/afc_copyvios.py 查看文件

@@ -40,7 +40,7 @@ class AFCCopyvios(Task):
cfg = self.config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", [])
self.min_confidence = cfg.get("minConfidence", 0.5)
self.min_confidence = cfg.get("minConfidence", 0.75)
self.max_queries = cfg.get("maxQueries", 10)
self.max_time = cfg.get("maxTime", 150)
self.cache_results = cfg.get("cacheResults", False)
@@ -187,29 +187,31 @@ class AFCCopyvios(Task):
def cache_result(self, page, result):
"""Store the check's result in a cache table temporarily.

The cache contains the page's ID, a hash of its content, the URL of the
best match, the time of caching, and the number of queries used. It
will replace any existing cache entries for that page.
The cache contains some data associated with the hash of the page's
contents. This data includes the number of queries used, the time to
detect a violation, and a list of sources, which store their respective
URLs, confidence values, and skipped states.

The cache is intended for EarwigBot's complementary Toolserver web
The cache is intended for EarwigBot's complementary Tool Labs web
interface, in which copyvio checks can be done separately from the bot.
The cache saves time and money by saving the result of the web search
but neither the result of the comparison nor any actual text (which
could violate data retention policy). Cache entries are (intended to
be) retained for three days; this task does not remove old entries
(that is handled by the Toolserver component).
(that is handled by the Tool Labs component).

This will only be called if ``cache_results == True`` in the task's
config, which is ``False`` by default.
"""
query = """INSERT INTO cache
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, 0)
ON DUPLICATE KEY UPDATE
cache_url = ?, cache_time = CURRENT_TIMESTAMP,
cache_queries = ?, cache_process_time = 0"""
mode = "1:1:"
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
args = (page.pageid, shahash, result.url, result.queries, result.url,
result.queries)
query1 = "DELETE FROM cache WHERE cache_id = ?"
query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
cache_id = sha256("1:1:" + page.get().encode("utf8")).digest()
data = [(cache_id, source.url, source.confidence, source.skipped)
for source in result.sources]
with self.conn.cursor() as cursor:
cursor.execute(query, args)
cursor.execute("START TRANSACTION")
cursor.execute(query1, (cache_id,))
cursor.execute(query2, (cache_id, result.queries, result.time))
cursor.executemany(query3, data)
cursor.execute("COMMIT")

+ 29
- 13
tasks/schema/afc_copyvios.sql 查看文件

@@ -5,8 +5,8 @@
-- Server version 5.1.59

CREATE DATABASE `u_earwig_afc_copyvios`
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;

--
-- Table structure for table `cache`
@@ -14,14 +14,29 @@ CREATE DATABASE `u_earwig_afc_copyvios`

DROP TABLE IF EXISTS `cache`;
CREATE TABLE `cache` (
`cache_id` int(10) unsigned NOT NULL,
`cache_hash` char(64) COLLATE utf8_unicode_ci NOT NULL,
`cache_url` varchar(512) COLLATE utf8_unicode_ci DEFAULT NULL,
`cache_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
`cache_queries` int(4) DEFAULT NULL,
`cache_process_time` float DEFAULT NULL,
PRIMARY KEY (`cache_id`, `cache_hash`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`cache_id` BINARY(32) NOT NULL,
`cache_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`cache_queries` INT(4) NOT NULL DEFAULT 0,
`cache_process_time` FLOAT NOT NULL DEFAULT 0,
PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB;

--
-- Table structure for table `cache_data`
--

DROP TABLE IF EXISTS `cache_data`;
CREATE TABLE `cache_data` (
`cdata_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`cdata_cache_id` BINARY(32) NOT NULL,
`cdata_url` VARCHAR(512) NOT NULL,
`cdata_confidence` FLOAT NOT NULL DEFAULT 0,
`cdata_skipped` BOOLEAN NOT NULL DEFAULT "false",
PRIMARY KEY (`cdata_id`),
FOREIGN KEY (`cdata_cache_id`)
REFERENCES `cache` (`cache_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

--
-- Table structure for table `processed`
@@ -29,8 +44,8 @@ CREATE TABLE `cache` (

DROP TABLE IF EXISTS `processed`;
CREATE TABLE `processed` (
`page_id` int(10) unsigned NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`page_id` INT(10) UNSIGNED NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB;

-- Dump completed on 2014-08-04 20:00:00

正在加载...
取消
保存