ソースを参照

Update cache table schema quite a bit.

pull/15/head
Ben Kurtovic 9年前
コミット
d81c63731b
2個のファイルの変更47行の追加29行の削除
  1. +18
    -16
      tasks/afc_copyvios.py
  2. +29
    -13
      tasks/schema/afc_copyvios.sql

+ 18
- 16
tasks/afc_copyvios.py ファイルの表示

@@ -40,7 +40,7 @@ class AFCCopyvios(Task):
cfg = self.config.tasks.get(self.name, {}) cfg = self.config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio") self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", []) self.ignore_list = cfg.get("ignoreList", [])
self.min_confidence = cfg.get("minConfidence", 0.5)
self.min_confidence = cfg.get("minConfidence", 0.75)
self.max_queries = cfg.get("maxQueries", 10) self.max_queries = cfg.get("maxQueries", 10)
self.max_time = cfg.get("maxTime", 150) self.max_time = cfg.get("maxTime", 150)
self.cache_results = cfg.get("cacheResults", False) self.cache_results = cfg.get("cacheResults", False)
@@ -187,29 +187,31 @@ class AFCCopyvios(Task):
def cache_result(self, page, result): def cache_result(self, page, result):
"""Store the check's result in a cache table temporarily. """Store the check's result in a cache table temporarily.


The cache contains the page's ID, a hash of its content, the URL of the
best match, the time of caching, and the number of queries used. It
will replace any existing cache entries for that page.
The cache contains some data associated with the hash of the page's
contents. This data includes the number of queries used, the time to
detect a violation, and a list of sources, which store their respective
URLs, confidence values, and skipped states.


The cache is intended for EarwigBot's complementary Toolserver web
The cache is intended for EarwigBot's complementary Tool Labs web
interface, in which copyvio checks can be done separately from the bot. interface, in which copyvio checks can be done separately from the bot.
The cache saves time and money by saving the result of the web search The cache saves time and money by saving the result of the web search
but neither the result of the comparison nor any actual text (which but neither the result of the comparison nor any actual text (which
could violate data retention policy). Cache entries are (intended to could violate data retention policy). Cache entries are (intended to
be) retained for three days; this task does not remove old entries be) retained for three days; this task does not remove old entries
(that is handled by the Toolserver component).
(that is handled by the Tool Labs component).


This will only be called if ``cache_results == True`` in the task's This will only be called if ``cache_results == True`` in the task's
config, which is ``False`` by default. config, which is ``False`` by default.
""" """
query = """INSERT INTO cache
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, 0)
ON DUPLICATE KEY UPDATE
cache_url = ?, cache_time = CURRENT_TIMESTAMP,
cache_queries = ?, cache_process_time = 0"""
mode = "1:1:"
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
args = (page.pageid, shahash, result.url, result.queries, result.url,
result.queries)
query1 = "DELETE FROM cache WHERE cache_id = ?"
query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
cache_id = sha256("1:1:" + page.get().encode("utf8")).digest()
data = [(cache_id, source.url, source.confidence, source.skipped)
for source in result.sources]
with self.conn.cursor() as cursor: with self.conn.cursor() as cursor:
cursor.execute(query, args)
cursor.execute("START TRANSACTION")
cursor.execute(query1, (cache_id,))
cursor.execute(query2, (cache_id, result.queries, result.time))
cursor.executemany(query3, data)
cursor.execute("COMMIT")

+ 29
- 13
tasks/schema/afc_copyvios.sql ファイルの表示

@@ -5,8 +5,8 @@
-- Server version 5.1.59 -- Server version 5.1.59


CREATE DATABASE `u_earwig_afc_copyvios` CREATE DATABASE `u_earwig_afc_copyvios`
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;


-- --
-- Table structure for table `cache` -- Table structure for table `cache`
@@ -14,14 +14,29 @@ CREATE DATABASE `u_earwig_afc_copyvios`


DROP TABLE IF EXISTS `cache`; DROP TABLE IF EXISTS `cache`;
CREATE TABLE `cache` ( CREATE TABLE `cache` (
`cache_id` int(10) unsigned NOT NULL,
`cache_hash` char(64) COLLATE utf8_unicode_ci NOT NULL,
`cache_url` varchar(512) COLLATE utf8_unicode_ci DEFAULT NULL,
`cache_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
`cache_queries` int(4) DEFAULT NULL,
`cache_process_time` float DEFAULT NULL,
PRIMARY KEY (`cache_id`, `cache_hash`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`cache_id` BINARY(32) NOT NULL,
`cache_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`cache_queries` INT(4) NOT NULL DEFAULT 0,
`cache_process_time` FLOAT NOT NULL DEFAULT 0,
PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB;

--
-- Table structure for table `cache_data`
--

-- One row per source recorded for a cached check; each row belongs to the
-- `cache` entry whose `cache_id` it references.
DROP TABLE IF EXISTS `cache_data`;
CREATE TABLE `cache_data` (
    -- Surrogate key; has no meaning outside this table.
    `cdata_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- Same type and width as `cache`.`cache_id`, which it references below.
    `cdata_cache_id` BINARY(32) NOT NULL,
    `cdata_url` VARCHAR(512) NOT NULL,
    `cdata_confidence` FLOAT NOT NULL DEFAULT 0,
    -- BOOLEAN is numeric in MySQL, so the default must be the boolean literal
    -- FALSE rather than the quoted string "false".
    `cdata_skipped` BOOLEAN NOT NULL DEFAULT FALSE,
    PRIMARY KEY (`cdata_id`),
    -- Deleting or re-keying a `cache` row removes or re-keys its sources too.
    FOREIGN KEY (`cdata_cache_id`)
        REFERENCES `cache` (`cache_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;


-- --
-- Table structure for table `processed` -- Table structure for table `processed`
@@ -29,8 +44,8 @@ CREATE TABLE `cache` (


DROP TABLE IF EXISTS `processed`; DROP TABLE IF EXISTS `processed`;
CREATE TABLE `processed` ( CREATE TABLE `processed` (
`page_id` int(10) unsigned NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`page_id` INT(10) UNSIGNED NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB;


-- Dump completed on 2014-08-04 20:00:00

読み込み中…
キャンセル
保存