Explorar el Código

Update cache table schema quite a bit.

pull/15/head
Ben Kurtovic hace 10 años
padre
commit
d81c63731b
Se han modificado 2 ficheros con 47 adiciones y 29 borrados
  1. +18
    -16
      tasks/afc_copyvios.py
  2. +29
    -13
      tasks/schema/afc_copyvios.sql

+ 18
- 16
tasks/afc_copyvios.py Ver fichero

@@ -40,7 +40,7 @@ class AFCCopyvios(Task):
cfg = self.config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", [])
self.min_confidence = cfg.get("minConfidence", 0.5)
self.min_confidence = cfg.get("minConfidence", 0.75)
self.max_queries = cfg.get("maxQueries", 10)
self.max_time = cfg.get("maxTime", 150)
self.cache_results = cfg.get("cacheResults", False)
@@ -187,29 +187,31 @@ class AFCCopyvios(Task):
def cache_result(self, page, result):
"""Store the check's result in a cache table temporarily.

The cache contains the page's ID, a hash of its content, the URL of the
best match, the time of caching, and the number of queries used. It
will replace any existing cache entries for that page.
The cache contains some data associated with the hash of the page's
contents. This data includes the number of queries used, the time to
detect a violation, and a list of sources, which store their respective
URLs, confidence values, and skipped states.

The cache is intended for EarwigBot's complementary Toolserver web
The cache is intended for EarwigBot's complementary Tool Labs web
interface, in which copyvio checks can be done separately from the bot.
The cache saves time and money by saving the result of the web search
but neither the result of the comparison nor any actual text (which
could violate data retention policy). Cache entries are (intended to
be) retained for three days; this task does not remove old entries
(that is handled by the Toolserver component).
(that is handled by the Tool Labs component).

This will only be called if ``cache_results == True`` in the task's
config, which is ``False`` by default.
"""
query = """INSERT INTO cache
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, 0)
ON DUPLICATE KEY UPDATE
cache_url = ?, cache_time = CURRENT_TIMESTAMP,
cache_queries = ?, cache_process_time = 0"""
mode = "1:1:"
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
args = (page.pageid, shahash, result.url, result.queries, result.url,
result.queries)
query1 = "DELETE FROM cache WHERE cache_id = ?"
query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
cache_id = sha256("1:1:" + page.get().encode("utf8")).digest()
data = [(cache_id, source.url, source.confidence, source.skipped)
for source in result.sources]
with self.conn.cursor() as cursor:
cursor.execute(query, args)
cursor.execute("START TRANSACTION")
cursor.execute(query1, (cache_id,))
cursor.execute(query2, (cache_id, result.queries, result.time))
cursor.executemany(query3, data)
cursor.execute("COMMIT")

+ 29
- 13
tasks/schema/afc_copyvios.sql Ver fichero

@@ -5,8 +5,8 @@
-- Server version 5.1.59

CREATE DATABASE `u_earwig_afc_copyvios`
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;
DEFAULT CHARACTER SET utf8
DEFAULT COLLATE utf8_unicode_ci;

--
-- Table structure for table `cache`
@@ -14,14 +14,29 @@ CREATE DATABASE `u_earwig_afc_copyvios`

DROP TABLE IF EXISTS `cache`;
CREATE TABLE `cache` (
`cache_id` int(10) unsigned NOT NULL,
`cache_hash` char(64) COLLATE utf8_unicode_ci NOT NULL,
`cache_url` varchar(512) COLLATE utf8_unicode_ci DEFAULT NULL,
`cache_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
`cache_queries` int(4) DEFAULT NULL,
`cache_process_time` float DEFAULT NULL,
PRIMARY KEY (`cache_id`, `cache_hash`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`cache_id` BINARY(32) NOT NULL,
`cache_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`cache_queries` INT(4) NOT NULL DEFAULT 0,
`cache_process_time` FLOAT NOT NULL DEFAULT 0,
PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB;

--
-- Table structure for table `cache_data`
--

DROP TABLE IF EXISTS `cache_data`;
-- One row per source examined during a cached copyvio check. Rows are tied to
-- their parent `cache` entry through `cdata_cache_id` and are removed with it.
CREATE TABLE `cache_data` (
    -- Surrogate key; writers pass DEFAULT and let AUTO_INCREMENT assign it.
    `cdata_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- Parent entry: same BINARY(32) type as `cache`.`cache_id`.
    `cdata_cache_id` BINARY(32) NOT NULL,
    -- URL of the source that was compared against the page.
    `cdata_url` VARCHAR(512) NOT NULL,
    -- Confidence value recorded for this source.
    `cdata_confidence` FLOAT NOT NULL DEFAULT 0,
    -- Whether this source was skipped. BOOLEAN is TINYINT(1) in MySQL, so the
    -- default must be the boolean literal FALSE (0), not the string "false".
    `cdata_skipped` BOOLEAN NOT NULL DEFAULT FALSE,
    PRIMARY KEY (`cdata_id`),
    -- Deleting or re-keying a `cache` row cascades to its source rows.
    FOREIGN KEY (`cdata_cache_id`)
        REFERENCES `cache` (`cache_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

--
-- Table structure for table `processed`
@@ -29,8 +44,8 @@ CREATE TABLE `cache` (

DROP TABLE IF EXISTS `processed`;
CREATE TABLE `processed` (
`page_id` int(10) unsigned NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`page_id` INT(10) UNSIGNED NOT NULL,
PRIMARY KEY (`page_id`)
) ENGINE=InnoDB;

-- Dump completed on 2014-08-04 20:00:00

Cargando…
Cancelar
Guardar