From 7cc85f9bc4be38feace522a5797cc433a38c0419 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Feb 2012 23:35:59 -0500 Subject: [PATCH] afc_copyvios: optionally cache results for the Toolserver. --- earwigbot/tasks/afc_copyvios.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index adf7e9e..26058ec 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from hashlib import sha256 from os.path import expanduser from threading import Lock @@ -42,8 +43,9 @@ class Task(BaseTask): cfg = config.tasks.get(self.name, {}) self.template = cfg.get("template", "AfC suspected copyvio") self.ignore_list = cfg.get("ignoreList", []) - self.min_confidence = cfg.get("minConfidence", 0.75) + self.min_confidence = cfg.get("minConfidence", 0.5) self.max_queries = cfg.get("maxQueries", 10) + self.cache_results = cfg.get("cacheResults", False) default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}" self.summary = self.make_summary(cfg.get("summary", default_summary)) @@ -110,6 +112,8 @@ class Task(BaseTask): self.logger.debug(msg.format(url, confidence)) self.log_processed(pageid) + if self.cache_results: + self.cache_result(page, result) def has_been_processed(self, pageid): """Returns True if pageid was processed before, otherwise False.""" @@ -129,3 +133,32 @@ class Task(BaseTask): query = "INSERT INTO processed VALUES (?)" with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) + + def cache_result(self, page, result): + """Store the check's result in a cache table temporarily. + + The cache contains the page's ID, a hash of its content, the URL of the + best match, the time of caching, and the number of queries used. It + will replace any existing cache entries for that page. 
+ + The cache is intended for EarwigBot's complementary Toolserver web + interface, in which copyvio checks can be done separately from the bot. + The cache saves time and money by saving the result of the web search + but neither the result of the comparison nor any actual text (which + could violate data retention policy). Cache entries are (intended to + be) retained for one day; this task does not remove old entries (that + is handled by the Toolserver component). + + This will only be called if "cacheResults" == True in the task's + config, which is False by default. + """ + pageid = page.pageid() + hash = sha256(page.get()).hexdigest() + query1 = "SELECT 1 FROM cache WHERE cache_id = ?" + query2 = "DELETE FROM cache WHERE cache_id = ?" + query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)" + with self.conn.cursor() as cursor: + cursor.execute(query1, (pageid,)) + if cursor.fetchall(): + cursor.execute(query2, (pageid,)) + cursor.execute(query3, (pageid, hash, result.url, result.queries, 0))