
Implement new caching system and result structure.

pull/24/head
Ben Kurtovic, 9 years ago
commit b2fba1fa56
3 changed files with 61 additions and 42 deletions:
  1. copyvios/checker.py  +54 -30
  2. copyvios/highlighter.py  +4 -1
  3. templates/index.mako  +3 -11

copyvios/checker.py  +54 -30

@@ -5,7 +5,9 @@ from hashlib import sha256
 from urlparse import urlparse
 
 from earwigbot import exceptions
-from earwigbot.wiki.copyvios.markov import EMPTY
+from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
+from earwigbot.wiki.copyvios.parsers import ArticleTextParser
+from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 
 from .misc import Query, get_cache_db
 from .sites import get_site, get_sites
@@ -63,7 +65,7 @@ def _get_results(query, follow=True):
             return
         mode = "{0}:{1}:".format(use_engine, use_links)
         if not query.nocache:
-            query.result = _get_cached_results(page, conn, query, mode)
+            query.result = _get_cached_results(page, conn, mode)
         if not query.result:
             query.result = page.copyvio_check(
                 min_confidence=T_SUSPECT, max_queries=10, max_time=45,
@@ -80,10 +82,13 @@ def _get_results(query, follow=True):
         elif scheme not in ["http", "https"]:
             query.error = "bad URI"
             return
-        result = _do_copyvio_compare(query, page, query.url)
-        if result:
-            query.result = result
-            query.result.cached = False
+        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
+                                      max_time=30)
+        if result.best.chains[0] is EMPTY:
+            query.error = "timeout" if result.time > 30 else "no data"
+            return
+        query.result = result
+        query.result.cached = False
     else:
         query.error = "bad action"

@@ -105,36 +110,53 @@ def _get_page_by_revid(site, revid):
     page._load_content(res)
     return page
 
-def _get_cached_results(page, conn, query, mode):
+def _get_cached_results(page, conn, mode):
     query1 = """DELETE FROM cache
                 WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
-    query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
+    query2 = """SELECT cache_time, cache_queries, cache_process_time
                 FROM cache
-                WHERE cache_id = ? AND cache_hash = ?"""
-    shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
+                WHERE cache_id = ?"""
+    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped
+                FROM cache_data
+                WHERE cdata_cache_id = ?"""
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
 
     with conn.cursor() as cursor:
         cursor.execute(query1)
-        cursor.execute(query2, (page.pageid, shahash))
+        cursor.execute(query2, (cache_id,))
         results = cursor.fetchall()
         if not results:
             return None
+        cache_time, queries, check_time = results[0]
+        cursor.execute(query3, (cache_id,))
+        data = cursor.fetchall()
 
-    url, cache_time, num_queries, original_time = results[0]
-    result = _do_copyvio_compare(query, page, url)
-    if result:
+    if not data:  # TODO: do something less hacky for this edge case
+        artchain = MarkovChain(ArticleTextParser(page.get()).strip())
+        result = CopyvioCheckResult(False, [], queries, check_time, artchain)
         result.cached = True
-        result.queries = num_queries
-        result.original_time = original_time
         result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
         result.cache_age = _format_date(cache_time)
-    return result
+        return result
 
-def _do_copyvio_compare(query, page, url):
+    url, confidence, skipped = data.pop(0)
+    if skipped:  # Should be impossible: data must be bad; run a new check
+        return None
     result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
-    if not url or result.source_chain is not EMPTY:
-        return result
-    query.error = "timeout" if result.time > 30 else "no data"
+    if result.confidence != confidence:
+        return None
+
+    for url, confidence, skipped in data:
+        source = CopyvioSource(None, url)
+        source.confidence = confidence
+        source.skipped = skipped
+        result.sources.append(source)
+    result.queries = queries
+    result.time = check_time
+    result.cached = True
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = _format_date(cache_time)
+    return result
 
 def _format_date(cache_time):
     diff = datetime.utcnow() - cache_time
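The lookup above keys the cache on a SHA-256 digest of the check mode plus the full article text, so an entry is only ever hit while the article is unchanged; an edit (or a different engine/links combination) simply produces a new key, and old rows are swept by the three-day DELETE. A minimal sketch of the key derivation, using only hashlib; "mode" stands for the "engine:links:" prefix built in _get_results():

    from hashlib import sha256

    def cache_key(mode, article_text):
        # 32 raw bytes, suitable for a fixed-width binary column; .digest()
        # replaces the hex string the old single-table schema stored.
        return sha256(mode + article_text.encode("utf8")).digest()

    assert len(cache_key("1:1:", u"Some article text")) == 32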
@@ -145,13 +167,15 @@ def _format_date(cache_time):
return "{0} seconds".format(diff.seconds)

def _cache_result(page, result, conn, mode):
query = """INSERT INTO cache
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
ON DUPLICATE KEY UPDATE
cache_url = ?, cache_time = CURRENT_TIMESTAMP,
cache_queries = ?, cache_process_time = ?"""
shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
args = (page.pageid, shahash, result.url, result.queries, result.time,
result.url, result.queries, result.time)
query1 = "DELETE FROM cache WHERE cache_id = ?"
query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
cache_id = sha256(mode + page.get().encode("utf8")).digest()
data = [(cache_id, source.url, source.confidence, source.skipped)
for source in result.sources]
with conn.cursor() as cursor:
cursor.execute(query, args)
cursor.execute("START TRANSACTION")
cursor.execute(query1, (cache_id,))
cursor.execute(query2, (cache_id, result.queries, result.time))
cursor.executemany(query3, data)
cursor.execute("COMMIT")

copyvios/highlighter.py  +4 -1

@@ -2,14 +2,17 @@

 from re import sub, UNICODE
 
+from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
 from markupsafe import escape
 
 __all__ = ["highlight_delta"]
 
-def highlight_delta(context, chain, delta):
+def highlight_delta(context, chain, delta=None):
     degree = chain.degree - 1
     highlights = [False] * degree
     block = [chain.START] * degree
+    if not delta:
+        delta = EMPTY_INTERSECTION
     for word in chain.text.split() + ([chain.END] * degree):
         word = _strip_word(chain, word)
         tblock = tuple(block)
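With delta now optional, the highlighter can be called for results that carry no intersection data: when there is no best source to compare against (possible with the new cached results), the template passes None and highlight_delta falls back to EMPTY_INTERSECTION, so no words are marked. A minimal illustration of the fallback, using only names visible in this diff:

    from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION

    def resolve_delta(delta=None):
        # Mirrors the guard added above: with no intersection to highlight
        # against, substitute an empty one so nothing gets marked.
        return delta if delta else EMPTY_INTERSECTION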


templates/index.mako  +3 -11

@@ -178,15 +178,7 @@
             <li>Redirected from <a href="${query.redirected_from.url}">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&amp;noredirect=1">Check the original page.</a></li>
         % endif
         % if result.cached:
-            <li>
-                Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Retrieved in <span class="mono">${round(result.time, 3)}</span> seconds (originally generated in
-                % if result.queries:
-                    <span class="mono">${round(result.original_time, 3)}</span>s using <span class="mono">${result.queries}</span> queries).
-                % else:
-                    <span class="mono">${round(result.original_time, 3)}</span>s).
-                % endif
-                <a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a>
-            </li>
+            <li>Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URLs of the checked sources, but neither their content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Originally generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries. <a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a></li>
         % else:
             <li>Results generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries.</li>
         % endif
@@ -194,8 +186,8 @@
         </ul>
         <table id="cv-chain-table" style="display: ${'none' if hide_comparison else 'table'};">
             <tr>
-                <td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
-                <td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
+                <td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.best.chains[1] if result.best else None)}</p></div></td>
+                <td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.best.chains[0], result.best.chains[1]) if result.best else ""}</p></div></td>
             </tr>
         </table>
     % endif
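The comparison table now reads both chains from result.best (the top CopyvioSource) instead of the old flat result.source_chain / result.delta_chain attributes, and guards against result.best being absent. A hypothetical sketch of the equivalent data access in plain Python, using only names that appear in the template expressions above:

    def chains_for_template(result):
        # Returns (article_chain, source_chain, delta_chain) for rendering;
        # with no best source, the source cell is left empty and the article
        # is highlighted against nothing.
        if result.best:
            return result.article_chain, result.best.chains[0], result.best.chains[1]
        return result.article_chain, None, None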

