
Implement new caching system and result structure.

pull/24/head
Ben Kurtovic, 10 years ago
commit b2fba1fa56
3 changed files with 61 additions and 42 deletions
  1. copyvios/checker.py (+54, -30)
  2. copyvios/highlighter.py (+4, -1)
  3. templates/index.mako (+3, -11)

copyvios/checker.py (+54, -30)

@@ -5,7 +5,9 @@ from hashlib import sha256
 from urlparse import urlparse
 
 from earwigbot import exceptions
-from earwigbot.wiki.copyvios.markov import EMPTY
+from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
+from earwigbot.wiki.copyvios.parsers import ArticleTextParser
+from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 
 from .misc import Query, get_cache_db
 from .sites import get_site, get_sites
@@ -63,7 +65,7 @@ def _get_results(query, follow=True):
             return
         mode = "{0}:{1}:".format(use_engine, use_links)
         if not query.nocache:
-            query.result = _get_cached_results(page, conn, query, mode)
+            query.result = _get_cached_results(page, conn, mode)
         if not query.result:
             query.result = page.copyvio_check(
                 min_confidence=T_SUSPECT, max_queries=10, max_time=45,
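
Note: taken together with _cache_result at the bottom of this file, the "search" action now follows a lookup-then-check flow. The condensed sketch below is an annotation, not part of the commit; the remaining copyvio_check keyword arguments and the exact call site of _cache_result are assumptions inferred from the signatures in this diff.

    # Condensed view of the cache flow in _get_results (annotation, not committed code)
    if not query.nocache:
        query.result = _get_cached_results(page, conn, mode)  # reuse cached sources if still valid
    if not query.result:                                      # cache miss, stale row, or mismatch
        query.result = page.copyvio_check(
            min_confidence=T_SUSPECT, max_queries=10, max_time=45)  # other kwargs elided
        query.result.cached = False
        _cache_result(page, query.result, conn, mode)         # persist result plus per-source rows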
@@ -80,10 +82,13 @@ def _get_results(query, follow=True):
         elif scheme not in ["http", "https"]:
             query.error = "bad URI"
             return
-        result = _do_copyvio_compare(query, page, query.url)
-        if result:
-            query.result = result
-            query.result.cached = False
+        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
+                                      max_time=30)
+        if result.best.chains[0] is EMPTY:
+            query.error = "timeout" if result.time > 30 else "no data"
+            return
+        query.result = result
+        query.result.cached = False
     else:
         query.error = "bad action"


@@ -105,36 +110,53 @@ def _get_page_by_revid(site, revid):
     page._load_content(res)
     return page
 
-def _get_cached_results(page, conn, query, mode):
+def _get_cached_results(page, conn, mode):
     query1 = """DELETE FROM cache
                 WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
-    query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
+    query2 = """SELECT cache_time, cache_queries, cache_process_time
                 FROM cache
-                WHERE cache_id = ? AND cache_hash = ?"""
-    shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
+                WHERE cache_id = ?"""
+    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped
+                FROM cache_data
+                WHERE cdata_cache_id = ?"""
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
 
     with conn.cursor() as cursor:
         cursor.execute(query1)
-        cursor.execute(query2, (page.pageid, shahash))
+        cursor.execute(query2, (cache_id,))
         results = cursor.fetchall()
         if not results:
            return None
+        cache_time, queries, check_time = results[0]
+        cursor.execute(query3, (cache_id,))
+        data = cursor.fetchall()
 
-    url, cache_time, num_queries, original_time = results[0]
-    result = _do_copyvio_compare(query, page, url)
-    if result:
+    if not data:  # TODO: do something less hacky for this edge case
+        artchain = MarkovChain(ArticleTextParser(page.get()).strip())
+        result = CopyvioCheckResult(False, [], queries, check_time, artchain)
         result.cached = True
-        result.queries = num_queries
-        result.original_time = original_time
         result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
         result.cache_age = _format_date(cache_time)
-    return result
+        return result
 
-def _do_copyvio_compare(query, page, url):
+    url, confidence, skipped = data.pop(0)
+    if skipped:  # Should be impossible: data must be bad; run a new check
+        return None
     result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
-    if not url or result.source_chain is not EMPTY:
-        return result
-    query.error = "timeout" if result.time > 30 else "no data"
+    if result.confidence != confidence:
+        return None
+
+    for url, confidence, skipped in data:
+        source = CopyvioSource(None, url)
+        source.confidence = confidence
+        source.skipped = skipped
+        result.sources.append(source)
+    result.queries = queries
+    result.time = check_time
+    result.cached = True
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = _format_date(cache_time)
+    return result
 
 def _format_date(cache_time):
     diff = datetime.utcnow() - cache_time
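
Note: the SELECTs above and the INSERTs in _cache_result below imply a two-table layout: one cache row per checked page state, keyed by the binary SHA-256 cache_id, plus one cache_data row per candidate source. The commit does not include the DDL; the following is a guess at a schema that matches the queries (MySQL dialect, since the code uses DATE_SUB and DEFAULT; column types and sizes are assumptions).

    # Hypothetical schema matching the queries in this file; not part of the commit.
    SCHEMA = """
    CREATE TABLE cache (
        cache_id           BINARY(32) NOT NULL PRIMARY KEY,  -- sha256(mode + article text)
        cache_time         TIMESTAMP  NOT NULL DEFAULT CURRENT_TIMESTAMP,
        cache_queries      INT        NOT NULL,              -- search queries used originally
        cache_process_time FLOAT      NOT NULL               -- original check time, in seconds
    );
    CREATE TABLE cache_data (
        cdata_id         INT           NOT NULL AUTO_INCREMENT PRIMARY KEY,
        cdata_cache_id   BINARY(32)    NOT NULL,             -- references cache.cache_id
        cdata_url        VARCHAR(1024) NOT NULL,
        cdata_confidence FLOAT         NOT NULL,
        cdata_skipped    BOOLEAN       NOT NULL
    );
    """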
@@ -145,13 +167,15 @@ def _format_date(cache_time):
     return "{0} seconds".format(diff.seconds)
 
 def _cache_result(page, result, conn, mode):
-    query = """INSERT INTO cache
-               VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
-               ON DUPLICATE KEY UPDATE
-               cache_url = ?, cache_time = CURRENT_TIMESTAMP,
-               cache_queries = ?, cache_process_time = ?"""
-    shahash = sha256(mode + page.get().encode("utf8")).hexdigest()
-    args = (page.pageid, shahash, result.url, result.queries, result.time,
-            result.url, result.queries, result.time)
+    query1 = "DELETE FROM cache WHERE cache_id = ?"
+    query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
+    query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
+    cache_id = sha256(mode + page.get().encode("utf8")).digest()
+    data = [(cache_id, source.url, source.confidence, source.skipped)
+            for source in result.sources]
     with conn.cursor() as cursor:
-        cursor.execute(query, args)
+        cursor.execute("START TRANSACTION")
+        cursor.execute(query1, (cache_id,))
+        cursor.execute(query2, (cache_id, result.queries, result.time))
+        cursor.executemany(query3, data)
+        cursor.execute("COMMIT")

copyvios/highlighter.py (+4, -1)

@@ -2,14 +2,17 @@
 
 from re import sub, UNICODE
 
+from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
 from markupsafe import escape
 
 __all__ = ["highlight_delta"]
 
-def highlight_delta(context, chain, delta):
+def highlight_delta(context, chain, delta=None):
     degree = chain.degree - 1
     highlights = [False] * degree
     block = [chain.START] * degree
+    if not delta:
+        delta = EMPTY_INTERSECTION
     for word in chain.text.split() + ([chain.END] * degree):
         word = _strip_word(chain, word)
         tblock = tuple(block)
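
Note: making delta optional lets callers highlight a chain even when no comparison data exists; a missing delta now falls back to EMPTY_INTERSECTION, which highlights nothing. A rough usage sketch follows (ctx stands in for whatever render context the real caller supplies; that detail is not shown in this diff):

    ctx = None  # placeholder for the caller's render context
    with_hits = highlight_delta(ctx, result.article_chain, result.best.chains[1])
    plain = highlight_delta(ctx, result.article_chain)  # delta omitted -> EMPTY_INTERSECTION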


templates/index.mako (+3, -11)

@@ -178,15 +178,7 @@
             <li>Redirected from <a href="${query.redirected_from.url}">${query.redirected_from.title | h}</a>. <a href="${request.url | httpsfix, h}&amp;noredirect=1">Check the original page.</a></li>
         % endif
         % if result.cached:
-            <li>
-                Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Retrieved in <span class="mono">${round(result.time, 3)}</span> seconds (originally generated in
-                % if result.queries:
-                    <span class="mono">${round(result.original_time, 3)}</span>s using <span class="mono">${result.queries}</span> queries).
-                % else:
-                    <span class="mono">${round(result.original_time, 3)}</span>s).
-                % endif
-                <a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a>
-            </li>
+            <li>Results are <a id="cv-cached" href="#">cached<span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URLs of the checked sources, but neither their content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span></a> from <abbr title="${result.cache_time}">${result.cache_age} ago</abbr>. Originally generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries. <a href="${request.url | httpsfix, h}&amp;nocache=1">Bypass the cache.</a></li>
         % else:
             <li>Results generated in <span class="mono">${round(result.time, 3)}</span> seconds using <span class="mono">${result.queries}</span> queries.</li>
         % endif
@@ -194,8 +186,8 @@
         </ul>
         <table id="cv-chain-table" style="display: ${'none' if hide_comparison else 'table'};">
             <tr>
-                <td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
-                <td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
+                <td class="cv-chain-cell">Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.best.chains[1] if result.best else None)}</p></div></td>
+                <td class="cv-chain-cell">Source: <div class="cv-chain-detail"><p>${highlight_delta(result.best.chains[0], result.best.chains[1]) if result.best else ""}</p></div></td>
             </tr>
         </table>
         % endif

