@@ -1,299 +1,10 @@
 <%!
-from datetime import datetime
-from hashlib import sha256
-from itertools import count
-from os.path import expanduser
-from re import sub, UNICODE
-from sys import path
-from time import time
-from urlparse import parse_qs, urlparse
-
-from earwigbot import exceptions
+from urlparse import parse_qs
 from earwigbot.bot import Bot
-import oursql
-
-def get_results(bot, lang, project, name, all_projects, title, url, query):
-    site = get_site(bot, lang, project, name, all_projects)
-    if not site:
-        return None, None, None
-    page = site.get_page(title)
-    try:
-        page.get() # Make sure that the page exists before we check it!
-    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
-        return site, page, None
-    # if url:
-    #     result = get_url_specific_results(page, url)
-    # else:
-    #     conn = open_sql_connection(bot, "copyvioCache")
-    #     if not query.get("nocache"):
-    #         result = get_cached_results(page, conn)
-    #     if query.get("nocache") or not result:
-    #         result = get_fresh_results(page, conn)
-    tstart = time()
-    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
-    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
-    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
-    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
-        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
-    result.cached = False
-    result.tdiff = time() - tstart
-    # END TEST BLOCK
-    return site, page, result
-
-def get_site(bot, lang, project, name, all_projects):
-    if project not in [proj[0] for proj in all_projects]:
-        return None
-    if project == "wikimedia" and name: # Special sites:
-        try:
-            return bot.wiki.get_site(name=name)
-        except exceptions.SiteNotFoundError:
-            try:
-                return bot.wiki.add_site(lang=lang, project=project)
-            except (exceptions.APIError, exceptions.LoginError):
-                return None
-    try:
-        return bot.wiki.get_site(lang=lang, project=project)
-    except exceptions.SiteNotFoundError:
-        try:
-            return bot.wiki.add_site(lang=lang, project=project)
-        except (exceptions.APIError, exceptions.LoginError):
-            return None
-
-def get_url_specific_results(page, url):
-    t_start = time()
-    result = page.copyvio_compare(url)
-    result.cached = False
-    result.tdiff = time() - t_start
-    return result
-
-def open_sql_connection(bot, dbname):
-    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
-    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
-        conn_args["read_default_file"] = expanduser("~/.my.cnf")
-    if "autoping" not in conn_args:
-        conn_args["autoping"] = True
-    if "autoreconnect" not in conn_args:
-        conn_args["autoreconnect"] = True
-    return oursql.connect(**conn_args)
-
-def get_cached_results(page, conn):
-    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
-    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
-    pageid = page.pageid()
-    hash = sha256(page.get()).hexdigest()
-    t_start = time()
-    with conn.cursor() as cursor:
-        cursor.execute(query1)
-        cursor.execute(query2, (pageid, hash))
-        results = cursor.fetchall()
-    if not results:
-        return None
-    url, cache_time, num_queries, original_tdiff = results[0]
-    result = page.copyvio_compare(url)
-    result.cached = True
-    result.queries = num_queries
-    result.tdiff = time() - t_start
-    result.original_tdiff = original_tdiff
-    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
-    result.cache_age = format_date(cache_time)
-    return result
-
-def format_date(cache_time):
-    diff = datetime.utcnow() - cache_time
-    if diff.seconds > 3600:
-        return "{0} hours".format(diff.seconds / 3600)
-    if diff.seconds > 60:
-        return "{0} minutes".format(diff.seconds / 60)
-    return "{0} seconds".format(diff.seconds)
-
-def get_fresh_results(page, conn):
-    t_start = time()
-    result = page.copyvio_check(max_queries=10)
-    result.cached = False
-    result.tdiff = time() - t_start
-    cache_result(page, result, conn)
-    return result
-
-def cache_result(page, result, conn):
-    pageid = page.pageid()
-    hash = sha256(page.get()).hexdigest()
-    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
-    query2 = "DELETE FROM cache WHERE cache_id = ?"
-    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
-    with conn.cursor() as cursor:
-        cursor.execute(query1, (pageid,))
-        if cursor.fetchall():
-            cursor.execute(query2, (pageid,))
-        cursor.execute(query3, (pageid, hash, result.url, result.queries,
-                                result.tdiff))
-
-def get_sites(bot):
-    max_staleness = 60 * 60 * 24 * 7
-    conn = open_sql_connection(bot, "globals")
-    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
-    query2 = "SELECT lang_code, lang_name FROM language"
-    query3 = "SELECT project_code, project_name FROM project"
-    with conn.cursor() as cursor:
-        cursor.execute(query1, ("sites",))
-        try:
-            time_since_update = int(time() - cursor.fetchall()[0][0])
-        except IndexError:
-            time_since_update = time()
-        if time_since_update > max_staleness:
-            update_sites(bot.wiki.get_site(), cursor)
-        cursor.execute(query2)
-        langs = []
-        for code, name in cursor.fetchall():
-            if "\U" in name:
-                name = name.decode("unicode_escape")
-            langs.append((code, name))
-        cursor.execute(query3)
-        projects = cursor.fetchall()
-    return langs, projects
-
-def update_sites(site, cursor):
-    matrix = site.api_query(action="sitematrix")["sitematrix"]
-    del matrix["count"]
-    languages, projects = set(), set()
-    for site in matrix.itervalues():
-        if isinstance(site, list): # Special sites
-            bad_sites = ["closed", "private", "fishbowl"]
-            for special in site:
-                if all([key not in special for key in bad_sites]):
-                    full = urlparse(special["url"]).netloc
-                    if full.count(".") == 1: # No subdomain, so use "www"
-                        lang, project = "www", full.split(".")[0]
-                    else:
-                        lang, project = full.rsplit(".", 2)[:2]
-                    code = u"{0}::{1}".format(lang, special["dbname"])
-                    name = special["code"].capitalize()
-                    languages.add((code, u"{0} ({1})".format(lang, name)))
-                    projects.add((project, project.capitalize()))
-            continue
-        this = set()
-        for web in site["site"]:
-            if "closed" in web:
-                continue
-            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
-            this.add((project, project.capitalize()))
-        if this:
-            code = site["code"]
-            if "\U" in site["name"].encode("unicode_escape"):
-                name = site["name"].encode("unicode_escape")
-            else:
-                name = site["name"]
-            languages.add((code, u"{0} ({1})".format(code, name)))
-            projects |= this
-    save_site_updates(cursor, languages, projects)
-
-def save_site_updates(cursor, languages, projects):
-    query1 = "SELECT lang_code, lang_name FROM language"
-    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
-    query3 = "INSERT INTO language VALUES (?, ?)"
-    query4 = "SELECT project_code, project_name FROM project"
-    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
-    query6 = "INSERT INTO project VALUES (?, ?)"
-    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
-    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
-    query9 = "INSERT INTO updates VALUES (?, ?)"
-    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
-    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
-    cursor.execute(query7, ("sites",))
-    if cursor.fetchall():
-        cursor.execute(query8, (time(), "sites"))
-    else:
-        cursor.execute(query9, ("sites", time()))
-
-def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
-    removals = []
-    cursor.execute(q_list)
-    for site in cursor:
-        updates.remove(site) if site in updates else removals.append(site)
-    cursor.executemany(q_rmv, removals)
-    cursor.executemany(q_update, updates)
-
-def highlight_delta(chain, delta):
-    processed = []
-    prev_prev = prev = chain.START
-    i = 0
-    all_words = chain.text.split()
-    paragraphs = chain.text.split("\n")
-    for paragraph in paragraphs:
-        processed_words = []
-        words = paragraph.split(" ")
-        for word, i in zip(words, count(i)):
-            try:
-                next = strip_word(all_words[i+1])
-            except IndexError:
-                next = chain.END
-            sword = strip_word(word)
-            block = (prev_prev, prev) # Block for before
-            alock = (prev, sword) # Block for after
-            before = [block in delta.chain and sword in delta.chain[block]]
-            after = [alock in delta.chain and next in delta.chain[alock]]
-            is_first = i == 0
-            is_last = i + 1 == len(all_words)
-            res = highlight_word(word, before, after, is_first, is_last)
-            processed_words.append(res)
-            prev_prev = prev
-            prev = sword
-        processed.append(u" ".join(processed_words))
-        i += 1
-    return u"<br /><br />".join(processed)
-
-def highlight_word(word, before, after, is_first, is_last):
-    if before and after:
-        # Word is in the middle of a highlighted block, so don't change
-        # anything unless this is the first word (force block to start) or
-        # the last word (force block to end):
-        res = word
-        if is_first:
-            res = u'<span class="cv-hl">' + res
-        if is_last:
-            res += u'</span>'
-    elif before:
-        # Word is the last in a highlighted block, so fade it out and then
-        # end the block; force open a block before the word if this is the
-        # first word:
-        res = fade_word(word, u"out") + u"</span>"
-        if is_first:
-            res = u'<span class="cv-hl">' + res
-    elif after:
-        # Word is the first in a highlighted block, so start the block and
-        # then fade it in; force close the block after the word if this is
-        # the last word:
-        res = u'<span class="cv-hl">' + fade_word(word, u"in")
-        if is_last:
-            res += u"</span>"
-    else:
-        # Word is completely outside of a highlighted block, so do nothing:
-        res = word
-    return res
-
-def fade_word(word, dir):
-    if len(word) <= 4:
-        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
-    if dir == u"out":
-        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
-    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
-
-def strip_word(word):
-    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
-
-def urlstrip(url):
-    if url.startswith("http://"):
-        url = url[7:]
-    if url.startswith("https://"):
-        url = url[8:]
-    if url.startswith("www."):
-        url = url[4:]
-    if url.endswith("/"):
-        url = url[:-1]
-    return url
 %>\
+<%namespace file="/support/copyvios/__init__.py" import="get_results, highlight_delta"/>\
+<%namespace file="/support/sites.py" import="get_site, get_sites"/>\
+<%namespace file="/support/misc.py" import="urlstrip"/>\
 <%
 lang = orig_lang = project = name = title = url = None
 query = parse_qs(environ["QUERY_STRING"])
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+
+from .checker import get_results
+from .highlighter import highlight_delta
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from hashlib import sha256
+from time import time
+
+from earwigbot import exceptions
+
+def get_results(bot, lang, project, name, all_projects, title, url, query):
+    site = get_site(bot, lang, project, name, all_projects)
+    if not site:
+        return None, None, None
+    page = site.get_page(title)
+    try:
+        page.get() # Make sure that the page exists before we check it!
+    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
+        return site, page, None
+    # if url:
+    #     result = get_url_specific_results(page, url)
+    # else:
+    #     conn = open_sql_connection(bot, "copyvioCache")
+    #     if not query.get("nocache"):
+    #         result = get_cached_results(page, conn)
+    #     if query.get("nocache") or not result:
+    #         result = get_fresh_results(page, conn)
+    tstart = time()
+    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
+    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
+    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
+    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
+        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
+    result.cached = False
+    result.tdiff = time() - tstart
+    # END TEST BLOCK
+    return site, page, result
+
+def get_url_specific_results(page, url):
+    t_start = time()
+    result = page.copyvio_compare(url)
+    result.cached = False
+    result.tdiff = time() - t_start
+    return result
+
+def get_cached_results(page, conn):
+    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
+    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    t_start = time()
+    with conn.cursor() as cursor:
+        cursor.execute(query1)
+        cursor.execute(query2, (pageid, hash))
+        results = cursor.fetchall()
+    if not results:
+        return None
+    url, cache_time, num_queries, original_tdiff = results[0]
+    result = page.copyvio_compare(url)
+    result.cached = True
+    result.queries = num_queries
+    result.tdiff = time() - t_start
+    result.original_tdiff = original_tdiff
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = format_date(cache_time)
+    return result
+
+def format_date(cache_time):
+    diff = datetime.utcnow() - cache_time
+    if diff.seconds > 3600:
+        return "{0} hours".format(diff.seconds / 3600)
+    if diff.seconds > 60:
+        return "{0} minutes".format(diff.seconds / 60)
+    return "{0} seconds".format(diff.seconds)
+
+def get_fresh_results(page, conn):
+    t_start = time()
+    result = page.copyvio_check(max_queries=10)
+    result.cached = False
+    result.tdiff = time() - t_start
+    cache_result(page, result, conn)
+    return result
+
+def cache_result(page, result, conn):
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
+    query2 = "DELETE FROM cache WHERE cache_id = ?"
+    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
+    with conn.cursor() as cursor:
+        cursor.execute(query1, (pageid,))
+        if cursor.fetchall():
+            cursor.execute(query2, (pageid,))
+        cursor.execute(query3, (pageid, hash, result.url, result.queries,
+                                result.tdiff))
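
Note: get_results() above still calls get_site(), which this refactor moves into the sites module (imported by the template as /support/sites.py) rather than defining it in the checker module. A minimal sketch of the missing wiring, assuming the support directory is on sys.path; the exact import path is an assumption, not shown in this diff:

    # checker.py -- hypothetical import; get_site now lives in sites.py
    from sites import get_site
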
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+from re import sub, UNICODE
+
+def highlight_delta(chain, delta):
+    processed = []
+    prev_prev = prev = chain.START
+    i = 0
+    all_words = chain.text.split()
+    paragraphs = chain.text.split("\n")
+    for paragraph in paragraphs:
+        processed_words = []
+        words = paragraph.split(" ")
+        for i, word in enumerate(words, i):
+            try:
+                next = strip_word(all_words[i+1])
+            except IndexError:
+                next = chain.END
+            sword = strip_word(word)
+            block = (prev_prev, prev) # Block for before
+            alock = (prev, sword) # Block for after
+            before = block in delta.chain and sword in delta.chain[block]
+            after = alock in delta.chain and next in delta.chain[alock]
+            is_first = i == 0
+            is_last = i + 1 == len(all_words)
+            res = highlight_word(word, before, after, is_first, is_last)
+            processed_words.append(res)
+            prev_prev = prev
+            prev = sword
+        processed.append(u" ".join(processed_words))
+        i += 1
+    return u"<br /><br />".join(processed)
+
+def highlight_word(word, before, after, is_first, is_last):
+    if before and after:
+        # Word is in the middle of a highlighted block, so don't change
+        # anything unless this is the first word (force block to start) or
+        # the last word (force block to end):
+        res = word
+        if is_first:
+            res = u'<span class="cv-hl">' + res
+        if is_last:
+            res += u'</span>'
+    elif before:
+        # Word is the last in a highlighted block, so fade it out and then
+        # end the block; force open a block before the word if this is the
+        # first word:
+        res = fade_word(word, u"out") + u"</span>"
+        if is_first:
+            res = u'<span class="cv-hl">' + res
+    elif after:
+        # Word is the first in a highlighted block, so start the block and
+        # then fade it in; force close the block after the word if this is
+        # the last word:
+        res = u'<span class="cv-hl">' + fade_word(word, u"in")
+        if is_last:
+            res += u"</span>"
+    else:
+        # Word is completely outside of a highlighted block, so do nothing:
+        res = word
+    return res
+
+def fade_word(word, dir):
+    if len(word) <= 4:
+        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
+    if dir == u"out":
+        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
+    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
+
+def strip_word(word):
+    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
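
For reference, a hedged usage sketch of highlight_delta(), mirroring the temporary test block in get_results() above; the MarkovChain classes come from earwigbot.wiki.copyvios, and both text strings here are placeholders:

    from earwigbot.wiki.copyvios import MarkovChain, MarkovChainIntersection

    article = MarkovChain(u"Text of the article being checked")
    source = MarkovChain(u"Candidate source text fetched from the web")
    delta = MarkovChainIntersection(article, source)
    html = highlight_delta(article, delta)  # HTML with <span class="cv-hl"> markers
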
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+
+from os.path import expanduser
+
+import oursql
+
+def open_sql_connection(bot, dbname):
+    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
+    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
+        conn_args["read_default_file"] = expanduser("~/.my.cnf")
+    if "autoping" not in conn_args:
+        conn_args["autoping"] = True
+    if "autoreconnect" not in conn_args:
+        conn_args["autoreconnect"] = True
+    return oursql.connect(**conn_args)
+
+def urlstrip(context, url):
+    if url.startswith("http://"):
+        url = url[7:]
+    if url.startswith("https://"):
+        url = url[8:]
+    if url.startswith("www."):
+        url = url[4:]
+    if url.endswith("/"):
+        url = url[:-1]
+    return url
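
open_sql_connection() above expects connection settings under bot.config.wiki["_toolserverSQL"], keyed by database name ("copyvioCache" and "globals" are the names used elsewhere in this diff); any keys given are passed straight to oursql.connect(), and credentials fall back to ~/.my.cnf when none are supplied. A hypothetical shape, with values invented purely for illustration:

    # Hypothetical config fragment; keys mirror oursql.connect() arguments
    config.wiki["_toolserverSQL"] = {
        "copyvioCache": {"host": "sql.example.org", "db": "u_copyvio_cache"},
        "globals": {"db": "u_globals"},  # no user/passwd -> reads ~/.my.cnf
    }
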
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+from time import time
+from urlparse import urlparse
+
+from earwigbot import exceptions
+
+def get_site(bot, lang, project, name, all_projects):
+    if project not in [proj[0] for proj in all_projects]:
+        return None
+    if project == "wikimedia" and name: # Special sites:
+        try:
+            return bot.wiki.get_site(name=name)
+        except exceptions.SiteNotFoundError:
+            try:
+                return bot.wiki.add_site(lang=lang, project=project)
+            except (exceptions.APIError, exceptions.LoginError):
+                return None
+    try:
+        return bot.wiki.get_site(lang=lang, project=project)
+    except exceptions.SiteNotFoundError:
+        try:
+            return bot.wiki.add_site(lang=lang, project=project)
+        except (exceptions.APIError, exceptions.LoginError):
+            return None
+
+def get_sites(bot):
+    max_staleness = 60 * 60 * 24 * 7
+    conn = open_sql_connection(bot, "globals")
+    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
+    query2 = "SELECT lang_code, lang_name FROM language"
+    query3 = "SELECT project_code, project_name FROM project"
+    with conn.cursor() as cursor:
+        cursor.execute(query1, ("sites",))
+        try:
+            time_since_update = int(time() - cursor.fetchall()[0][0])
+        except IndexError:
+            time_since_update = time()
+        if time_since_update > max_staleness:
+            update_sites(bot.wiki.get_site(), cursor)
+        cursor.execute(query2)
+        langs = []
+        for code, name in cursor.fetchall():
+            if "\U" in name:
+                name = name.decode("unicode_escape")
+            langs.append((code, name))
+        cursor.execute(query3)
+        projects = cursor.fetchall()
+    return langs, projects
+
+def update_sites(site, cursor):
+    matrix = site.api_query(action="sitematrix")["sitematrix"]
+    del matrix["count"]
+    languages, projects = set(), set()
+    for site in matrix.itervalues():
+        if isinstance(site, list): # Special sites
+            bad_sites = ["closed", "private", "fishbowl"]
+            for special in site:
+                if all([key not in special for key in bad_sites]):
+                    full = urlparse(special["url"]).netloc
+                    if full.count(".") == 1: # No subdomain, so use "www"
+                        lang, project = "www", full.split(".")[0]
+                    else:
+                        lang, project = full.rsplit(".", 2)[:2]
+                    code = u"{0}::{1}".format(lang, special["dbname"])
+                    name = special["code"].capitalize()
+                    languages.add((code, u"{0} ({1})".format(lang, name)))
+                    projects.add((project, project.capitalize()))
+            continue
+        this = set()
+        for web in site["site"]:
+            if "closed" in web:
+                continue
+            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
+            this.add((project, project.capitalize()))
+        if this:
+            code = site["code"]
+            if "\U" in site["name"].encode("unicode_escape"):
+                name = site["name"].encode("unicode_escape")
+            else:
+                name = site["name"]
+            languages.add((code, u"{0} ({1})".format(code, name)))
+            projects |= this
+    save_site_updates(cursor, languages, projects)
+
+def save_site_updates(cursor, languages, projects):
+    query1 = "SELECT lang_code, lang_name FROM language"
+    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
+    query3 = "INSERT INTO language VALUES (?, ?)"
+    query4 = "SELECT project_code, project_name FROM project"
+    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
+    query6 = "INSERT INTO project VALUES (?, ?)"
+    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
+    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
+    query9 = "INSERT INTO updates VALUES (?, ?)"
+    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
+    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
+    cursor.execute(query7, ("sites",))
+    if cursor.fetchall():
+        cursor.execute(query8, (time(), "sites"))
+    else:
+        cursor.execute(query9, ("sites", time()))
+
+def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
+    removals = []
+    cursor.execute(q_list)
+    for site in cursor:
+        updates.remove(site) if site in updates else removals.append(site)
+    cursor.executemany(q_rmv, removals)
+    cursor.executemany(q_update, updates)
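
As with the checker module, get_sites() here relies on open_sql_connection(), which now lives in the misc module (/support/misc.py) and is not imported in this hunk. A minimal sketch under the same sys.path assumption as above:

    # sites.py -- hypothetical import; open_sql_connection now lives in misc.py
    from misc import open_sql_connection
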