From 5bff019edcaa6272ba259d391e5de2d1ff5a4427 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 22 Jul 2012 00:57:18 -0400
Subject: [PATCH] Trying a radical restructuring...

---
 pages/copyvios.mako                   | 297 +--------------------------
 pages/support/copyvios/__init__.py    |   4 +
 pages/support/copyvios/checker.py     |  97 ++++++++++
 pages/support/copyvios/highlighter.py |  71 ++++++++
 pages/support/misc.py                 |  26 +++
 pages/support/sites.py                | 111 ++++++++++++
 6 files changed, 313 insertions(+), 293 deletions(-)
 create mode 100644 pages/support/copyvios/__init__.py
 create mode 100644 pages/support/copyvios/checker.py
 create mode 100644 pages/support/copyvios/highlighter.py
 create mode 100644 pages/support/misc.py
 create mode 100644 pages/support/sites.py

diff --git a/pages/copyvios.mako b/pages/copyvios.mako
index 1e8da4f..5fb64c2 100644
--- a/pages/copyvios.mako
+++ b/pages/copyvios.mako
@@ -1,299 +1,10 @@
 <%!
-    from datetime import datetime
-    from hashlib import sha256
-    from itertools import count
-    from os.path import expanduser
-    from re import sub, UNICODE
-    from sys import path
-    from time import time
-    from urlparse import parse_qs, urlparse
-
-    from earwigbot import exceptions
+    from urlparse import parse_qs
     from earwigbot.bot import Bot
-    import oursql
-
-    def get_results(bot, lang, project, name, all_projects, title, url, query):
-        site = get_site(bot, lang, project, name, all_projects)
-        if not site:
-            return None, None, None
-        page = site.get_page(title)
-        try:
-            page.get()  # Make sure that the page exists before we check it!
-        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
-            return site, page, None
-
-        # if url:
-        #     result = get_url_specific_results(page, url)
-        # else:
-        #     conn = open_sql_connection(bot, "copyvioCache")
-        #     if not query.get("nocache"):
-        #         result = get_cached_results(page, conn)
-        #     if query.get("nocache") or not result:
-        #         result = get_fresh_results(page, conn)
-        tstart = time()
-        mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
-        mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
-        mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
-        result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
-            True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
-        result.cached = False
-        result.tdiff = time() - tstart
-        # END TEST BLOCK
-        return site, page, result
-
-    def get_site(bot, lang, project, name, all_projects):
-        if project not in [proj[0] for proj in all_projects]:
-            return None
-        if project == "wikimedia" and name:  # Special sites:
-            try:
-                return bot.wiki.get_site(name=name)
-            except exceptions.SiteNotFoundError:
-                try:
-                    return bot.wiki.add_site(lang=lang, project=project)
-                except (exceptions.APIError, exceptions.LoginError):
-                    return None
-        try:
-            return bot.wiki.get_site(lang=lang, project=project)
-        except exceptions.SiteNotFoundError:
-            try:
-                return bot.wiki.add_site(lang=lang, project=project)
-            except (exceptions.APIError, exceptions.LoginError):
-                return None
-
-    def get_url_specific_results(page, url):
-        t_start = time()
-        result = page.copyvio_compare(url)
-        result.cached = False
-        result.tdiff = time() - t_start
-        return result
-
-    def open_sql_connection(bot, dbname):
-        conn_args = bot.config.wiki["_toolserverSQL"][dbname]
-        if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
-            conn_args["read_default_file"] = expanduser("~/.my.cnf")
-        if "autoping" not in conn_args:
-            conn_args["autoping"] = True
-        if "autoreconnect" not in conn_args:
-            conn_args["autoreconnect"] = True
-        return oursql.connect(**conn_args)
-
-    def get_cached_results(page, conn):
-        query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
-        query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
-        pageid = page.pageid()
-        hash = sha256(page.get()).hexdigest()
-        t_start = time()
-
-        with conn.cursor() as cursor:
-            cursor.execute(query1)
-            cursor.execute(query2, (pageid, hash))
-            results = cursor.fetchall()
-        if not results:
-            return None
-
-        url, cache_time, num_queries, original_tdiff = results[0]
-        result = page.copyvio_compare(url)
-        result.cached = True
-        result.queries = num_queries
-        result.tdiff = time() - t_start
-        result.original_tdiff = original_tdiff
-        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
-        result.cache_age = format_date(cache_time)
-        return result
-
-    def format_date(cache_time):
-        diff = datetime.utcnow() - cache_time
-        if diff.seconds > 3600:
-            return "{0} hours".format(diff.seconds / 3600)
-        if diff.seconds > 60:
-            return "{0} minutes".format(diff.seconds / 60)
-        return "{0} seconds".format(diff.seconds)
-
-    def get_fresh_results(page, conn):
-        t_start = time()
-        result = page.copyvio_check(max_queries=10)
-        result.cached = False
-        result.tdiff = time() - t_start
-        cache_result(page, result, conn)
-        return result
-
-    def cache_result(page, result, conn):
-        pageid = page.pageid()
-        hash = sha256(page.get()).hexdigest()
-        query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
-        query2 = "DELETE FROM cache WHERE cache_id = ?"
-        query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
-        with conn.cursor() as cursor:
-            cursor.execute(query1, (pageid,))
-            if cursor.fetchall():
-                cursor.execute(query2, (pageid,))
-            cursor.execute(query3, (pageid, hash, result.url, result.queries,
-                                    result.tdiff))
-
-    def get_sites(bot):
-        max_staleness = 60 * 60 * 24 * 7
-        conn = open_sql_connection(bot, "globals")
-        query1 = "SELECT update_time FROM updates WHERE update_service = ?"
-        query2 = "SELECT lang_code, lang_name FROM language"
-        query3 = "SELECT project_code, project_name FROM project"
-        with conn.cursor() as cursor:
-            cursor.execute(query1, ("sites",))
-            try:
-                time_since_update = int(time() - cursor.fetchall()[0][0])
-            except IndexError:
-                time_since_update = time()
-            if time_since_update > max_staleness:
-                update_sites(bot.wiki.get_site(), cursor)
-            cursor.execute(query2)
-            langs = []
-            for code, name in cursor.fetchall():
-                if "\U" in name:
-                    name = name.decode("unicode_escape")
-                langs.append((code, name))
-            cursor.execute(query3)
-            projects = cursor.fetchall()
-        return langs, projects
-
-    def update_sites(site, cursor):
-        matrix = site.api_query(action="sitematrix")["sitematrix"]
-        del matrix["count"]
-        languages, projects = set(), set()
-        for site in matrix.itervalues():
-            if isinstance(site, list):  # Special sites
-                bad_sites = ["closed", "private", "fishbowl"]
-                for special in site:
-                    if all([key not in special for key in bad_sites]):
-                        full = urlparse(special["url"]).netloc
-                        if full.count(".") == 1:  # No subdomain, so use "www"
-                            lang, project = "www", full.split(".")[0]
-                        else:
-                            lang, project = full.rsplit(".", 2)[:2]
-                        code = u"{0}::{1}".format(lang, special["dbname"])
-                        name = special["code"].capitalize()
-                        languages.add((code, u"{0} ({1})".format(lang, name)))
-                        projects.add((project, project.capitalize()))
-                continue
-            this = set()
-            for web in site["site"]:
-                if "closed" in web:
-                    continue
-                project = "wikipedia" if web["code"] == u"wiki" else web["code"]
-                this.add((project, project.capitalize()))
-            if this:
-                code = site["code"]
-                if "\U" in site["name"].encode("unicode_escape"):
-                    name = site["name"].encode("unicode_escape")
-                else:
-                    name = site["name"]
-                languages.add((code, u"{0} ({1})".format(code, name)))
-                projects |= this
-        save_site_updates(cursor, languages, projects)
-
-    def save_site_updates(cursor, languages, projects):
-        query1 = "SELECT lang_code, lang_name FROM language"
-        query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
-        query3 = "INSERT INTO language VALUES (?, ?)"
-        query4 = "SELECT project_code, project_name FROM project"
-        query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
-        query6 = "INSERT INTO project VALUES (?, ?)"
-        query7 = "SELECT 1 FROM updates WHERE update_service = ?"
-        query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
- query9 = "INSERT INTO updates VALUES (?, ?)" - synchronize_sites_with_db(cursor, languages, query1, query2, query3) - synchronize_sites_with_db(cursor, projects, query4, query5, query6) - cursor.execute(query7, ("sites",)) - if cursor.fetchall(): - cursor.execute(query8, (time(), "sites")) - else: - cursor.execute(query9, ("sites", time())) - - def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update): - removals = [] - cursor.execute(q_list) - for site in cursor: - updates.remove(site) if site in updates else removals.append(site) - cursor.executemany(q_rmv, removals) - cursor.executemany(q_update, updates) - - def highlight_delta(chain, delta): - processed = [] - prev_prev = prev = chain.START - i = 0 - all_words = chain.text.split() - paragraphs = chain.text.split("\n") - for paragraph in paragraphs: - processed_words = [] - words = paragraph.split(" ") - for word, i in zip(words, count(i)): - try: - next = strip_word(all_words[i+1]) - except IndexError: - next = chain.END - sword = strip_word(word) - block = (prev_prev, prev) # Block for before - alock = (prev, sword) # Block for after - before = [block in delta.chain and sword in delta.chain[block]] - after = [alock in delta.chain and next in delta.chain[alock]] - is_first = i == 0 - is_last = i + 1 == len(all_words) - res = highlight_word(word, before, after, is_first, is_last) - processed_words.append(res) - prev_prev = prev - prev = sword - processed.append(u" ".join(processed_words)) - i += 1 - return u"

".join(processed) - - def highlight_word(word, before, after, is_first, is_last): - if before and after: - # Word is in the middle of a highlighted block, so don't change - # anything unless this is the first word (force block to start) or - # the last word (force block to end): - res = word - if is_first: - res = u'' + res - if is_last: - res += u'' - elif before: - # Word is the last in a highlighted block, so fade it out and then - # end the block; force open a block before the word if this is the - # first word: - res = fade_word(word, u"out") + u"" - if is_first: - res = u'' + res - elif after: - # Word is the first in a highlighted block, so start the block and - # then fade it in; force close the block after the word if this is - # the last word: - res = u'' + fade_word(word, u"in") - if is_last: - res += u"" - else: - # Word is completely outside of a highlighted block, so do nothing: - res = word - return res - - def fade_word(word, dir): - if len(word) <= 4: - return u'{1}'.format(dir, word) - if dir == u"out": - return u'{0}{1}'.format(word[:-4], word[-4:]) - return u'{0}{1}'.format(word[:4], word[4:]) - - def strip_word(word): - return sub("[^\w\s-]", "", word.lower(), flags=UNICODE) - - def urlstrip(url): - if url.startswith("http://"): - url = url[7:] - if url.startswith("https://"): - url = url[8:] - if url.startswith("www."): - url = url[4:] - if url.endswith("/"): - url = url[:-1] - return url %>\ +<%namespace file="/support/copyvios/__init__.py" import="get_results, highlight_delta"/>\ +<%namespace file="/support/sites.py" import="get_site, get_sites"/>\ +<%namespace file="/support/misc.py" import="urlstrip"/>\ <% lang = orig_lang = project = name = title = url = None query = parse_qs(environ["QUERY_STRING"]) diff --git a/pages/support/copyvios/__init__.py b/pages/support/copyvios/__init__.py new file mode 100644 index 0000000..0908dfe --- /dev/null +++ b/pages/support/copyvios/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- + +from .checker import get_results +from .highlighter import highlight_delta diff --git a/pages/support/copyvios/checker.py b/pages/support/copyvios/checker.py new file mode 100644 index 0000000..513a568 --- /dev/null +++ b/pages/support/copyvios/checker.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime +from hashlib import sha256 +from time import time + +from earwigbot import exceptions + +def get_results(bot, lang, project, name, all_projects, title, url, query): + site = get_site(bot, lang, project, name, all_projects) + if not site: + return None, None, None + page = site.get_page(title) + try: + page.get() # Make sure that the page exists before we check it! 
+    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
+        return site, page, None
+
+    # if url:
+    #     result = get_url_specific_results(page, url)
+    # else:
+    #     conn = open_sql_connection(bot, "copyvioCache")
+    #     if not query.get("nocache"):
+    #         result = get_cached_results(page, conn)
+    #     if query.get("nocache") or not result:
+    #         result = get_fresh_results(page, conn)
+    tstart = time()
+    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
+    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
+    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
+    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
+        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
+    result.cached = False
+    result.tdiff = time() - tstart
+    # END TEST BLOCK
+    return site, page, result
+
+def get_url_specific_results(page, url):
+    t_start = time()
+    result = page.copyvio_compare(url)
+    result.cached = False
+    result.tdiff = time() - t_start
+    return result
+
+def get_cached_results(page, conn):
+    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
+    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    t_start = time()
+
+    with conn.cursor() as cursor:
+        cursor.execute(query1)
+        cursor.execute(query2, (pageid, hash))
+        results = cursor.fetchall()
+    if not results:
+        return None
+
+    url, cache_time, num_queries, original_tdiff = results[0]
+    result = page.copyvio_compare(url)
+    result.cached = True
+    result.queries = num_queries
+    result.tdiff = time() - t_start
+    result.original_tdiff = original_tdiff
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = format_date(cache_time)
+    return result
+
+def format_date(cache_time):
+    diff = datetime.utcnow() - cache_time
+    if diff.seconds > 3600:
+        return "{0} hours".format(diff.seconds / 3600)
+    if diff.seconds > 60:
+        return "{0} minutes".format(diff.seconds / 60)
+    return "{0} seconds".format(diff.seconds)
+
+def get_fresh_results(page, conn):
+    t_start = time()
+    result = page.copyvio_check(max_queries=10)
+    result.cached = False
+    result.tdiff = time() - t_start
+    cache_result(page, result, conn)
+    return result
+
+def cache_result(page, result, conn):
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
+    query2 = "DELETE FROM cache WHERE cache_id = ?"
+ query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)" + with conn.cursor() as cursor: + cursor.execute(query1, (pageid,)) + if cursor.fetchall(): + cursor.execute(query2, (pageid,)) + cursor.execute(query3, (pageid, hash, result.url, result.queries, + result.tdiff)) diff --git a/pages/support/copyvios/highlighter.py b/pages/support/copyvios/highlighter.py new file mode 100644 index 0000000..40be5b8 --- /dev/null +++ b/pages/support/copyvios/highlighter.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +from re import sub, UNICODE + +def highlight_delta(chain, delta): + processed = [] + prev_prev = prev = chain.START + i = 0 + all_words = chain.text.split() + paragraphs = chain.text.split("\n") + for paragraph in paragraphs: + processed_words = [] + words = paragraph.split(" ") + for i, word in enumerate(words, i) + try: + next = strip_word(all_words[i+1]) + except IndexError: + next = chain.END + sword = strip_word(word) + block = (prev_prev, prev) # Block for before + alock = (prev, sword) # Block for after + before = [block in delta.chain and sword in delta.chain[block]] + after = [alock in delta.chain and next in delta.chain[alock]] + is_first = i == 0 + is_last = i + 1 == len(all_words) + res = highlight_word(word, before, after, is_first, is_last) + processed_words.append(res) + prev_prev = prev + prev = sword + processed.append(u" ".join(processed_words)) + i += 1 + return u"

".join(processed) + +def highlight_word(word, before, after, is_first, is_last): + if before and after: + # Word is in the middle of a highlighted block, so don't change + # anything unless this is the first word (force block to start) or + # the last word (force block to end): + res = word + if is_first: + res = u'' + res + if is_last: + res += u'' + elif before: + # Word is the last in a highlighted block, so fade it out and then + # end the block; force open a block before the word if this is the + # first word: + res = fade_word(word, u"out") + u"
" + if is_first: + res = u'' + res + elif after: + # Word is the first in a highlighted block, so start the block and + # then fade it in; force close the block after the word if this is + # the last word: + res = u'' + fade_word(word, u"in") + if is_last: + res += u"" + else: + # Word is completely outside of a highlighted block, so do nothing: + res = word + return res + +def fade_word(word, dir): + if len(word) <= 4: + return u'{1}'.format(dir, word) + if dir == u"out": + return u'{0}{1}'.format(word[:-4], word[-4:]) + return u'{0}{1}'.format(word[:4], word[4:]) + +def strip_word(word): + return sub("[^\w\s-]", "", word.lower(), flags=UNICODE) diff --git a/pages/support/misc.py b/pages/support/misc.py new file mode 100644 index 0000000..1c9a5b4 --- /dev/null +++ b/pages/support/misc.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- + +from os.path import expanduser + +import oursql + +def open_sql_connection(bot, dbname): + conn_args = bot.config.wiki["_toolserverSQL"][dbname] + if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args: + conn_args["read_default_file"] = expanduser("~/.my.cnf") + if "autoping" not in conn_args: + conn_args["autoping"] = True + if "autoreconnect" not in conn_args: + conn_args["autoreconnect"] = True + return oursql.connect(**conn_args) + +def urlstrip(context, url): + if url.startswith("http://"): + url = url[7:] + if url.startswith("https://"): + url = url[8:] + if url.startswith("www."): + url = url[4:] + if url.endswith("/"): + url = url[:-1] + return url diff --git a/pages/support/sites.py b/pages/support/sites.py new file mode 100644 index 0000000..e6e455f --- /dev/null +++ b/pages/support/sites.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +from time import time +from urlparse import urlparse + +from earwigbot import exceptions + +def get_site(bot, lang, project, name, all_projects): + if project not in [proj[0] for proj in all_projects]: + return None + if project == "wikimedia" and name: # Special sites: + try: + return bot.wiki.get_site(name=name) + except exceptions.SiteNotFoundError: + try: + return bot.wiki.add_site(lang=lang, project=project) + except (exceptions.APIError, exceptions.LoginError): + return None + try: + return bot.wiki.get_site(lang=lang, project=project) + except exceptions.SiteNotFoundError: + try: + return bot.wiki.add_site(lang=lang, project=project) + except (exceptions.APIError, exceptions.LoginError): + return None + +def get_sites(bot): + max_staleness = 60 * 60 * 24 * 7 + conn = open_sql_connection(bot, "globals") + query1 = "SELECT update_time FROM updates WHERE update_service = ?" 
+ query2 = "SELECT lang_code, lang_name FROM language" + query3 = "SELECT project_code, project_name FROM project" + with conn.cursor() as cursor: + cursor.execute(query1, ("sites",)) + try: + time_since_update = int(time() - cursor.fetchall()[0][0]) + except IndexError: + time_since_update = time() + if time_since_update > max_staleness: + update_sites(bot.wiki.get_site(), cursor) + cursor.execute(query2) + langs = [] + for code, name in cursor.fetchall(): + if "\U" in name: + name = name.decode("unicode_escape") + langs.append((code, name)) + cursor.execute(query3) + projects = cursor.fetchall() + return langs, projects + +def update_sites(site, cursor): + matrix = site.api_query(action="sitematrix")["sitematrix"] + del matrix["count"] + languages, projects = set(), set() + for site in matrix.itervalues(): + if isinstance(site, list): # Special sites + bad_sites = ["closed", "private", "fishbowl"] + for special in site: + if all([key not in special for key in bad_sites]): + full = urlparse(special["url"]).netloc + if full.count(".") == 1: # No subdomain, so use "www" + lang, project = "www", full.split(".")[0] + else: + lang, project = full.rsplit(".", 2)[:2] + code = u"{0}::{1}".format(lang, special["dbname"]) + name = special["code"].capitalize() + languages.add((code, u"{0} ({1})".format(lang, name))) + projects.add((project, project.capitalize())) + continue + this = set() + for web in site["site"]: + if "closed" in web: + continue + project = "wikipedia" if web["code"] == u"wiki" else web["code"] + this.add((project, project.capitalize())) + if this: + code = site["code"] + if "\U" in site["name"].encode("unicode_escape"): + name = site["name"].encode("unicode_escape") + else: + name = site["name"] + languages.add((code, u"{0} ({1})".format(code, name))) + projects |= this + save_site_updates(cursor, languages, projects) + +def save_site_updates(cursor, languages, projects): + query1 = "SELECT lang_code, lang_name FROM language" + query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?" + query3 = "INSERT INTO language VALUES (?, ?)" + query4 = "SELECT project_code, project_name FROM project" + query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?" + query6 = "INSERT INTO project VALUES (?, ?)" + query7 = "SELECT 1 FROM updates WHERE update_service = ?" + query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?" + query9 = "INSERT INTO updates VALUES (?, ?)" + synchronize_sites_with_db(cursor, languages, query1, query2, query3) + synchronize_sites_with_db(cursor, projects, query4, query5, query6) + cursor.execute(query7, ("sites",)) + if cursor.fetchall(): + cursor.execute(query8, (time(), "sites")) + else: + cursor.execute(query9, ("sites", time())) + +def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update): + removals = [] + cursor.execute(q_list) + for site in cursor: + updates.remove(site) if site in updates else removals.append(site) + cursor.executemany(q_rmv, removals) + cursor.executemany(q_update, updates)