
Trying a radical restructuring...

pull/24/head
Ben Kurtovic, 12 years ago
Parent revision 5bff019edc
6 files changed, 311 additions and 293 deletions
  1. pages/copyvios.mako (+4, -293)
  2. pages/support/copyvios/__init__.py (+4, -0)
  3. pages/support/copyvios/checker.py (+96, -0)
  4. pages/support/copyvios/highlighter.py (+71, -0)
  5. pages/support/misc.py (+26, -0)
  6. pages/support/sites.py (+110, -0)

pages/copyvios.mako (+4, -293)

@@ -1,299 +1,10 @@
<%!
from datetime import datetime
from hashlib import sha256
from itertools import count
from os.path import expanduser
from re import sub, UNICODE
from sys import path
from time import time
from urlparse import parse_qs, urlparse

from earwigbot import exceptions
from urlparse import parse_qs
from earwigbot.bot import Bot
import oursql

def get_results(bot, lang, project, name, all_projects, title, url, query):
    site = get_site(bot, lang, project, name, all_projects)
    if not site:
        return None, None, None
    page = site.get_page(title)
    try:
        page.get()  # Make sure that the page exists before we check it!
    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
        return site, page, None

    # if url:
    #     result = get_url_specific_results(page, url)
    # else:
    #     conn = open_sql_connection(bot, "copyvioCache")
    #     if not query.get("nocache"):
    #         result = get_cached_results(page, conn)
    #     if query.get("nocache") or not result:
    #         result = get_fresh_results(page, conn)
    tstart = time()
    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
    result.cached = False
    result.tdiff = time() - tstart
    # END TEST BLOCK
    return site, page, result

def get_site(bot, lang, project, name, all_projects):
    if project not in [proj[0] for proj in all_projects]:
        return None
    if project == "wikimedia" and name:  # Special sites:
        try:
            return bot.wiki.get_site(name=name)
        except exceptions.SiteNotFoundError:
            try:
                return bot.wiki.add_site(lang=lang, project=project)
            except (exceptions.APIError, exceptions.LoginError):
                return None
    try:
        return bot.wiki.get_site(lang=lang, project=project)
    except exceptions.SiteNotFoundError:
        try:
            return bot.wiki.add_site(lang=lang, project=project)
        except (exceptions.APIError, exceptions.LoginError):
            return None

def get_url_specific_results(page, url):
    t_start = time()
    result = page.copyvio_compare(url)
    result.cached = False
    result.tdiff = time() - t_start
    return result

def open_sql_connection(bot, dbname):
    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)

def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    t_start = time()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
    if not results:
        return None

    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result

def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    if diff.seconds > 3600:
        return "{0} hours".format(diff.seconds / 3600)
    if diff.seconds > 60:
        return "{0} minutes".format(diff.seconds / 60)
    return "{0} seconds".format(diff.seconds)

def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result

def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))

def get_sites(bot):
    max_staleness = 60 * 60 * 24 * 7
    conn = open_sql_connection(bot, "globals")
    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
    query2 = "SELECT lang_code, lang_name FROM language"
    query3 = "SELECT project_code, project_name FROM project"
    with conn.cursor() as cursor:
        cursor.execute(query1, ("sites",))
        try:
            time_since_update = int(time() - cursor.fetchall()[0][0])
        except IndexError:
            time_since_update = time()
        if time_since_update > max_staleness:
            update_sites(bot.wiki.get_site(), cursor)
        cursor.execute(query2)
        langs = []
        for code, name in cursor.fetchall():
            if "\U" in name:
                name = name.decode("unicode_escape")
            langs.append((code, name))
        cursor.execute(query3)
        projects = cursor.fetchall()
    return langs, projects

def update_sites(site, cursor):
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    languages, projects = set(), set()
    for site in matrix.itervalues():
        if isinstance(site, list):  # Special sites
            bad_sites = ["closed", "private", "fishbowl"]
            for special in site:
                if all([key not in special for key in bad_sites]):
                    full = urlparse(special["url"]).netloc
                    if full.count(".") == 1:  # No subdomain, so use "www"
                        lang, project = "www", full.split(".")[0]
                    else:
                        lang, project = full.rsplit(".", 2)[:2]
                    code = u"{0}::{1}".format(lang, special["dbname"])
                    name = special["code"].capitalize()
                    languages.add((code, u"{0} ({1})".format(lang, name)))
                    projects.add((project, project.capitalize()))
            continue
        this = set()
        for web in site["site"]:
            if "closed" in web:
                continue
            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
            this.add((project, project.capitalize()))
        if this:
            code = site["code"]
            if "\U" in site["name"].encode("unicode_escape"):
                name = site["name"].encode("unicode_escape")
            else:
                name = site["name"]
            languages.add((code, u"{0} ({1})".format(code, name)))
            projects |= this
    save_site_updates(cursor, languages, projects)

def save_site_updates(cursor, languages, projects):
    query1 = "SELECT lang_code, lang_name FROM language"
    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
    query3 = "INSERT INTO language VALUES (?, ?)"
    query4 = "SELECT project_code, project_name FROM project"
    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
    query6 = "INSERT INTO project VALUES (?, ?)"
    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
    query9 = "INSERT INTO updates VALUES (?, ?)"
    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
    cursor.execute(query7, ("sites",))
    if cursor.fetchall():
        cursor.execute(query8, (time(), "sites"))
    else:
        cursor.execute(query9, ("sites", time()))

def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
    removals = []
    cursor.execute(q_list)
    for site in cursor:
        updates.remove(site) if site in updates else removals.append(site)
    cursor.executemany(q_rmv, removals)
    cursor.executemany(q_update, updates)

def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for word, i in zip(words, count(i)):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                next = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)  # Block for after
            before = [block in delta.chain and sword in delta.chain[block]]
            after = [alock in delta.chain and next in delta.chain[alock]]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)

def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u"</span>"
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res

def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])

def strip_word(word):
    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)

def urlstrip(url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
%>\
<%namespace file="/support/copyvios/__init__.py" import="get_results, highlight_delta"/>\
<%namespace file="/support/sites.py" import="get_site, get_sites"/>\
<%namespace file="/support/misc.py" import="urlstrip"/>\
<%
    lang = orig_lang = project = name = title = url = None
    query = parse_qs(environ["QUERY_STRING"])


pages/support/copyvios/__init__.py (+4, -0)

@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-

from .checker import get_results
from .highlighter import highlight_delta

pages/support/copyvios/checker.py (+96, -0)

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-

from datetime import datetime
from hashlib import sha256
from time import time

from earwigbot import exceptions

def get_results(bot, lang, project, name, all_projects, title, url, query):
    site = get_site(bot, lang, project, name, all_projects)
    if not site:
        return None, None, None
    page = site.get_page(title)
    try:
        page.get()  # Make sure that the page exists before we check it!
    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
        return site, page, None

    # if url:
    #     result = get_url_specific_results(page, url)
    # else:
    #     conn = open_sql_connection(bot, "copyvioCache")
    #     if not query.get("nocache"):
    #         result = get_cached_results(page, conn)
    #     if query.get("nocache") or not result:
    #         result = get_fresh_results(page, conn)
    tstart = time()
    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
    result.cached = False
    result.tdiff = time() - tstart
    # END TEST BLOCK
    return site, page, result

def get_url_specific_results(page, url):
    t_start = time()
    result = page.copyvio_compare(url)
    result.cached = False
    result.tdiff = time() - t_start
    return result

def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    t_start = time()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
    if not results:
        return None

    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result

def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    if diff.seconds > 3600:
        return "{0} hours".format(diff.seconds / 3600)
    if diff.seconds > 60:
        return "{0} minutes".format(diff.seconds / 60)
    return "{0} seconds".format(diff.seconds)

def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result

def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))
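
A rough usage sketch (not part of this commit): how the relocated checker and site helpers might be driven together. The Bot(".earwigbot") config path and the pages.support.* import paths are assumptions for illustration only.

# Hypothetical driver; import paths and the Bot() config directory are
# assumptions, not part of the commit above.
from earwigbot.bot import Bot

from pages.support.copyvios.checker import get_results
from pages.support.sites import get_sites

bot = Bot(".earwigbot")  # assumed path to the bot's working directory
langs, all_projects = get_sites(bot)  # (code, name) pairs for the form menus
site, page, result = get_results(bot, "en", "wikipedia", None, all_projects,
                                 "Example", None, {})
if result is not None:
    # result.url, result.tdiff, and result.cached are set in get_results()
    print result.url, result.tdiff, result.cached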

pages/support/copyvios/highlighter.py (+71, -0)

@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

from re import sub, UNICODE

def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for i, word in enumerate(words, i):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                next = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)  # Block for after
            before = [block in delta.chain and sword in delta.chain[block]]
            after = [alock in delta.chain and next in delta.chain[alock]]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)

def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u"</span>"
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res

def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])

def strip_word(word):
    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
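
For reference (derived from the functions above, not part of the commit), the fading helpers wrap short words whole and only fade the outer four characters of longer ones; the import path below is an assumption.

# Quick demonstration of the highlighter helpers defined above.
from pages.support.copyvios.highlighter import fade_word, strip_word

print fade_word(u"text", u"out")
# -> <span class="cv-hl-out">text</span>
print fade_word(u"comparison", u"out")
# -> compar<span class="cv-hl-out">ison</span>
print fade_word(u"comparison", u"in")
# -> <span class="cv-hl-in">comp</span>arison

print strip_word(u"Hello,")  # punctuation stripped, lowercased
# -> hello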

pages/support/misc.py (+26, -0)

@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

from os.path import expanduser

import oursql

def open_sql_connection(bot, dbname):
    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)

def urlstrip(context, url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
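
A minimal sketch of the new urlstrip() helper (assuming the module is importable as below); the first argument stands in for the Mako context object that the template engine would normally supply.

# Demonstration of urlstrip(); None substitutes for the Mako context.
from pages.support.misc import urlstrip

print urlstrip(None, "http://www.example.com/wiki/Foo/")
# -> example.com/wiki/Foo
print urlstrip(None, "https://en.wikipedia.org/")
# -> en.wikipedia.org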

pages/support/sites.py (+110, -0)

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-

from time import time
from urlparse import urlparse

from earwigbot import exceptions

def get_site(bot, lang, project, name, all_projects):
    if project not in [proj[0] for proj in all_projects]:
        return None
    if project == "wikimedia" and name:  # Special sites:
        try:
            return bot.wiki.get_site(name=name)
        except exceptions.SiteNotFoundError:
            try:
                return bot.wiki.add_site(lang=lang, project=project)
            except (exceptions.APIError, exceptions.LoginError):
                return None
    try:
        return bot.wiki.get_site(lang=lang, project=project)
    except exceptions.SiteNotFoundError:
        try:
            return bot.wiki.add_site(lang=lang, project=project)
        except (exceptions.APIError, exceptions.LoginError):
            return None

def get_sites(bot):
    max_staleness = 60 * 60 * 24 * 7
    conn = open_sql_connection(bot, "globals")
    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
    query2 = "SELECT lang_code, lang_name FROM language"
    query3 = "SELECT project_code, project_name FROM project"
    with conn.cursor() as cursor:
        cursor.execute(query1, ("sites",))
        try:
            time_since_update = int(time() - cursor.fetchall()[0][0])
        except IndexError:
            time_since_update = time()
        if time_since_update > max_staleness:
            update_sites(bot.wiki.get_site(), cursor)
        cursor.execute(query2)
        langs = []
        for code, name in cursor.fetchall():
            if "\U" in name:
                name = name.decode("unicode_escape")
            langs.append((code, name))
        cursor.execute(query3)
        projects = cursor.fetchall()
    return langs, projects

def update_sites(site, cursor):
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    languages, projects = set(), set()
    for site in matrix.itervalues():
        if isinstance(site, list):  # Special sites
            bad_sites = ["closed", "private", "fishbowl"]
            for special in site:
                if all([key not in special for key in bad_sites]):
                    full = urlparse(special["url"]).netloc
                    if full.count(".") == 1:  # No subdomain, so use "www"
                        lang, project = "www", full.split(".")[0]
                    else:
                        lang, project = full.rsplit(".", 2)[:2]
                    code = u"{0}::{1}".format(lang, special["dbname"])
                    name = special["code"].capitalize()
                    languages.add((code, u"{0} ({1})".format(lang, name)))
                    projects.add((project, project.capitalize()))
            continue
        this = set()
        for web in site["site"]:
            if "closed" in web:
                continue
            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
            this.add((project, project.capitalize()))
        if this:
            code = site["code"]
            if "\U" in site["name"].encode("unicode_escape"):
                name = site["name"].encode("unicode_escape")
            else:
                name = site["name"]
            languages.add((code, u"{0} ({1})".format(code, name)))
            projects |= this
    save_site_updates(cursor, languages, projects)

def save_site_updates(cursor, languages, projects):
    query1 = "SELECT lang_code, lang_name FROM language"
    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
    query3 = "INSERT INTO language VALUES (?, ?)"
    query4 = "SELECT project_code, project_name FROM project"
    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
    query6 = "INSERT INTO project VALUES (?, ?)"
    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
    query9 = "INSERT INTO updates VALUES (?, ?)"
    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
    cursor.execute(query7, ("sites",))
    if cursor.fetchall():
        cursor.execute(query8, (time(), "sites"))
    else:
        cursor.execute(query9, ("sites", time()))

def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
    removals = []
    cursor.execute(q_list)
    for site in cursor:
        updates.remove(site) if site in updates else removals.append(site)
    cursor.executemany(q_rmv, removals)
    cursor.executemany(q_update, updates)
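
The lang/project pair for the "special" Wikimedia sites is derived by splitting the site's domain name; a standalone sketch of that logic follows (split_netloc is a hypothetical helper used only for this illustration):

# Stand-alone sketch of the domain-splitting logic in update_sites().
from urlparse import urlparse

def split_netloc(url):
    full = urlparse(url).netloc
    if full.count(".") == 1:  # No subdomain, so use "www"
        return "www", full.split(".")[0]
    return tuple(full.rsplit(".", 2)[:2])

print split_netloc("http://commons.wikimedia.org/")  # -> ('commons', 'wikimedia')
print split_netloc("http://wikisource.org/")         # -> ('www', 'wikisource')
print split_netloc("http://meta.wikimedia.org/")     # -> ('meta', 'wikimedia')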
