From 5bff019edcaa6272ba259d391e5de2d1ff5a4427 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 22 Jul 2012 00:57:18 -0400
Subject: [PATCH] Trying a radical restructuring...

---
 pages/copyvios.mako                   | 297 +--------------------------
 pages/support/copyvios/__init__.py    |   4 +
 pages/support/copyvios/checker.py     |  97 ++++++++++
 pages/support/copyvios/highlighter.py |  71 ++++++++
 pages/support/misc.py                 |  26 +++
 pages/support/sites.py                | 111 ++++++++++++
 6 files changed, 313 insertions(+), 293 deletions(-)
 create mode 100644 pages/support/copyvios/__init__.py
 create mode 100644 pages/support/copyvios/checker.py
 create mode 100644 pages/support/copyvios/highlighter.py
 create mode 100644 pages/support/misc.py
 create mode 100644 pages/support/sites.py

diff --git a/pages/copyvios.mako b/pages/copyvios.mako
index 1e8da4f..5fb64c2 100644
--- a/pages/copyvios.mako
+++ b/pages/copyvios.mako
@@ -1,299 +1,10 @@
 <%!
-    from datetime import datetime
-    from hashlib import sha256
-    from itertools import count
-    from os.path import expanduser
-    from re import sub, UNICODE
-    from sys import path
-    from time import time
-    from urlparse import parse_qs, urlparse
-
-    from earwigbot import exceptions
+    from urlparse import parse_qs
     from earwigbot.bot import Bot
-    import oursql
-
-    def get_results(bot, lang, project, name, all_projects, title, url, query):
-        site = get_site(bot, lang, project, name, all_projects)
-        if not site:
-            return None, None, None
-        page = site.get_page(title)
-        try:
-            page.get()  # Make sure that the page exists before we check it!
-        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
-            return site, page, None
-
-        # if url:
-        #     result = get_url_specific_results(page, url)
-        # else:
-        #     conn = open_sql_connection(bot, "copyvioCache")
-        #     if not query.get("nocache"):
-        #         result = get_cached_results(page, conn)
-        #     if query.get("nocache") or not result:
-        #         result = get_fresh_results(page, conn)
-        tstart = time()
-        mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
-        mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
-        mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
-        result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
-            True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
-        result.cached = False
-        result.tdiff = time() - tstart
-        # END TEST BLOCK
-        return site, page, result
-
-    def get_site(bot, lang, project, name, all_projects):
-        if project not in [proj[0] for proj in all_projects]:
-            return None
-        if project == "wikimedia" and name:  # Special sites:
-            try:
-                return bot.wiki.get_site(name=name)
-            except exceptions.SiteNotFoundError:
-                try:
-                    return bot.wiki.add_site(lang=lang, project=project)
-                except (exceptions.APIError, exceptions.LoginError):
-                    return None
-        try:
-            return bot.wiki.get_site(lang=lang, project=project)
-        except exceptions.SiteNotFoundError:
-            try:
-                return bot.wiki.add_site(lang=lang, project=project)
-            except (exceptions.APIError, exceptions.LoginError):
-                return None
-
-    def get_url_specific_results(page, url):
-        t_start = time()
-        result = page.copyvio_compare(url)
-        result.cached = False
-        result.tdiff = time() - t_start
-        return result
-
-    def open_sql_connection(bot, dbname):
-        conn_args = bot.config.wiki["_toolserverSQL"][dbname]
-        if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
-            conn_args["read_default_file"] = expanduser("~/.my.cnf")
-        if "autoping" not in conn_args:
-            conn_args["autoping"] = True
-        if "autoreconnect" not in conn_args:
-            conn_args["autoreconnect"] = True
-        return oursql.connect(**conn_args)
-
-    def get_cached_results(page, conn):
-        query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
-        query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
-        pageid = page.pageid()
-        hash = sha256(page.get()).hexdigest()
-        t_start = time()
-
-        with conn.cursor() as cursor:
-            cursor.execute(query1)
-            cursor.execute(query2, (pageid, hash))
-            results = cursor.fetchall()
-        if not results:
-            return None
-
-        url, cache_time, num_queries, original_tdiff = results[0]
-        result = page.copyvio_compare(url)
-        result.cached = True
-        result.queries = num_queries
-        result.tdiff = time() - t_start
-        result.original_tdiff = original_tdiff
-        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
-        result.cache_age = format_date(cache_time)
-        return result
-
-    def format_date(cache_time):
-        diff = datetime.utcnow() - cache_time
-        if diff.seconds > 3600:
-            return "{0} hours".format(diff.seconds / 3600)
-        if diff.seconds > 60:
-            return "{0} minutes".format(diff.seconds / 60)
-        return "{0} seconds".format(diff.seconds)
-
-    def get_fresh_results(page, conn):
-        t_start = time()
-        result = page.copyvio_check(max_queries=10)
-        result.cached = False
-        result.tdiff = time() - t_start
-        cache_result(page, result, conn)
-        return result
-
-    def cache_result(page, result, conn):
-        pageid = page.pageid()
-        hash = sha256(page.get()).hexdigest()
-        query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
-        query2 = "DELETE FROM cache WHERE cache_id = ?"
-        query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
-        with conn.cursor() as cursor:
-            cursor.execute(query1, (pageid,))
-            if cursor.fetchall():
-                cursor.execute(query2, (pageid,))
-            cursor.execute(query3, (pageid, hash, result.url, result.queries,
-                                    result.tdiff))
-
-    def get_sites(bot):
-        max_staleness = 60 * 60 * 24 * 7
-        conn = open_sql_connection(bot, "globals")
-        query1 = "SELECT update_time FROM updates WHERE update_service = ?"
-        query2 = "SELECT lang_code, lang_name FROM language"
-        query3 = "SELECT project_code, project_name FROM project"
-        with conn.cursor() as cursor:
-            cursor.execute(query1, ("sites",))
-            try:
-                time_since_update = int(time() - cursor.fetchall()[0][0])
-            except IndexError:
-                time_since_update = time()
-            if time_since_update > max_staleness:
-                update_sites(bot.wiki.get_site(), cursor)
-            cursor.execute(query2)
-            langs = []
-            for code, name in cursor.fetchall():
-                if "\U" in name:
-                    name = name.decode("unicode_escape")
-                langs.append((code, name))
-            cursor.execute(query3)
-            projects = cursor.fetchall()
-        return langs, projects
-
-    def update_sites(site, cursor):
-        matrix = site.api_query(action="sitematrix")["sitematrix"]
-        del matrix["count"]
-        languages, projects = set(), set()
-        for site in matrix.itervalues():
-            if isinstance(site, list):  # Special sites
-                bad_sites = ["closed", "private", "fishbowl"]
-                for special in site:
-                    if all([key not in special for key in bad_sites]):
-                        full = urlparse(special["url"]).netloc
-                        if full.count(".") == 1:  # No subdomain, so use "www"
-                            lang, project = "www", full.split(".")[0]
-                        else:
-                            lang, project = full.rsplit(".", 2)[:2]
-                        code = u"{0}::{1}".format(lang, special["dbname"])
-                        name = special["code"].capitalize()
-                        languages.add((code, u"{0} ({1})".format(lang, name)))
-                        projects.add((project, project.capitalize()))
-                continue
-            this = set()
-            for web in site["site"]:
-                if "closed" in web:
-                    continue
-                project = "wikipedia" if web["code"] == u"wiki" else web["code"]
-                this.add((project, project.capitalize()))
-            if this:
-                code = site["code"]
-                if "\U" in site["name"].encode("unicode_escape"):
-                    name = site["name"].encode("unicode_escape")
-                else:
-                    name = site["name"]
-                languages.add((code, u"{0} ({1})".format(code, name)))
-                projects |= this
-        save_site_updates(cursor, languages, projects)
-
-    def save_site_updates(cursor, languages, projects):
-        query1 = "SELECT lang_code, lang_name FROM language"
-        query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
-        query3 = "INSERT INTO language VALUES (?, ?)"
-        query4 = "SELECT project_code, project_name FROM project"
-        query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
-        query6 = "INSERT INTO project VALUES (?, ?)"
-        query7 = "SELECT 1 FROM updates WHERE update_service = ?"
-        query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
- query9 = "INSERT INTO updates VALUES (?, ?)" - synchronize_sites_with_db(cursor, languages, query1, query2, query3) - synchronize_sites_with_db(cursor, projects, query4, query5, query6) - cursor.execute(query7, ("sites",)) - if cursor.fetchall(): - cursor.execute(query8, (time(), "sites")) - else: - cursor.execute(query9, ("sites", time())) - - def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update): - removals = [] - cursor.execute(q_list) - for site in cursor: - updates.remove(site) if site in updates else removals.append(site) - cursor.executemany(q_rmv, removals) - cursor.executemany(q_update, updates) - - def highlight_delta(chain, delta): - processed = [] - prev_prev = prev = chain.START - i = 0 - all_words = chain.text.split() - paragraphs = chain.text.split("\n") - for paragraph in paragraphs: - processed_words = [] - words = paragraph.split(" ") - for word, i in zip(words, count(i)): - try: - next = strip_word(all_words[i+1]) - except IndexError: - next = chain.END - sword = strip_word(word) - block = (prev_prev, prev) # Block for before - alock = (prev, sword) # Block for after - before = [block in delta.chain and sword in delta.chain[block]] - after = [alock in delta.chain and next in delta.chain[alock]] - is_first = i == 0 - is_last = i + 1 == len(all_words) - res = highlight_word(word, before, after, is_first, is_last) - processed_words.append(res) - prev_prev = prev - prev = sword - processed.append(u" ".join(processed_words)) - i += 1 - return u"

".join(processed) - - def highlight_word(word, before, after, is_first, is_last): - if before and after: - # Word is in the middle of a highlighted block, so don't change - # anything unless this is the first word (force block to start) or - # the last word (force block to end): - res = word - if is_first: - res = u'' + res - if is_last: - res += u'' - elif before: - # Word is the last in a highlighted block, so fade it out and then - # end the block; force open a block before the word if this is the - # first word: - res = fade_word(word, u"out") + u"" - if is_first: - res = u'' + res - elif after: - # Word is the first in a highlighted block, so start the block and - # then fade it in; force close the block after the word if this is - # the last word: - res = u'' + fade_word(word, u"in") - if is_last: - res += u"" - else: - # Word is completely outside of a highlighted block, so do nothing: - res = word - return res - - def fade_word(word, dir): - if len(word) <= 4: - return u'{1}'.format(dir, word) - if dir == u"out": - return u'{0}{1}'.format(word[:-4], word[-4:]) - return u'{0}{1}'.format(word[:4], word[4:]) - - def strip_word(word): - return sub("[^\w\s-]", "", word.lower(), flags=UNICODE) - - def urlstrip(url): - if url.startswith("http://"): - url = url[7:] - if url.startswith("https://"): - url = url[8:] - if url.startswith("www."): - url = url[4:] - if url.endswith("/"): - url = url[:-1] - return url %>\ +<%namespace file="/support/copyvios/__init__.py" import="get_results, highlight_delta"/>\ +<%namespace file="/support/sites.py" import="get_site, get_sites"/>\ +<%namespace file="/support/misc.py" import="urlstrip"/>\ <% lang = orig_lang = project = name = title = url = None query = parse_qs(environ["QUERY_STRING"]) diff --git a/pages/support/copyvios/__init__.py b/pages/support/copyvios/__init__.py new file mode 100644 index 0000000..0908dfe --- /dev/null +++ b/pages/support/copyvios/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- + +from .checker import get_results +from .highlighter import highlight_delta diff --git a/pages/support/copyvios/checker.py b/pages/support/copyvios/checker.py new file mode 100644 index 0000000..513a568 --- /dev/null +++ b/pages/support/copyvios/checker.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime +from hashlib import sha256 +from time import time + +from earwigbot import exceptions + +def get_results(bot, lang, project, name, all_projects, title, url, query): + site = get_site(bot, lang, project, name, all_projects) + if not site: + return None, None, None + page = site.get_page(title) + try: + page.get() # Make sure that the page exists before we check it! 
+    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
+        return site, page, None
+
+    # if url:
+    #     result = get_url_specific_results(page, url)
+    # else:
+    #     conn = open_sql_connection(bot, "copyvioCache")
+    #     if not query.get("nocache"):
+    #         result = get_cached_results(page, conn)
+    #     if query.get("nocache") or not result:
+    #         result = get_fresh_results(page, conn)
+    tstart = time()
+    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
+    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
+    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
+    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
+        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
+    result.cached = False
+    result.tdiff = time() - tstart
+    # END TEST BLOCK
+    return site, page, result
+
+def get_url_specific_results(page, url):
+    t_start = time()
+    result = page.copyvio_compare(url)
+    result.cached = False
+    result.tdiff = time() - t_start
+    return result
+
+def get_cached_results(page, conn):
+    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
+    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    t_start = time()
+
+    with conn.cursor() as cursor:
+        cursor.execute(query1)
+        cursor.execute(query2, (pageid, hash))
+        results = cursor.fetchall()
+    if not results:
+        return None
+
+    url, cache_time, num_queries, original_tdiff = results[0]
+    result = page.copyvio_compare(url)
+    result.cached = True
+    result.queries = num_queries
+    result.tdiff = time() - t_start
+    result.original_tdiff = original_tdiff
+    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
+    result.cache_age = format_date(cache_time)
+    return result
+
+def format_date(cache_time):
+    diff = datetime.utcnow() - cache_time
+    if diff.seconds > 3600:
+        return "{0} hours".format(diff.seconds / 3600)
+    if diff.seconds > 60:
+        return "{0} minutes".format(diff.seconds / 60)
+    return "{0} seconds".format(diff.seconds)
+
+def get_fresh_results(page, conn):
+    t_start = time()
+    result = page.copyvio_check(max_queries=10)
+    result.cached = False
+    result.tdiff = time() - t_start
+    cache_result(page, result, conn)
+    return result
+
+def cache_result(page, result, conn):
+    pageid = page.pageid()
+    hash = sha256(page.get()).hexdigest()
+    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
+    query2 = "DELETE FROM cache WHERE cache_id = ?"
+ query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)" + with conn.cursor() as cursor: + cursor.execute(query1, (pageid,)) + if cursor.fetchall(): + cursor.execute(query2, (pageid,)) + cursor.execute(query3, (pageid, hash, result.url, result.queries, + result.tdiff)) diff --git a/pages/support/copyvios/highlighter.py b/pages/support/copyvios/highlighter.py new file mode 100644 index 0000000..40be5b8 --- /dev/null +++ b/pages/support/copyvios/highlighter.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +from re import sub, UNICODE + +def highlight_delta(chain, delta): + processed = [] + prev_prev = prev = chain.START + i = 0 + all_words = chain.text.split() + paragraphs = chain.text.split("\n") + for paragraph in paragraphs: + processed_words = [] + words = paragraph.split(" ") + for i, word in enumerate(words, i) + try: + next = strip_word(all_words[i+1]) + except IndexError: + next = chain.END + sword = strip_word(word) + block = (prev_prev, prev) # Block for before + alock = (prev, sword) # Block for after + before = [block in delta.chain and sword in delta.chain[block]] + after = [alock in delta.chain and next in delta.chain[alock]] + is_first = i == 0 + is_last = i + 1 == len(all_words) + res = highlight_word(word, before, after, is_first, is_last) + processed_words.append(res) + prev_prev = prev + prev = sword + processed.append(u" ".join(processed_words)) + i += 1 + return u"

".join(processed) + +def highlight_word(word, before, after, is_first, is_last): + if before and after: + # Word is in the middle of a highlighted block, so don't change + # anything unless this is the first word (force block to start) or + # the last word (force block to end): + res = word + if is_first: + res = u'' + res + if is_last: + res += u'' + elif before: + # Word is the last in a highlighted block, so fade it out and then + # end the block; force open a block before the word if this is the + # first word: + res = fade_word(word, u"out") + u"
" + if is_first: + res = u'' + res + elif after: + # Word is the first in a highlighted block, so start the block and + # then fade it in; force close the block after the word if this is + # the last word: + res = u'' + fade_word(word, u"in") + if is_last: + res += u"" + else: + # Word is completely outside of a highlighted block, so do nothing: + res = word + return res + +def fade_word(word, dir): + if len(word) <= 4: + return u'{1}'.format(dir, word) + if dir == u"out": + return u'{0}{1}'.format(word[:-4], word[-4:]) + return u'{0}{1}'.format(word[:4], word[4:]) + +def strip_word(word): + return sub("[^\w\s-]", "", word.lower(), flags=UNICODE) diff --git a/pages/support/misc.py b/pages/support/misc.py new file mode 100644 index 0000000..1c9a5b4 --- /dev/null +++ b/pages/support/misc.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- + +from os.path import expanduser + +import oursql + +def open_sql_connection(bot, dbname): + conn_args = bot.config.wiki["_toolserverSQL"][dbname] + if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args: + conn_args["read_default_file"] = expanduser("~/.my.cnf") + if "autoping" not in conn_args: + conn_args["autoping"] = True + if "autoreconnect" not in conn_args: + conn_args["autoreconnect"] = True + return oursql.connect(**conn_args) + +def urlstrip(context, url): + if url.startswith("http://"): + url = url[7:] + if url.startswith("https://"): + url = url[8:] + if url.startswith("www."): + url = url[4:] + if url.endswith("/"): + url = url[:-1] + return url diff --git a/pages/support/sites.py b/pages/support/sites.py new file mode 100644 index 0000000..e6e455f --- /dev/null +++ b/pages/support/sites.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +from time import time +from urlparse import urlparse + +from earwigbot import exceptions + +def get_site(bot, lang, project, name, all_projects): + if project not in [proj[0] for proj in all_projects]: + return None + if project == "wikimedia" and name: # Special sites: + try: + return bot.wiki.get_site(name=name) + except exceptions.SiteNotFoundError: + try: + return bot.wiki.add_site(lang=lang, project=project) + except (exceptions.APIError, exceptions.LoginError): + return None + try: + return bot.wiki.get_site(lang=lang, project=project) + except exceptions.SiteNotFoundError: + try: + return bot.wiki.add_site(lang=lang, project=project) + except (exceptions.APIError, exceptions.LoginError): + return None + +def get_sites(bot): + max_staleness = 60 * 60 * 24 * 7 + conn = open_sql_connection(bot, "globals") + query1 = "SELECT update_time FROM updates WHERE update_service = ?" 
+ query2 = "SELECT lang_code, lang_name FROM language" + query3 = "SELECT project_code, project_name FROM project" + with conn.cursor() as cursor: + cursor.execute(query1, ("sites",)) + try: + time_since_update = int(time() - cursor.fetchall()[0][0]) + except IndexError: + time_since_update = time() + if time_since_update > max_staleness: + update_sites(bot.wiki.get_site(), cursor) + cursor.execute(query2) + langs = [] + for code, name in cursor.fetchall(): + if "\U" in name: + name = name.decode("unicode_escape") + langs.append((code, name)) + cursor.execute(query3) + projects = cursor.fetchall() + return langs, projects + +def update_sites(site, cursor): + matrix = site.api_query(action="sitematrix")["sitematrix"] + del matrix["count"] + languages, projects = set(), set() + for site in matrix.itervalues(): + if isinstance(site, list): # Special sites + bad_sites = ["closed", "private", "fishbowl"] + for special in site: + if all([key not in special for key in bad_sites]): + full = urlparse(special["url"]).netloc + if full.count(".") == 1: # No subdomain, so use "www" + lang, project = "www", full.split(".")[0] + else: + lang, project = full.rsplit(".", 2)[:2] + code = u"{0}::{1}".format(lang, special["dbname"]) + name = special["code"].capitalize() + languages.add((code, u"{0} ({1})".format(lang, name))) + projects.add((project, project.capitalize())) + continue + this = set() + for web in site["site"]: + if "closed" in web: + continue + project = "wikipedia" if web["code"] == u"wiki" else web["code"] + this.add((project, project.capitalize())) + if this: + code = site["code"] + if "\U" in site["name"].encode("unicode_escape"): + name = site["name"].encode("unicode_escape") + else: + name = site["name"] + languages.add((code, u"{0} ({1})".format(code, name))) + projects |= this + save_site_updates(cursor, languages, projects) + +def save_site_updates(cursor, languages, projects): + query1 = "SELECT lang_code, lang_name FROM language" + query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?" + query3 = "INSERT INTO language VALUES (?, ?)" + query4 = "SELECT project_code, project_name FROM project" + query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?" + query6 = "INSERT INTO project VALUES (?, ?)" + query7 = "SELECT 1 FROM updates WHERE update_service = ?" + query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?" + query9 = "INSERT INTO updates VALUES (?, ?)" + synchronize_sites_with_db(cursor, languages, query1, query2, query3) + synchronize_sites_with_db(cursor, projects, query4, query5, query6) + cursor.execute(query7, ("sites",)) + if cursor.fetchall(): + cursor.execute(query8, (time(), "sites")) + else: + cursor.execute(query9, ("sites", time())) + +def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update): + removals = [] + cursor.execute(q_list) + for site in cursor: + updates.remove(site) if site in updates else removals.append(site) + cursor.executemany(q_rmv, removals) + cursor.executemany(q_update, updates)