
Trying a radical restructuring...

Ben Kurtovic committed 12 years ago · commit 5bff019edc · branch pull/24/head
6 changed files with 311 additions and 293 deletions
  1. pages/copyvios.mako (+4, -293)
  2. pages/support/copyvios/__init__.py (+4, -0)
  3. pages/support/copyvios/checker.py (+96, -0)
  4. pages/support/copyvios/highlighter.py (+71, -0)
  5. pages/support/misc.py (+26, -0)
  6. pages/support/sites.py (+110, -0)

pages/copyvios.mako (+4, -293)

@@ -1,299 +1,10 @@
<%!
from datetime import datetime
from hashlib import sha256
from itertools import count
from os.path import expanduser
from re import sub, UNICODE
from sys import path
from time import time
from urlparse import parse_qs, urlparse

from earwigbot import exceptions
from urlparse import parse_qs
from earwigbot.bot import Bot
import oursql

def get_results(bot, lang, project, name, all_projects, title, url, query):
    site = get_site(bot, lang, project, name, all_projects)
    if not site:
        return None, None, None
    page = site.get_page(title)
    try:
        page.get()  # Make sure that the page exists before we check it!
    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
        return site, page, None

    # if url:
    #     result = get_url_specific_results(page, url)
    # else:
    #     conn = open_sql_connection(bot, "copyvioCache")
    #     if not query.get("nocache"):
    #         result = get_cached_results(page, conn)
    #     if query.get("nocache") or not result:
    #         result = get_fresh_results(page, conn)
    tstart = time()
    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
    result.cached = False
    result.tdiff = time() - tstart
    # END TEST BLOCK
    return site, page, result

def get_site(bot, lang, project, name, all_projects):
    if project not in [proj[0] for proj in all_projects]:
        return None
    if project == "wikimedia" and name:  # Special sites:
        try:
            return bot.wiki.get_site(name=name)
        except exceptions.SiteNotFoundError:
            try:
                return bot.wiki.add_site(lang=lang, project=project)
            except (exceptions.APIError, exceptions.LoginError):
                return None
    try:
        return bot.wiki.get_site(lang=lang, project=project)
    except exceptions.SiteNotFoundError:
        try:
            return bot.wiki.add_site(lang=lang, project=project)
        except (exceptions.APIError, exceptions.LoginError):
            return None

def get_url_specific_results(page, url):
    t_start = time()
    result = page.copyvio_compare(url)
    result.cached = False
    result.tdiff = time() - t_start
    return result

def open_sql_connection(bot, dbname):
    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)

def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    t_start = time()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
        if not results:
            return None

    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result

def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    if diff.seconds > 3600:
        return "{0} hours".format(diff.seconds / 3600)
    if diff.seconds > 60:
        return "{0} minutes".format(diff.seconds / 60)
    return "{0} seconds".format(diff.seconds)

def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result

def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))

def get_sites(bot):
    max_staleness = 60 * 60 * 24 * 7
    conn = open_sql_connection(bot, "globals")
    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
    query2 = "SELECT lang_code, lang_name FROM language"
    query3 = "SELECT project_code, project_name FROM project"
    with conn.cursor() as cursor:
        cursor.execute(query1, ("sites",))
        try:
            time_since_update = int(time() - cursor.fetchall()[0][0])
        except IndexError:
            time_since_update = time()
        if time_since_update > max_staleness:
            update_sites(bot.wiki.get_site(), cursor)
        cursor.execute(query2)
        langs = []
        for code, name in cursor.fetchall():
            if "\U" in name:
                name = name.decode("unicode_escape")
            langs.append((code, name))
        cursor.execute(query3)
        projects = cursor.fetchall()
    return langs, projects

def update_sites(site, cursor):
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    languages, projects = set(), set()
    for site in matrix.itervalues():
        if isinstance(site, list):  # Special sites
            bad_sites = ["closed", "private", "fishbowl"]
            for special in site:
                if all([key not in special for key in bad_sites]):
                    full = urlparse(special["url"]).netloc
                    if full.count(".") == 1:  # No subdomain, so use "www"
                        lang, project = "www", full.split(".")[0]
                    else:
                        lang, project = full.rsplit(".", 2)[:2]
                    code = u"{0}::{1}".format(lang, special["dbname"])
                    name = special["code"].capitalize()
                    languages.add((code, u"{0} ({1})".format(lang, name)))
                    projects.add((project, project.capitalize()))
            continue
        this = set()
        for web in site["site"]:
            if "closed" in web:
                continue
            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
            this.add((project, project.capitalize()))
        if this:
            code = site["code"]
            if "\U" in site["name"].encode("unicode_escape"):
                name = site["name"].encode("unicode_escape")
            else:
                name = site["name"]
            languages.add((code, u"{0} ({1})".format(code, name)))
            projects |= this
    save_site_updates(cursor, languages, projects)

def save_site_updates(cursor, languages, projects):
    query1 = "SELECT lang_code, lang_name FROM language"
    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
    query3 = "INSERT INTO language VALUES (?, ?)"
    query4 = "SELECT project_code, project_name FROM project"
    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
    query6 = "INSERT INTO project VALUES (?, ?)"
    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
    query9 = "INSERT INTO updates VALUES (?, ?)"
    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
    cursor.execute(query7, ("sites",))
    if cursor.fetchall():
        cursor.execute(query8, (time(), "sites"))
    else:
        cursor.execute(query9, ("sites", time()))

def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
    removals = []
    cursor.execute(q_list)
    for site in cursor:
        updates.remove(site) if site in updates else removals.append(site)
    cursor.executemany(q_rmv, removals)
    cursor.executemany(q_update, updates)

def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for word, i in zip(words, count(i)):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                next = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)  # Block for after
            before = [block in delta.chain and sword in delta.chain[block]]
            after = [alock in delta.chain and next in delta.chain[alock]]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)

def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u"</span>"
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res

def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])

def strip_word(word):
    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)

def urlstrip(url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
%>\
<%namespace file="/support/copyvios/__init__.py" import="get_results, highlight_delta"/>\
<%namespace file="/support/sites.py" import="get_site, get_sites"/>\
<%namespace file="/support/misc.py" import="urlstrip"/>\
<%
lang = orig_lang = project = name = title = url = None
query = parse_qs(environ["QUERY_STRING"])


pages/support/copyvios/__init__.py (+4, -0)

@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-

from .checker import get_results
from .highlighter import highlight_delta

pages/support/copyvios/checker.py (+96, -0)

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-

from datetime import datetime
from hashlib import sha256
from time import time

from earwigbot import exceptions
from sites import get_site  # assumed import path; get_results() below needs get_site, which now lives in pages/support/sites.py

def get_results(bot, lang, project, name, all_projects, title, url, query):
    site = get_site(bot, lang, project, name, all_projects)
    if not site:
        return None, None, None
    page = site.get_page(title)
    try:
        page.get()  # Make sure that the page exists before we check it!
    except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
        return site, page, None

    # if url:
    #     result = get_url_specific_results(page, url)
    # else:
    #     conn = open_sql_connection(bot, "copyvioCache")
    #     if not query.get("nocache"):
    #         result = get_cached_results(page, conn)
    #     if query.get("nocache") or not result:
    #         result = get_fresh_results(page, conn)
    tstart = time()
    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
    result.cached = False
    result.tdiff = time() - tstart
    # END TEST BLOCK
    return site, page, result

def get_url_specific_results(page, url):
    t_start = time()
    result = page.copyvio_compare(url)
    result.cached = False
    result.tdiff = time() - t_start
    return result

def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    t_start = time()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
        if not results:
            return None

    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result

def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    if diff.seconds > 3600:
        return "{0} hours".format(diff.seconds / 3600)
    if diff.seconds > 60:
        return "{0} minutes".format(diff.seconds / 60)
    return "{0} seconds".format(diff.seconds)

def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result

def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get()).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))
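
For orientation, a rough sketch of how the template layer might drive get_results and what the temporary test block hands back. This is not part of the commit: the Bot constructor argument and the projects list are placeholder assumptions, while the result attributes shown are the ones this module itself sets or reads (url, queries, cached, tdiff).

# Hypothetical usage sketch (not part of this commit).
from earwigbot.bot import Bot

bot = Bot("/path/to/config")  # assumed: Bot takes the bot's config directory
projects = [("wikipedia", "Wikipedia")]  # placeholder project list
site, page, result = get_results(bot, "en", "wikipedia", None, projects,
                                 "Example", None, {})
if result:
    print result.url      # "http://example.com/", hard-coded by the test block
    print result.queries  # 7, also hard-coded by the test block
    print result.cached   # False, set explicitly by get_results()
    print result.tdiff    # seconds spent building the placeholder result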

pages/support/copyvios/highlighter.py (+71, -0)

@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

from re import sub, UNICODE

def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for i, word in enumerate(words, i):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                next = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)  # Block for after
            before = [block in delta.chain and sword in delta.chain[block]]
            after = [alock in delta.chain and next in delta.chain[alock]]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)

def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u"</span>"
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res

def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])

def strip_word(word):
    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
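
A minimal sketch of driving highlight_delta, reusing the same MarkovChain and MarkovChainIntersection classes that the test block in checker.py builds; it assumes, as highlight_delta itself does, that the chain object keeps its source text in .text and exposes START and END markers.

# Hypothetical usage sketch (not part of this commit).
from earwigbot.wiki.copyvios import MarkovChain, MarkovChainIntersection

article = MarkovChain(u"This is some random textual content for a page.")
source = MarkovChain(u"Some random textual content was found on another page.")
delta = MarkovChainIntersection(article, source)

# Runs of article words that also appear in the intersection come back wrapped
# in <span class="cv-hl"> (with cv-hl-in / cv-hl-out fades at the edges), and
# paragraphs are joined with <br /><br />.
html = highlight_delta(article, delta)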

pages/support/misc.py (+26, -0)

@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

from os.path import expanduser

import oursql

def open_sql_connection(bot, dbname):
    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)

def urlstrip(context, url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
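
Note that urlstrip here takes the Mako context object as its first argument (the calling convention for namespace functions), so a plain-Python call can simply pass None:

# Hypothetical usage sketch (not part of this commit).
print urlstrip(None, "https://www.example.com/path/")  # -> example.com/path
print urlstrip(None, "http://en.wikipedia.org/")       # -> en.wikipedia.org
# conn = open_sql_connection(bot, "copyvioCache")      # needs a configured Bot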

pages/support/sites.py (+110, -0)

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-

from time import time
from urlparse import urlparse

from earwigbot import exceptions
from misc import open_sql_connection  # assumed import path; get_sites() below needs it, and it now lives in pages/support/misc.py

def get_site(bot, lang, project, name, all_projects):
    if project not in [proj[0] for proj in all_projects]:
        return None
    if project == "wikimedia" and name:  # Special sites:
        try:
            return bot.wiki.get_site(name=name)
        except exceptions.SiteNotFoundError:
            try:
                return bot.wiki.add_site(lang=lang, project=project)
            except (exceptions.APIError, exceptions.LoginError):
                return None
    try:
        return bot.wiki.get_site(lang=lang, project=project)
    except exceptions.SiteNotFoundError:
        try:
            return bot.wiki.add_site(lang=lang, project=project)
        except (exceptions.APIError, exceptions.LoginError):
            return None

def get_sites(bot):
    max_staleness = 60 * 60 * 24 * 7
    conn = open_sql_connection(bot, "globals")
    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
    query2 = "SELECT lang_code, lang_name FROM language"
    query3 = "SELECT project_code, project_name FROM project"
    with conn.cursor() as cursor:
        cursor.execute(query1, ("sites",))
        try:
            time_since_update = int(time() - cursor.fetchall()[0][0])
        except IndexError:
            time_since_update = time()
        if time_since_update > max_staleness:
            update_sites(bot.wiki.get_site(), cursor)
        cursor.execute(query2)
        langs = []
        for code, name in cursor.fetchall():
            if "\U" in name:
                name = name.decode("unicode_escape")
            langs.append((code, name))
        cursor.execute(query3)
        projects = cursor.fetchall()
    return langs, projects

def update_sites(site, cursor):
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    languages, projects = set(), set()
    for site in matrix.itervalues():
        if isinstance(site, list):  # Special sites
            bad_sites = ["closed", "private", "fishbowl"]
            for special in site:
                if all([key not in special for key in bad_sites]):
                    full = urlparse(special["url"]).netloc
                    if full.count(".") == 1:  # No subdomain, so use "www"
                        lang, project = "www", full.split(".")[0]
                    else:
                        lang, project = full.rsplit(".", 2)[:2]
                    code = u"{0}::{1}".format(lang, special["dbname"])
                    name = special["code"].capitalize()
                    languages.add((code, u"{0} ({1})".format(lang, name)))
                    projects.add((project, project.capitalize()))
            continue
        this = set()
        for web in site["site"]:
            if "closed" in web:
                continue
            project = "wikipedia" if web["code"] == u"wiki" else web["code"]
            this.add((project, project.capitalize()))
        if this:
            code = site["code"]
            if "\U" in site["name"].encode("unicode_escape"):
                name = site["name"].encode("unicode_escape")
            else:
                name = site["name"]
            languages.add((code, u"{0} ({1})".format(code, name)))
            projects |= this
    save_site_updates(cursor, languages, projects)

def save_site_updates(cursor, languages, projects):
    query1 = "SELECT lang_code, lang_name FROM language"
    query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
    query3 = "INSERT INTO language VALUES (?, ?)"
    query4 = "SELECT project_code, project_name FROM project"
    query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
    query6 = "INSERT INTO project VALUES (?, ?)"
    query7 = "SELECT 1 FROM updates WHERE update_service = ?"
    query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
    query9 = "INSERT INTO updates VALUES (?, ?)"
    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
    cursor.execute(query7, ("sites",))
    if cursor.fetchall():
        cursor.execute(query8, (time(), "sites"))
    else:
        cursor.execute(query9, ("sites", time()))

def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
    removals = []
    cursor.execute(q_list)
    for site in cursor:
        updates.remove(site) if site in updates else removals.append(site)
    cursor.executemany(q_rmv, removals)
    cursor.executemany(q_update, updates)
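
To tie the pieces together, a sketch of how the form-handling code might use these helpers: get_sites returns the cached (code, name) lists for languages and projects, refreshing them from the sitematrix API once they are more than a week stale, and get_site maps the user's selection to an earwigbot Site. The bot variable is assumed to be a configured earwigbot Bot, and the snippet relies on open_sql_connection being importable from misc.py as noted above.

# Hypothetical usage sketch (not part of this commit).
langs, projects = get_sites(bot)   # e.g. langs = [("en", u"en (English)"), ...]
site = get_site(bot, "en", "wikipedia", None, projects)
if site is None:
    pass  # unknown project, or the site could not be fetched/added via the API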
