A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/

copyvios.mako 18 KiB

<%!
from datetime import datetime
from hashlib import sha256
from itertools import count
from os.path import expanduser
from re import sub, UNICODE
from sys import path
from time import time
from urlparse import parse_qs

from earwigbot import bot, exceptions
import oursql

def get_results(bot, lang, project, title, url, query):
    try:
        site = bot.wiki.get_site(lang=lang, project=project)  # UPDATE ME FOR SPECIAL SITES!
    except exceptions.SiteNotFoundError:
        try:
            site = bot.wiki.add_site(lang=lang, project=project)  # TODO: what if the site doesn't exist?
        except exceptions.APIError:
            return None, None
    page = site.get_page(title)  # TODO: what if the page doesn't exist?
    # if url:
    #     result = get_url_specific_results(page, url)
    # else:
    #     conn = open_sql_connection(bot, "copyvioCache")
    #     if not query.get("nocache"):
    #         result = get_cached_results(page, conn)
    #     if query.get("nocache") or not result:
    #         result = get_fresh_results(page, conn)
    # Temporary stub: return a hard-coded fake result instead of running the
    # real check (commented out above).
    mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
    mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain("This is some random textual content for a page.")
    mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
    result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
        True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
    return page, result

def get_url_specific_results(page, url):
    t_start = time()
    result = page.copyvio_compare(url)
    result.tdiff = time() - t_start
    return result

def open_sql_connection(bot, dbname):
    conn_args = bot.config.wiki["_toolserverSQL"][dbname]
    if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
        conn_args["read_default_file"] = expanduser("~/.my.cnf")
    if "autoping" not in conn_args:
        conn_args["autoping"] = True
    if "autoreconnect" not in conn_args:
        conn_args["autoreconnect"] = True
    return oursql.connect(**conn_args)
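
# A hypothetical sketch of the config block consumed above (database names
# match the callers below; hosts and schema names are illustrative only):
#
#     _toolserverSQL:
#         copyvioCache: {host: "sql.example.org", db: "copyvio_cache"}
#         globals:      {host: "sql.example.org", db: "toolserver_globals"}
#
# Anything unspecified falls back to ~/.my.cnf credentials via
# read_default_file, with autoping/autoreconnect enabled by default.
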
def get_cached_results(page, conn):
    query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
    query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
    pageid = page.pageid()
    hash = sha256(page.get().encode("utf8")).hexdigest()  # page text is unicode; hash its UTF-8 bytes
    t_start = time()
    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (pageid, hash))
        results = cursor.fetchall()
    if not results:
        return None
    url, cache_time, num_queries, original_tdiff = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.tdiff = time() - t_start
    result.original_tdiff = original_tdiff
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = format_date(cache_time)
    return result

def format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    total = diff.days * 86400 + diff.seconds  # timedelta.seconds alone ignores full days
    if total > 3600:
        return "{0} hours".format(total / 3600)
    if total > 60:
        return "{0} minutes".format(total / 60)
    return "{0} seconds".format(total)
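
# e.g. an entry cached 7,500 seconds ago formats as "2 hours", since Python 2
# integer division truncates 7500 / 3600.
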
def get_fresh_results(page, conn):
    t_start = time()
    result = page.copyvio_check(max_queries=10)
    result.cached = False
    result.tdiff = time() - t_start
    cache_result(page, result, conn)
    return result

def cache_result(page, result, conn):
    pageid = page.pageid()
    hash = sha256(page.get().encode("utf8")).hexdigest()
    query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_id = ?"
    query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
    with conn.cursor() as cursor:
        cursor.execute(query1, (pageid,))
        if cursor.fetchall():
            cursor.execute(query2, (pageid,))
        cursor.execute(query3, (pageid, hash, result.url, result.queries,
                                result.tdiff))
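
# Implied layout of the cache table, reconstructed from the queries above
# (the real DDL may differ in types and sizes):
#     cache_id (page ID), cache_hash (SHA-256 hex of the page text),
#     cache_url, cache_time TIMESTAMP, cache_queries, cache_process_time
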
def get_sites(bot):
    max_staleness = 60 * 60 * 24 * 7
    site = bot.wiki.get_site()
    conn = open_sql_connection(bot, "globals")  # needs the bot, which holds the config
    query1 = "SELECT update_time FROM updates WHERE update_service = ?"
    query2 = "SELECT lang_code, lang_name FROM languages"
    query3 = "SELECT project_code, project_name FROM projects"
    with conn.cursor() as cursor:
        cursor.execute(query1, ("sites",))
        time_since_update = int(time() - cursor.fetchall()[0][0])
        if time_since_update > max_staleness:
            update_sites(site, cursor)  # needs a site, which provides api_query()
        langs = load_sites_from_db(cursor, query2, site.lang)
        projects = load_sites_from_db(cursor, query3, site.project)
    langs = "\n".join(langs)
    projects = "\n".join(projects)
    result = '<select name="lang">\n{0}\n</select>\n'.format(langs)
    result += '<select name="project">\n{0}\n</select>'.format(projects)
    return result
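
# Illustrative sample of the markup get_sites() returns (the option values
# and labels depend on the database contents):
#     <select name="lang">
#     <option value="en" selected="selected">en (English)</option>
#     <option value="fr">fr (French)</option>
#     </select>
#     <select name="project">
#     <option value="wikipedia" selected="selected">Wikipedia</option>
#     </select>
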
def update_sites(site, cursor):
    matrix = site.api_query(action="sitematrix")["sitematrix"]
    del matrix["count"]
    languages, projects = set(), set()
    for site in matrix.itervalues():
        if isinstance(site, list):  # Special sites
            projects.add(("wikimedia", "Wikimedia"))
            for special in site:
                if "closed" not in special and "private" not in special:
                    code = special["dbname"]
                    name = special["code"].capitalize()
                    languages.add((code, name))
            continue  # Special-site entries are lists with no "site" key below
        this = set()
        for web in site["site"]:
            if "closed" in web:
                continue
            project = "wikipedia" if web["code"] == "wiki" else web["code"]
            this.add((project, project.capitalize()))
        if this:
            code = site["code"].encode("utf8")
            name = site["name"].encode("utf8")
            languages.add((code, "{0} ({1})".format(code, name)))
            projects |= this
    save_site_updates(cursor, languages, projects)

def save_site_updates(cursor, languages, projects):
    query1 = "SELECT lang_code, lang_name FROM languages"
    query2 = "DELETE FROM languages WHERE lang_code = ? AND lang_name = ?"
    query3 = "INSERT INTO languages VALUES (?, ?)"
    query4 = "SELECT project_code, project_name FROM projects"
    query5 = "DELETE FROM projects WHERE project_code = ? AND project_name = ?"
    query6 = "INSERT INTO projects VALUES (?, ?)"
    query7 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
    synchronize_sites_with_db(cursor, languages, query1, query2, query3)
    synchronize_sites_with_db(cursor, projects, query4, query5, query6)
    cursor.execute(query7, (time(), "sites"))

def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
    removals = []
    cursor.execute(q_list)  # DB-API execute() doesn't reliably return an iterable, so fetch explicitly
    for site in cursor.fetchall():
        if site in updates:
            updates.remove(site)
        else:
            removals.append(site)
    cursor.executemany(q_rmv, removals)
    cursor.executemany(q_update, updates)

def load_sites_from_db(cursor, query, selected_code):
    tl_normal = '<option value="{0}">{1}</option>'
    tl_selected = '<option value="{0}" selected="selected">{1}</option>'
    cursor.execute(query)
    results = []
    for code, name in cursor.fetchall():
        template = tl_selected if code == selected_code else tl_normal
        results.append(template.format(code, name))
    return results

def highlight_delta(chain, delta):
    processed = []
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        words = paragraph.split(" ")
        for word, i in zip(words, count(i)):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                next = chain.END
            sword = strip_word(word)
            block = (prev_prev, prev)  # Block for before; must be a hashable tuple
            alock = (prev, sword)  # Block for after
            before = block in delta.chain and sword in delta.chain[block]
            after = alock in delta.chain and next in delta.chain[alock]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev_prev = prev
            prev = sword
        processed.append(u" ".join(processed_words))
        i += 1
    return u"<br /><br />".join(processed)
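
# The delta chain is assumed to map word-pair tuples to the words that may
# follow them (a dict-of-dicts, per the membership tests above). So in the
# text "one two three", the word "three" counts as highlighted when the key
# ("one", "two") exists in delta.chain and "three" appears under it.
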
def highlight_word(word, before, after, is_first, is_last):
    if before and after:
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or
        # the last word (force block to end):
        res = word
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
            res += u'</span>'
    elif before:
        # Word is the last in a highlighted block, so fade it out and then
        # end the block; force open a block before the word if this is the
        # first word:
        res = fade_word(word, u"out") + u"</span>"
        if is_first:
            res = u'<span class="cv-hl">' + res
    elif after:
        # Word is the first in a highlighted block, so start the block and
        # then fade it in; force close the block after the word if this is
        # the last word:
        res = u'<span class="cv-hl">' + fade_word(word, u"in")
        if is_last:
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = word
    return res

def fade_word(word, dir):
    if len(word) <= 4:
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
    return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
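
# e.g. fade_word(u"highlighted", u"in") fades in the first four letters:
#     u'<span class="cv-hl-in">high</span>lighted'
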
def strip_word(word):
    return sub(r"[^\w\s-]", "", word.lower(), flags=UNICODE)

def urlstrip(url):
    if url.startswith("http://"):
        url = url[7:]
    if url.startswith("https://"):
        url = url[8:]
    if url.startswith("www."):
        url = url[4:]
    if url.endswith("/"):
        url = url[:-1]
    return url
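
# e.g. urlstrip("http://www.example.com/") -> "example.com"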
%>\
<%
bot = bot.Bot(".earwigbot")
query = parse_qs(environ["QUERY_STRING"])
try:
    lang = query["lang"][0]
    project = query["project"][0]
    title = query["title"][0]
    url = query.get("url", [None])[0]
except (KeyError, IndexError):
    page = url = None  # also clear url so the form below renders cleanly
else:
    page, result = get_results(bot, lang, project, title, url, query)
%>\
<%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>
<h1>Copyvio Detector</h1>
<p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. Simply give the title of the page you want to check and hit Submit. The tool will then search for its content elsewhere on the web and display a report if a similar webpage is found. If you also provide a URL, it will not query any search engines and instead display a report comparing the article to that particular webpage, like the <a href="//toolserver.org/~dcoetzee/duplicationdetector/">Duplication Detector</a>. Check out the <a href="//en.wikipedia.org/wiki/User:EarwigBot/Copyvios/FAQ">FAQ</a> for more information and technical details.</p>
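<%doc>
Illustrative requests this template handles (the parameter names follow the
parse_qs handling above; the titles and URL are hypothetical):
    ?lang=en&project=wikipedia&title=Example_article
    ?lang=en&project=wikipedia&title=Example_article&url=http://example.com/page
    ?lang=en&project=wikipedia&title=Example_article&nocache=1
</%doc>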
<form action="${environ['PATH_INFO']}" method="get">
  <table>
    <tr>
      <td>Site:</td>
      <td>
        ${get_sites(bot)}
      </td>
    </tr>
    <tr>
      <td>Page title:</td>
      % if page:
        <td><input type="text" name="title" size="60" value="${page.title() | h}" /></td>
      % else:
        <td><input type="text" name="title" size="60" /></td>
      % endif
    </tr>
    <tr>
      <td>URL (optional):</td>
      % if url:
        <td><input type="text" name="url" size="120" value="${url | h}" /></td>
      % else:
        <td><input type="text" name="url" size="120" /></td>
      % endif
    </tr>
    % if query.get("nocache") or page:
      <tr>
        <td>Bypass cache:</td>
        % if query.get("nocache"):
          <td><input type="checkbox" name="nocache" value="1" checked="checked" /></td>
        % else:
          <td><input type="checkbox" name="nocache" value="1" /></td>
        % endif
      </tr>
    % endif
    <tr>
      <td><button type="submit">Submit</button></td>
    </tr>
  </table>
</form>
% if page:
  <div class="divider"></div>
  <div id="cv-result-${'yes' if result.violation else 'no'}">
    % if result.violation:
      <h2 id="cv-result-header"><a href="${page.url()}">${page.title() | h}</a> is a suspected violation of <a href="${result.url | h}">${result.url | urlstrip}</a>.</h2>
    % else:
      <h2 id="cv-result-header">No violations detected in <a href="${page.url()}">${page.title() | h}</a>.</h2>
    % endif
    <ul id="cv-result-list">
      <li><b><tt>${round(result.confidence * 100, 1)}%</tt></b> confidence of a violation.</li>
      % if result.cached:
        <li>Results are <a id="cv-cached" href="#">cached
          <span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span>
        </a> from ${result.cache_time} (${result.cache_age} ago). <a href="${environ['REQUEST_URI'] | h}&amp;nocache=1">Bypass the cache.</a></li>
      % else:
        <li>Results generated in <tt>${round(result.tdiff, 3)}</tt> seconds using <tt>${result.queries}</tt> queries.</li>
      % endif
      <li><a id="cv-result-detail-link" href="#cv-result-detail" onclick="copyvio_toggle_details()">Show details:</a></li>
    </ul>
    <div id="cv-result-detail" style="display: none;">
      <ul id="cv-result-detail-list">
        <li>Trigrams: <i>Article:</i> <tt>${result.article_chain.size()}</tt> / <i>Source:</i> <tt>${result.source_chain.size()}</tt> / <i>Delta:</i> <tt>${result.delta_chain.size()}</tt></li>
        % if result.cached:
          % if result.queries:
            <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s using <tt>${result.queries}</tt> queries; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
          % else:
            <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
          % endif
        % endif
        % if result.queries:
          <li><i>Fun fact:</i> The Wikimedia Foundation paid Yahoo! Inc. <a href="http://info.yahoo.com/legal/us/yahoo/search/bosspricing/details.html">$${result.queries * 0.0008} USD</a> for these results.</li>
        % endif
      </ul>
      <table id="cv-chain-table">
        <tr>
          <td>Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
          <td>Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
        </tr>
      </table>
    </div>
  </div>
% endif
<%include file="/support/footer.mako" args="environ=environ"/>