A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

257 lines
12 KiB

  1. <%!
  2. from collections import defaultdict
  3. from datetime import datetime
  4. from hashlib import sha256
  5. from itertools import count
  6. from os.path import expanduser
  7. from re import sub, UNICODE
  8. from sys import path
  9. from time import time
  10. from urlparse import parse_qs
  11. import oursql
  12. path.insert(0, "../earwigbot")
  13. import earwigbot
  14. def get_results(lang, project, title, query):
  15. earwigbot.config.config.load("config.ts-earwigbot.json")
  16. try:
  17. site = earwigbot.wiki.get_site(lang=lang, project=project)
  18. except earwigbot.wiki.SiteNotFoundError:
  19. return None, None
  20. page = site.get_page(title)
  21. conn = open_sql_connection()
  22. if not query.get("nocache"):
  23. result = get_cached_results(page, conn)
  24. if query.get("nocache") or not result:
  25. result = get_fresh_results(page, conn)
  26. return page, result
  27. def open_sql_connection():
  28. conn_args = earwigbot.config.config.wiki["_toolserverSQLCache"]
  29. conn_args["read_default_file"] = expanduser("~/.my.cnf")
  30. return oursql.connect(**conn_args)
  31. def get_cached_results(page, conn):
  32. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 1 DAY)"
  33. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  34. pageid = page.pageid()
  35. hash = sha256(page.get()).hexdigest()
  36. t_start = time()
  37. with conn.cursor() as cursor:
  38. cursor.execute(query1)
  39. cursor.execute(query2, (pageid, hash))
  40. results = cursor.fetchall()
  41. if not results:
  42. return None
  43. url, cache_time, num_queries, original_tdiff = results[0]
  44. result = page.copyvio_compare(url, min_confidence=0.5)
  45. result.cached = True
  46. result.queries = num_queries
  47. result.tdiff = time() - t_start
  48. result.original_tdiff = original_tdiff
  49. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  50. result.cache_age = format_date(cache_time)
  51. return result
  52. def format_date(cache_time):
  53. diff = datetime.utcnow() - cache_time
  54. if diff.seconds > 3600:
  55. return "{0} hours".format(diff.seconds / 3600)
  56. if diff.seconds > 60:
  57. return "{0} minutes".format(diff.seconds / 60)
  58. return "{0} seconds".format(diff.seconds)
  59. def get_fresh_results(page, conn):
  60. t_start = time()
  61. result = page.copyvio_check(min_confidence=0.5, max_queries=10)
  62. result.cached = False
  63. result.tdiff = time() - t_start
  64. cache_result(page, result, conn)
  65. return result
  66. def cache_result(page, result, conn):
  67. pageid = page.pageid()
  68. hash = sha256(page.get()).hexdigest()
  69. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  70. query2 = "DELETE FROM cache WHERE cache_id = ?"
  71. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  72. with conn.cursor() as cursor:
  73. cursor.execute(query1, (pageid,))
  74. if cursor.fetchall():
  75. cursor.execute(query2, (pageid,))
  76. cursor.execute(query3, (pageid, hash, result.url, result.queries,
  77. result.tdiff))
def highlight_delta(chain, delta):
    """Render ``chain.text`` as HTML, highlighting word pairs found in ``delta``.

    A word is highlighted when the (previous word, word) pair and/or the
    (word, next word) pair is present in ``delta.chain``. Words are compared
    in their ``strip_word`` form. Paragraphs (split on newlines) are joined
    with ``<br /><br />``.

    NOTE(review): words are inserted into the HTML exactly as they appear in
    ``chain.text``; no escaping happens in this function. Confirm the text
    is safe or escaped elsewhere.
    """
    processed = []
    prev = chain.START
    # Running index of the current word within all_words; it carries over
    # from one paragraph to the next.
    i = 0
    all_words = chain.text.split()
    paragraphs = chain.text.split("\n")
    for paragraph in paragraphs:
        processed_words = []
        # NOTE(review): this splits on single spaces while all_words splits
        # on any whitespace run. Blank lines or repeated spaces produce
        # empty "words" here that all_words lacks, shifting the indices.
        # Presumably chain.text is normalized beforehand; confirm.
        words = paragraph.split(" ")
        # count(i) continues numbering from where the last paragraph ended.
        for word, i in zip(words, count(i)):
            try:
                next = strip_word(all_words[i+1])
            except IndexError:
                # No following word: use the chain's end marker.
                next = chain.END
            sword = strip_word(word)
            # delta.chain is used as a mapping from a word to the collection
            # of words that follow it.
            before = prev in delta.chain and sword in delta.chain[prev]
            after = sword in delta.chain and next in delta.chain[sword]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = highlight_word(word, before, after, is_first, is_last)
            processed_words.append(res)
            prev = sword
        processed.append(u" ".join(processed_words))
        # After the loop, i is the index of the paragraph's last word; step
        # past it so the next paragraph starts on the following index.
        i += 1
    return u"<br /><br />".join(processed)
  103. def highlight_word(word, before, after, is_first, is_last):
  104. if before and after:
  105. # Word is in the middle of a highlighted block, so don't change
  106. # anything unless this is the first word (force block to start) or
  107. # the last word (force block to end):
  108. res = word
  109. if is_first:
  110. res = u'<span class="cv-hl">' + res
  111. if is_last:
  112. res += u'</span>'
  113. elif before:
  114. # Word is the last in a highlighted block, so fade it out and then
  115. # end the block; force open a block before the word if this is the
  116. # first word:
  117. res = fade_word(word, u"out") + u"</span>"
  118. if is_first:
  119. res = u'<span class="cv-hl">' + res
  120. elif after:
  121. # Word is the first in a highlighted block, so start the block and
  122. # then fade it in; force close the block after the word if this is
  123. # the last word:
  124. res = u'<span class="cv-hl">' + fade_word(word, u"in")
  125. if is_last:
  126. res += u"</span>"
  127. else:
  128. # Word is completely outside of a highlighted block, so do nothing:
  129. res = word
  130. return res
  131. def fade_word(word, dir):
  132. if len(word) <= 4:
  133. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  134. if dir == u"out":
  135. return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
  136. return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
  137. def strip_word(word):
  138. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
  139. def urlstrip(url):
  140. if url.startswith("http://"):
  141. url = url[7:]
  142. if url.startswith("https://"):
  143. url = url[8:]
  144. if url.startswith("www."):
  145. url = url[4:]
  146. if url.endswith("/"):
  147. url = url[:-1]
  148. return url
  149. %>\
  150. <%
# Parse the request's query string; parse_qs maps each parameter name to a
# list of its values.
query = parse_qs(environ["QUERY_STRING"])
try:
    # All three parameters are required to run a check; take the first
    # value of each.
    lang = query["lang"][0]
    project = query["project"][0]
    title = query["title"][0]
except (KeyError, IndexError):
    # A parameter is missing: render the form only. `result` is not
    # assigned here, so the template must only read it when `page` is set.
    page = None
else:
    # get_results returns (None, None) when the site cannot be found, which
    # also leaves `page` falsy.
    page, result = get_results(lang, project, title, query)
  160. %>\
  161. <%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>
  162. <h1>Copyvio Detector</h1>
  163. <p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in Wikipedia articles.</p>
  164. <form action="${environ['PATH_INFO']}" method="get">
  165. <table>
  166. <tr>
  167. <td>Site:</td>
  168. <td>
  169. <select name="lang">
  170. <option value="en" selected="selected">en (English)</option>
  171. </select>
  172. <select name="project">
  173. <option value="wikipedia" selected="selected">Wikipedia</option>
  174. </select>
  175. </td>
  176. </tr>
  177. <tr>
  178. <td>Page title:</td>
  179. % if page:
  180. <td><input type="text" name="title" size="50" value="${page.title() | h}" /></td>
  181. % else:
  182. <td><input type="text" name="title" size="50" /></td>
  183. % endif
  184. </tr>
  185. % if query.get("nocache") or page:
  186. <tr>
  187. <td>Bypass cache:</td>
  188. % if query.get("nocache"):
  189. <td><input type="checkbox" name="nocache" value="1" checked="checked" /></td>
  190. % else:
  191. <td><input type="checkbox" name="nocache" value="1" /></td>
  192. % endif
  193. </tr>
  194. % endif
  195. <tr>
  196. <td><button type="submit">Submit</button></td>
  197. </tr>
  198. </table>
  199. </form>
  200. % if page:
  201. <div class="divider"></div>
  202. <div id="cv-result-${'yes' if result.violation else 'no'}">
  203. % if result.violation:
  204. <h2 id="cv-result-header"><a href="${page.url()}">${page.title() | h}</a> is a suspected violation of <a href="${result.url | h}">${result.url | urlstrip}</a>.</h2>
  205. % else:
  206. <h2 id="cv-result-header">No violations detected in <a href="${page.url()}">${page.title() | h}</a>.</h2>
  207. % endif
  208. <ul id="cv-result-list">
  209. <li><b><tt>${round(result.confidence * 100, 1)}%</tt></b> confidence of a violation.</li>
  210. % if result.cached:
  211. <li>Results are <a id="cv-cached" href="#">cached
  212. <span>To save time (and money), this tool will retain the results of checks for up to 24 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made.</span>
  213. </a> from ${result.cache_time} (${result.cache_age} ago). <a href="${environ['REQUEST_URI'] | h}&amp;nocache=1">Bypass the cache.</a></li>
  214. % else:
  215. <li>Results generated in <tt>${round(result.tdiff, 3)}</tt> seconds using <tt>${result.queries}</tt> queries.</li>
  216. % endif
  217. <li><a id="cv-result-detail-link" href="#cv-result-detail" onclick="copyvio_toggle_details()">Show details:</a></li>
  218. </ul>
  219. <div id="cv-result-detail" style="display: none;">
  220. <ul id="cv-result-detail-list">
  221. <li>Markov chain size: Article: <tt>${result.article_chain.size()}</tt> / Source: <tt>${result.source_chain.size()}</tt> / Delta: <tt>${result.delta_chain.size()}</tt></li>
  222. % if result.cached:
  223. % if result.queries:
  224. <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s using <tt>${result.queries}</tt> queries; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
  225. % else:
  226. <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
  227. % endif
  228. % endif
  229. <li><i>Fun fact:</i> The Wikimedia Foundation paid Yahoo! Inc. <a href="http://info.yahoo.com/legal/us/yahoo/search/bosspricing/details.html">$${result.queries * 0.0008} USD</a> for these results.</li>
  230. </ul>
  231. <table id="cv-chain-table">
  232. <tr>
  233. <td>Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
  234. <td>Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
  235. </tr>
  236. </table>
  237. </div>
  238. </div>
  239. % endif
  240. <%include file="/support/footer.mako" args="environ=environ"/>