A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 
 

437 lignes
22 KiB

  1. <%!
  2. from datetime import datetime
  3. from hashlib import sha256
  4. from itertools import count
  5. from os.path import expanduser
  6. from re import sub, UNICODE
  7. from sys import path
  8. from time import time
  9. from urlparse import parse_qs, urlparse
  10. from earwigbot import exceptions
  11. from earwigbot.bot import Bot
  12. import oursql
  13. def get_results(bot, lang, project, name, all_projects, title, url, query):
  14. site = get_site(bot, lang, project, name, all_projects)
  15. if not site:
  16. return None, None, None
  17. page = site.get_page(title)
  18. try:
  19. page.get() # Make sure that the page exists before we check it!
  20. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  21. return site, page, None
  22. # if url:
  23. # result = get_url_specific_results(page, url)
  24. # else:
  25. # conn = open_sql_connection(bot, "copyvioCache")
  26. # if not query.get("nocache"):
  27. # result = get_cached_results(page, conn)
  28. # if query.get("nocache") or not result:
  29. # result = get_fresh_results(page, conn)
  30. tstart = time()
  31. mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
  32. mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
  33. mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
  34. result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
  35. True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
  36. result.cached = False
  37. result.tdiff = time() - tstart
  38. # END TEST BLOCK
  39. return site, page, result
  40. def get_site(bot, lang, project, name, all_projects):
  41. if project not in [proj[0] for proj in all_projects]:
  42. return None
  43. if project == "wikimedia" and name: # Special sites:
  44. try:
  45. return bot.wiki.get_site(name=name)
  46. except exceptions.SiteNotFoundError:
  47. try:
  48. return bot.wiki.add_site(lang=lang, project=project)
  49. except (exceptions.APIError, exceptions.LoginError):
  50. return None
  51. try:
  52. return bot.wiki.get_site(lang=lang, project=project)
  53. except exceptions.SiteNotFoundError:
  54. try:
  55. return bot.wiki.add_site(lang=lang, project=project)
  56. except (exceptions.APIError, exceptions.LoginError):
  57. return None
  58. def get_url_specific_results(page, url):
  59. t_start = time()
  60. result = page.copyvio_compare(url)
  61. result.cached = False
  62. result.tdiff = time() - t_start
  63. return result
  64. def open_sql_connection(bot, dbname):
  65. conn_args = bot.config.wiki["_toolserverSQL"][dbname]
  66. if "read_default_file" not in conn_args and "user" not in conn_args and "passwd" not in conn_args:
  67. conn_args["read_default_file"] = expanduser("~/.my.cnf")
  68. if "autoping" not in conn_args:
  69. conn_args["autoping"] = True
  70. if "autoreconnect" not in conn_args:
  71. conn_args["autoreconnect"] = True
  72. return oursql.connect(**conn_args)
  73. def get_cached_results(page, conn):
  74. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
  75. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  76. pageid = page.pageid()
  77. hash = sha256(page.get()).hexdigest()
  78. t_start = time()
  79. with conn.cursor() as cursor:
  80. cursor.execute(query1)
  81. cursor.execute(query2, (pageid, hash))
  82. results = cursor.fetchall()
  83. if not results:
  84. return None
  85. url, cache_time, num_queries, original_tdiff = results[0]
  86. result = page.copyvio_compare(url)
  87. result.cached = True
  88. result.queries = num_queries
  89. result.tdiff = time() - t_start
  90. result.original_tdiff = original_tdiff
  91. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  92. result.cache_age = format_date(cache_time)
  93. return result
  94. def format_date(cache_time):
  95. diff = datetime.utcnow() - cache_time
  96. if diff.seconds > 3600:
  97. return "{0} hours".format(diff.seconds / 3600)
  98. if diff.seconds > 60:
  99. return "{0} minutes".format(diff.seconds / 60)
  100. return "{0} seconds".format(diff.seconds)
  101. def get_fresh_results(page, conn):
  102. t_start = time()
  103. result = page.copyvio_check(max_queries=10)
  104. result.cached = False
  105. result.tdiff = time() - t_start
  106. cache_result(page, result, conn)
  107. return result
  108. def cache_result(page, result, conn):
  109. pageid = page.pageid()
  110. hash = sha256(page.get()).hexdigest()
  111. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  112. query2 = "DELETE FROM cache WHERE cache_id = ?"
  113. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  114. with conn.cursor() as cursor:
  115. cursor.execute(query1, (pageid,))
  116. if cursor.fetchall():
  117. cursor.execute(query2, (pageid,))
  118. cursor.execute(query3, (pageid, hash, result.url, result.queries,
  119. result.tdiff))
  120. def get_sites(bot):
  121. max_staleness = 60 * 60 * 24 * 7
  122. conn = open_sql_connection(bot, "globals")
  123. query1 = "SELECT update_time FROM updates WHERE update_service = ?"
  124. query2 = "SELECT lang_code, lang_name FROM language"
  125. query3 = "SELECT project_code, project_name FROM project"
  126. with conn.cursor() as cursor:
  127. cursor.execute(query1, ("sites",))
  128. try:
  129. time_since_update = int(time() - cursor.fetchall()[0][0])
  130. except IndexError:
  131. time_since_update = time()
  132. if time_since_update > max_staleness:
  133. update_sites(bot.wiki.get_site(), cursor)
  134. cursor.execute(query2)
  135. langs = []
  136. for code, name in cursor.fetchall():
  137. if "\U" in name:
  138. name = name.decode("unicode_escape")
  139. langs.append((code, name))
  140. cursor.execute(query3)
  141. projects = cursor.fetchall()
  142. return langs, projects
  143. def update_sites(site, cursor):
  144. matrix = site.api_query(action="sitematrix")["sitematrix"]
  145. del matrix["count"]
  146. languages, projects = set(), set()
  147. for site in matrix.itervalues():
  148. if isinstance(site, list): # Special sites
  149. bad_sites = ["closed", "private", "fishbowl"]
  150. for special in site:
  151. if all([key not in special for key in bad_sites]):
  152. full = urlparse(special["url"]).netloc
  153. if full.count(".") == 1: # No subdomain, so use "www"
  154. lang, project = "www", full.split(".")[0]
  155. else:
  156. lang, project = full.rsplit(".", 2)[:2]
  157. code = u"{0}::{1}".format(lang, special["dbname"])
  158. name = special["code"].capitalize()
  159. languages.add((code, u"{0} ({1})".format(lang, name)))
  160. projects.add((project, project.capitalize()))
  161. continue
  162. this = set()
  163. for web in site["site"]:
  164. if "closed" in web:
  165. continue
  166. project = "wikipedia" if web["code"] == u"wiki" else web["code"]
  167. this.add((project, project.capitalize()))
  168. if this:
  169. code = site["code"]
  170. if "\U" in site["name"].encode("unicode_escape"):
  171. name = site["name"].encode("unicode_escape")
  172. else:
  173. name = site["name"]
  174. languages.add((code, u"{0} ({1})".format(code, name)))
  175. projects |= this
  176. save_site_updates(cursor, languages, projects)
  177. def save_site_updates(cursor, languages, projects):
  178. query1 = "SELECT lang_code, lang_name FROM language"
  179. query2 = "DELETE FROM language WHERE lang_code = ? AND lang_name = ?"
  180. query3 = "INSERT INTO language VALUES (?, ?)"
  181. query4 = "SELECT project_code, project_name FROM project"
  182. query5 = "DELETE FROM project WHERE project_code = ? AND project_name = ?"
  183. query6 = "INSERT INTO project VALUES (?, ?)"
  184. query7 = "SELECT 1 FROM updates WHERE update_service = ?"
  185. query8 = "UPDATE updates SET update_time = ? WHERE update_service = ?"
  186. query9 = "INSERT INTO updates VALUES (?, ?)"
  187. synchronize_sites_with_db(cursor, languages, query1, query2, query3)
  188. synchronize_sites_with_db(cursor, projects, query4, query5, query6)
  189. cursor.execute(query7, ("sites",))
  190. if cursor.fetchall():
  191. cursor.execute(query8, (time(), "sites"))
  192. else:
  193. cursor.execute(query9, ("sites", time()))
  194. def synchronize_sites_with_db(cursor, updates, q_list, q_rmv, q_update):
  195. removals = []
  196. cursor.execute(q_list)
  197. for site in cursor:
  198. updates.remove(site) if site in updates else removals.append(site)
  199. cursor.executemany(q_rmv, removals)
  200. cursor.executemany(q_update, updates)
  201. def highlight_delta(chain, delta):
  202. processed = []
  203. prev_prev = prev = chain.START
  204. i = 0
  205. all_words = chain.text.split()
  206. paragraphs = chain.text.split("\n")
  207. for paragraph in paragraphs:
  208. processed_words = []
  209. words = paragraph.split(" ")
  210. for word, i in zip(words, count(i)):
  211. try:
  212. next = strip_word(all_words[i+1])
  213. except IndexError:
  214. next = chain.END
  215. sword = strip_word(word)
  216. block = (prev_prev, prev) # Block for before
  217. alock = (prev, sword) # Block for after
  218. before = [block in delta.chain and sword in delta.chain[block]]
  219. after = [alock in delta.chain and next in delta.chain[alock]]
  220. is_first = i == 0
  221. is_last = i + 1 == len(all_words)
  222. res = highlight_word(word, before, after, is_first, is_last)
  223. processed_words.append(res)
  224. prev_prev = prev
  225. prev = sword
  226. processed.append(u" ".join(processed_words))
  227. i += 1
  228. return u"<br /><br />".join(processed)
  229. def highlight_word(word, before, after, is_first, is_last):
  230. if before and after:
  231. # Word is in the middle of a highlighted block, so don't change
  232. # anything unless this is the first word (force block to start) or
  233. # the last word (force block to end):
  234. res = word
  235. if is_first:
  236. res = u'<span class="cv-hl">' + res
  237. if is_last:
  238. res += u'</span>'
  239. elif before:
  240. # Word is the last in a highlighted block, so fade it out and then
  241. # end the block; force open a block before the word if this is the
  242. # first word:
  243. res = fade_word(word, u"out") + u"</span>"
  244. if is_first:
  245. res = u'<span class="cv-hl">' + res
  246. elif after:
  247. # Word is the first in a highlighted block, so start the block and
  248. # then fade it in; force close the block after the word if this is
  249. # the last word:
  250. res = u'<span class="cv-hl">' + fade_word(word, u"in")
  251. if is_last:
  252. res += u"</span>"
  253. else:
  254. # Word is completely outside of a highlighted block, so do nothing:
  255. res = word
  256. return res
  257. def fade_word(word, dir):
  258. if len(word) <= 4:
  259. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  260. if dir == u"out":
  261. return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
  262. return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
  263. def strip_word(word):
  264. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)
  265. def urlstrip(url):
  266. if url.startswith("http://"):
  267. url = url[7:]
  268. if url.startswith("https://"):
  269. url = url[8:]
  270. if url.startswith("www."):
  271. url = url[4:]
  272. if url.endswith("/"):
  273. url = url[:-1]
  274. return url
  275. %>\
  276. <%
  277. lang = orig_lang = project = name = title = url = None
  278. query = parse_qs(environ["QUERY_STRING"])
  279. if "lang" in query:
  280. lang = orig_lang = query["lang"][0].decode("utf8").lower()
  281. if "::" in lang:
  282. lang, name = lang.split("::", 1)
  283. if "project" in query:
  284. project = query["project"][0].decode("utf8").lower()
  285. if "title" in query:
  286. title = query["title"][0].decode("utf8")
  287. if "url" in query:
  288. url = query["url"][0].decode("utf8")
  289. bot = Bot(".earwigbot")
  290. all_langs, all_projects = get_sites(bot)
  291. if lang and project and title:
  292. site, page, result = get_results(bot, lang, project, name,
  293. all_projects, title, url, query)
  294. else:
  295. site = page = result = None
  296. %>\
  297. <%include file="/support/header.mako" args="environ=environ, title='Copyvio Detector', add_css=('copyvios.css',), add_js=('copyvios.js',)"/>
  298. <h1>Copyvio Detector</h1>
  299. <p>This tool attempts to detect <a href="//en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in articles. Simply give the title of the page you want to check and hit Submit. The tool will then search for its content elsewhere on the web and display a report if a similar webpage is found. If you also provide a URL, it will not query any search engines and instead display a report comparing the article to that particular webpage, like the <a href="//toolserver.org/~dcoetzee/duplicationdetector/">Duplication Detector</a>. Check out the <a href="//en.wikipedia.org/wiki/User:EarwigBot/Copyvios/FAQ">FAQ</a> for more information and technical details.</p>
  300. <form action="${environ['PATH_INFO']}" method="get">
  301. <table>
  302. <tr>
  303. <td>Site:</td>
  304. <td>
  305. <tt>http://</tt>
  306. <select name="lang">
  307. <% selected_lang = orig_lang if orig_lang else bot.wiki.get_site().lang %>
  308. % for code, name in all_langs:
  309. % if code == selected_lang:
  310. <option value="${code}" selected="selected">${name}</option>
  311. % else:
  312. <option value="${code}">${name}</option>
  313. % endif
  314. % endfor
  315. </select>
  316. <tt>.</tt>
  317. <select name="project">
  318. <% selected_project = project if project else bot.wiki.get_site().project %>
  319. % for code, name in all_projects:
  320. % if code == selected_project:
  321. <option value="${code}" selected="selected">${name}</option>
  322. % else:
  323. <option value="${code}">${name}</option>
  324. % endif
  325. % endfor
  326. </select>
  327. <tt>.org</tt>
  328. </td>
  329. </tr>
  330. <tr>
  331. <td>Page title:</td>
  332. % if page:
  333. <td><input type="text" name="title" size="60" value="${page.title | h}" /></td>
  334. % elif title:
  335. <td><input type="text" name="title" size="60" value="${title | h}" /></td>
  336. % else:
  337. <td><input type="text" name="title" size="60" /></td>
  338. % endif
  339. </tr>
  340. <tr>
  341. <td>URL (optional):</td>
  342. % if url:
  343. <td><input type="text" name="url" size="120" value="${url | h}" /></td>
  344. % else:
  345. <td><input type="text" name="url" size="120" /></td>
  346. % endif
  347. </tr>
  348. % if query.get("nocache") or (result and result.cached):
  349. <tr>
  350. <td>Bypass cache:</td>
  351. % if query.get("nocache"):
  352. <td><input type="checkbox" name="nocache" value="1" checked="checked" /></td>
  353. % else:
  354. <td><input type="checkbox" name="nocache" value="1" /></td>
  355. % endif
  356. </tr>
  357. % endif
  358. <tr>
  359. <td><button type="submit">Submit</button></td>
  360. </tr>
  361. </table>
  362. </form>
  363. % if project and lang and title and not page:
  364. <div class="divider"></div>
  365. <div id="cv-result-yes">
  366. <p>The given site (project=<b><tt>${project}</tt></b>, language=<b><tt>${lang}</tt></b>) doesn't seem to exist. It may also be closed or private. <a href="//${lang}.${project}.org/">Confirm its URL.</a></p>
  367. </div>
  368. % elif project and lang and title and page and not result:
  369. <div class="divider"></div>
  370. <div id="cv-result-yes">
  371. <p>The given page doesn't seem to exist: <a href="${page.url}">${page.title | h}</a>.</p>
  372. </div>
  373. % elif page:
  374. <div class="divider"></div>
  375. <div id="cv-result-${'yes' if result.violation else 'no'}">
  376. % if result.violation:
  377. <h2 id="cv-result-header"><a href="${page.url}">${page.title | h}</a> is a suspected violation of <a href="${result.url | h}">${result.url | urlstrip}</a>.</h2>
  378. % else:
  379. <h2 id="cv-result-header">No violations detected in <a href="${page.url()}">${page.title | h}</a>.</h2>
  380. % endif
  381. <ul id="cv-result-list">
  382. <li><b><tt>${round(result.confidence * 100, 1)}%</tt></b> confidence of a violation.</li>
  383. % if result.cached:
  384. <li>Results are <a id="cv-cached" href="#">cached
  385. <span>To save time (and money), this tool will retain the results of checks for up to 72 hours. This includes the URL of the "violated" source, but neither its content nor the content of the article. Future checks on the same page (assuming it remains unchanged) will not involve additional search queries, but a fresh comparison against the source URL will be made. If the page is modified, a new check will be run.</span>
  386. </a> from ${result.cache_time} (${result.cache_age} ago). <a href="${environ['REQUEST_URI'].decode("utf8") | h}&amp;nocache=1">Bypass the cache.</a></li>
  387. % else:
  388. <li>Results generated in <tt>${round(result.tdiff, 3)}</tt> seconds using <tt>${result.queries}</tt> queries.</li>
  389. % endif
  390. <li><a id="cv-result-detail-link" href="#cv-result-detail" onclick="copyvio_toggle_details()">Show details:</a></li>
  391. </ul>
  392. <div id="cv-result-detail" style="display: none;">
  393. <ul id="cv-result-detail-list">
  394. <li>Trigrams: <i>Article:</i> <tt>${result.article_chain.size()}</tt> / <i>Source:</i> <tt>${result.source_chain.size()}</tt> / <i>Delta:</i> <tt>${result.delta_chain.size()}</tt></li>
  395. % if result.cached:
  396. % if result.queries:
  397. <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s using <tt>${result.queries}</tt> queries; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
  398. % else:
  399. <li>Retrieved from cache in <tt>${round(result.tdiff, 3)}</tt> seconds (originally generated in <tt>${round(result.original_tdiff, 3)}</tt>s; <tt>${round(result.original_tdiff - result.tdiff, 3)}</tt>s saved).</li>
  400. % endif
  401. % endif
  402. % if result.queries:
  403. <li><i>Fun fact:</i> The Wikimedia Foundation paid Yahoo! Inc. <a href="http://info.yahoo.com/legal/us/yahoo/search/bosspricing/details.html">$${result.queries * 0.0008} USD</a> for these results.</li>
  404. % endif
  405. </ul>
  406. <table id="cv-chain-table">
  407. <tr>
  408. <td>Article: <div class="cv-chain-detail"><p>${highlight_delta(result.article_chain, result.delta_chain)}</p></div></td>
  409. <td>Source: <div class="cv-chain-detail"><p>${highlight_delta(result.source_chain, result.delta_chain)}</p></div></td>
  410. </tr>
  411. </table>
  412. </div>
  413. </div>
  414. % endif
  415. <%include file="/support/footer.mako" args="environ=environ"/>