Browse Source

Copyvios NG: Initial proof-of-concept

copyvios-ng
Ben Kurtovic 10 months ago
parent
commit
bae024b2a4
9 changed files with 3193 additions and 0 deletions
  1. +2613
    -0
      Cargo.lock
  2. +22
    -0
      Cargo.toml
  3. +70
    -0
      src/background.rs
  4. +17
    -0
      src/error.rs
  5. +61
    -0
      src/main.rs
  6. +37
    -0
      src/site.rs
  7. +222
    -0
      static/style.css
  8. +37
    -0
      templates/base.html
  9. +114
    -0
      templates/index.html

+ 2613
- 0
Cargo.lock
File diff suppressed because it is too large
View File


+ 22
- 0
Cargo.toml View File

@@ -0,0 +1,22 @@
# Crate metadata. The `version` below is also read at compile time via
# `env!("CARGO_PKG_VERSION")` in src/site.rs to build the User-Agent string.
[package]
name = "copyvios"
version = "0.1.0"
authors = ["Ben Kurtovic <ben@benkurtovic.com>"]
license = "MIT"
edition = "2021"

# Third-party crates. Usage notes refer to the files under src/.
[dependencies]
# HTML templating (`#[derive(Template)]` in src/main.rs) and its axum integration.
askama = { version = "0.12", features = ["with-axum"] }
askama_axum = "0.3"
# HTTP server and routing (src/main.rs).
axum = "0.6"
# MediaWiki access: API error type, typed query responses, and the bot client.
mwapi = "0.6.0"
mwapi_responses = "0.4.2"
mwbot = "0.6.1"
# NOTE(review): src/background.rs imports `mwbot::parsoid`, not this crate directly — confirm this direct dependency is needed.
parsoid = "0.8.0"
# Random choice of the background image (src/background.rs).
rand = "0.8.5"
# Derive macro behind the crate's `Error` enum (src/error.rs).
thiserror = "1.0.56"
# Async runtime (`#[tokio::main]` in src/main.rs).
tokio = { version = "1.0", features = ["full"] }
# NOTE(review): no direct `tower::` reference appears in the sources of this commit — confirm it is needed.
tower = { version = "0.4", features = ["util"] }
# Static file serving (`ServeDir` in src/main.rs).
tower-http = { version = "0.4", features = ["fs", "trace"] }
# Logging macros and the subscriber that prints them (src/main.rs).
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

+ 70
- 0
src/background.rs View File

@@ -0,0 +1,70 @@
use crate::{site::Site, Error, Result};
use mwapi_responses::prelude::*;
use mwbot::{parsoid::WikinodeIterator, Bot, Page};
use rand::seq::SliceRandom;

/// A background image for the site's pages.
///
/// Both fields are rendered by the HTML templates: `image_url` is used as the
/// `<body>` background image and `source_url` as the footer's
/// "Background image" link.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Background {
    /// URL of the image file to display.
    pub image_url: String,
    /// URL of the page credited as the image's source.
    pub source_url: String,
}

/// Collects the file pages linked from the "User:The Earwig/POTD" page.
///
/// Fetches that page's HTML through `bot`, walks every link found in it, and
/// keeps the link targets for which `is_file()` is true. Links to anything
/// else are skipped.
///
/// # Errors
///
/// Propagates any error raised while creating a page handle or fetching the
/// page HTML.
async fn get_potd_images(bot: &Bot) -> Result<Vec<Page>> {
    let potd_list = bot.page("User:The Earwig/POTD")?;
    let document = potd_list.html().await?.into_mutable();

    let mut files = Vec::new();
    for wikilink in document.filter_links() {
        let candidate = bot.page(&wikilink.target())?;
        if candidate.is_file() {
            files.push(candidate);
        }
    }

    Ok(files)
}

// Typed response for a `prop=imageinfo` query that asks for the `url`, `size`
// and `canonicaltitle` image properties.
// NOTE(review): the struct body is empty here; presumably the `#[query]`
// attribute macro from mwapi_responses generates the fields — confirm against
// that crate's documentation.
#[query(prop = "imageinfo", iiprop = "url|size|canonicaltitle")]
pub(crate) struct InfoResponse {}

async fn get_background_from_page(bot: &Bot, image: &Page) -> Result<Background> {
let mut resp: InfoResponse =
mwapi_responses::query_api(&bot.api(), [("titles", image.title())]).await?;
let info = resp
.query
.pages
.pop()
.ok_or(Error::NoBackgroundError(format!(
"Background image not found: {}",
{ image.title() }
)));

tracing::info!("info: {:?}", info);

// data = site.api_query(
// action="query", prop="imageinfo", iiprop="url|size|canonicaltitle",
// titles="File:" + filename)
// res = data["query"]["pages"].values()[0]["imageinfo"][0]
// name = res["canonicaltitle"][len("File:"):].replace(" ", "_")
// return name, res["url"], res["descriptionurl"], res["width"], res["height"]

let image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/9/92/Crepuscular_rays_at_Sunset_near_Waterberg_Plateau.jpg/2560px-Crepuscular_rays_at_Sunset_near_Waterberg_Plateau.jpg";
let source_url = "https://commons.wikimedia.org/wiki/File:Crepuscular_rays_at_Sunset_near_Waterberg_Plateau.jpg";
Ok(Background {
image_url: String::from(image_url),
source_url: String::from(source_url),
})
}

pub async fn get_background() -> Result<Background> {
let site = Site::new("wikimedia", "commons");
let bot = site.bot().await.unwrap();

let images = get_potd_images(&bot).await?;
let image = images.choose(&mut rand::thread_rng());
let image =
image.ok_or_else(|| Error::BackgroundError(String::from("no POTD images found")))?;

tracing::info!("Background image: {:?}", image.title());
get_background_from_page(&bot, image).await
}

+ 17
- 0
src/error.rs View File

@@ -0,0 +1,17 @@
use thiserror::Error as ThisError;

/// Crate-wide result alias whose error type defaults to [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;

/// Errors produced by this crate.
///
/// The `#[from]` variants allow `?` to convert the wrapped error types
/// automatically.
#[non_exhaustive]
#[derive(ThisError, Debug)]
pub enum Error {
/// An error from the `mwapi` crate.
#[error("API error: {0}")]
ApiError(#[from] mwapi::Error),
/// An error from the `mwbot` crate.
#[error("Bot error: {0}")]
BotError(#[from] mwbot::Error),
/// A `mwbot` configuration error.
#[error("Config error: {0}")]
ConfigError(#[from] mwbot::ConfigError),
/// No usable background image could be determined; the string gives details.
#[error("Unable to find background image: {0}")]
BackgroundError(String),
// TODO: add more specific variants, e.g. NoBackgroundsError and BackgroundUnavailableError.
}

+ 61
- 0
src/main.rs View File

@@ -0,0 +1,61 @@
mod background;
mod error;
mod site;

pub use error::{Error, Result};

use crate::background::Background;
use askama::Template;
use axum::{routing::get, Router};
use std::net::SocketAddr;
use tower_http::services::ServeDir;
use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::EnvFilter;

/// Entry point: sets up logging, builds the router and serves it on
/// 127.0.0.1:8080.
///
/// # Panics
///
/// Panics if the log filter read from the environment is invalid, or if the
/// server terminates with an error.
#[tokio::main]
async fn main() {
    // Log at INFO level unless the environment says otherwise.
    let env_filter = EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .from_env()
        .expect("invalid RUST_LOG value");
    tracing_subscriber::fmt().with_env_filter(env_filter).init();

    // Page handlers plus the static asset directory.
    let static_files = ServeDir::new("static");
    let router = Router::new()
        .route("/", get(index))
        .route("/api", get(api))
        .route("/language", get(language))
        .route("/settings", get(settings))
        .nest_service("/static", static_files);

    let listen_on = SocketAddr::from(([127, 0, 0, 1], 8080));
    tracing::info!("listening on {}", listen_on);
    let server = axum::Server::bind(&listen_on).serve(router.into_make_service());
    server.await.unwrap();
}

/// Template context for `index.html`.
#[derive(Template)]
#[template(path = "index.html")]
struct IndexTemplate {
/// Background image; the template reads its `image_url` and `source_url`.
background: Background,
}

/// Handler for `/`: renders the index page with a freshly fetched background.
///
/// # Panics
///
/// Panics if the background image cannot be obtained.
async fn index() -> IndexTemplate {
    // TODO: handle gracefully
    let fetched = background::get_background().await;
    let background = fetched.expect("failed to get background image");
    IndexTemplate { background }
}

/// Placeholder handler for `/api`; currently responds with the literal text "TODO".
async fn api() -> &'static str {
"TODO"
}

/// Placeholder handler for `/language`; currently responds with the literal text "TODO".
async fn language() -> &'static str {
"TODO"
}

/// Placeholder handler for `/settings`; currently responds with the literal text "TODO".
async fn settings() -> &'static str {
"TODO"
}

+ 37
- 0
src/site.rs View File

@@ -0,0 +1,37 @@
use crate::Result;
use mwbot::Bot;

/// This crate's version, captured from `CARGO_PKG_VERSION` at compile time.
const PKG_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Returns the User-Agent string, `EarwigCVDetector/<crate version>`.
fn user_agent() -> String {
    let mut agent = String::from("EarwigCVDetector/");
    agent.push_str(PKG_VERSION);
    agent
}

/// A wiki identified by project and language; the two are combined into the
/// host name `<lang>.<project>.org` by `Site::domain`.
pub struct Site {
// Middle label of the host name (e.g. "wikimedia" in the caller in src/background.rs).
project: String,
// Leading label of the host name (e.g. "commons" in the caller in src/background.rs).
lang: String,
}

impl Site {
    /// Creates a site from its project and language names.
    ///
    /// Both arguments are converted with `to_string()` and stored as owned
    /// strings.
    pub fn new<S: ToString>(project: S, lang: S) -> Self {
        let project = project.to_string();
        let lang = lang.to_string();
        Self { project, lang }
    }

    /// Host name of the site, in the form `<lang>.<project>.org`.
    fn domain(&self) -> String {
        let Self { project, lang } = self;
        format!("{lang}.{project}.org")
    }

    /// Builds a [`Bot`] for this site.
    ///
    /// The bot is pointed at `https://<domain>/w/api.php` and
    /// `https://<domain>/api/rest_v1`, and uses the crate's User-Agent string.
    ///
    /// # Errors
    ///
    /// Returns the `mwbot::ConfigError` reported by the bot builder.
    pub async fn bot(&self) -> Result<Bot, mwbot::ConfigError> {
        let host = self.domain();
        let action_api = format!("https://{}/w/api.php", host);
        let rest_api = format!("https://{}/api/rest_v1", host);
        Bot::builder(action_api, rest_api)
            .set_user_agent(user_agent())
            .build()
            .await
    }
}

+ 222
- 0
static/style.css View File

@@ -0,0 +1,222 @@
/* Base page rules: full-height document with no outer margin. */
html, body {
height: 100%;
margin: 0;
}

/* Body is a vertical flex container. The background image itself is set
   inline by the templates; these rules only control how it is displayed. */
body {
line-height: 1.4;
display: flex;
flex-direction: column;
background-color: #eaecf0;
background-attachment: fixed;
background-size: cover;
font-family: -apple-system,'BlinkMacSystemFont','Segoe UI','Roboto','Lato','Helvetica','Arial',sans-serif;
font-size: calc(1em * 0.875);
color: #000;
}

/* White, bordered content card with a drop shadow. */
#container-inner {
margin: 1em 1em 4em;
max-width: 100em;
border: 1px solid #c8ccd1;
background-color: #fff;
padding: 1em;
filter: drop-shadow(0 0 10px rgba(0, 0, 0, 0.25));
}

/* Icon shown before the "Settings" header link: an inline SVG (titled
   "settings") embedded as a data URI and drawn as the pseudo-element's
   background. */
#a-settings::before {
background-image: linear-gradient(transparent,transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 xmlns:xlink=%22http://www.w3.org/1999/xlink%22 width=%2220%22 height=%2220%22 viewBox=%220 0 20 20%22%3E%3Ctitle%3Esettings%3C/title%3E%3Cg transform=%22translate%2810 10%29%22%3E%3Cpath id=%22a%22 d=%22M1.5-10h-3l-1 6.5h5m0 7h-5l1 6.5h3%22/%3E%3Cuse transform=%22rotate%2845%29%22 xlink:href=%22%23a%22/%3E%3Cuse transform=%22rotate%2890%29%22 xlink:href=%22%23a%22/%3E%3Cuse transform=%22rotate%28135%29%22 xlink:href=%22%23a%22/%3E%3C/g%3E%3Cpath d=%22M10 2.5a7.5 7.5 0 000 15 7.5 7.5 0 000-15v4a3.5 3.5 0 010 7 3.5 3.5 0 010-7%22/%3E%3C/svg%3E");
background-repeat: no-repeat;
padding-left: 1.67em;
content: ' ';
color: black;
font-size: 0.85em;
opacity: 0.67;
background-size: contain;
}

/* Header bar inside the content card. */
header {
padding: 0.2em 1em;
background-color: #eaecf0;
}

/* Lay the header's children out inline, vertically centred. */
header > * {
vertical-align: middle;
display: inline-block;
}

/* Icon shown before the "Language" header link; the SVG is loaded from
   commons.wikimedia.org. */
#a-language:before {
content: '';
background-image: url(https://commons.wikimedia.org/w/extensions/UniversalLanguageSelector/lib/jquery.uls/images/language.svg?80b0b);
background-repeat: no-repeat;
padding-left: 1.5em;
opacity: 0.67;
background-size: contain;
}

/* Space between the "Language" link and whatever follows it. */
#a-language {
/* color: #54595d; */
margin-right: 1em;
}

/* Layout helpers for the search-option checkboxes in templates/index.html.
   NOTE(review): the "tmp" names suggest these are placeholders — confirm. */
#cv-tmp1 > * {
vertical-align: middle;
}

.cv-tmp2 {
display: inline-block;
}

.cv-tmp2 > * {
vertical-align: middle;
}

/* Gap between adjacent option groups, except after the last one. */
.cv-tmp2:not(:last-child) {
margin-right: 1em;
}

/* General text spacing. */
ol, ul {
line-height:1.5;
}

h2{
margin-bottom:.2em;
}

/* Cap the width of the page-title text input. */
#cv-page-title {
max-width: 40em;
}

/* #header {
font-size:2.5em;
} */

/* #container {} */

/* Wrapper around the content card: minimum height, auto margins. */
#container {
min-height: 40em;
margin: auto;
}

/* Full-width footer bar with centred text. */
footer{
width: 100%;
font-size: .90em;
text-align: center;
line-height: 1.5;
border-top: 1px solid #c8ccd1;
background: #fff;
}

/* Footer list items flow inline... */
footer li {
display:inline;
}

/* ...separated by a middle dot (U+00B7) after every item but the last. */
footer li:not(:last-child):after{
content:" \00b7";
}

/* Notice box: outer spacing and tighter paragraph margins inside it. */
#notice-box{
padding:10px;
margin:10px 5px;
}

#notice-box p{
margin:.25em 0;
}









/* Compact one-line rules. Spacing and layout for ID-addressed containers. */
#notice-box ul{padding-left:1.5em;margin:0}
#info-box{padding:0 10px;margin:10px 5px}
#cv-result{padding:5px;margin:10px 5px}
#attribution-warning{padding:1em;margin:15px 5px 10px}
#turnitin-container{padding:5px;margin:15px 5px 10px}
#sources-container{padding:5px 10px;margin:15px 5px 10px;background-color:#eee;border:1px solid #bbb}
#sources-title,#turnitin-title{margin-bottom:-5px;text-align:center;font-weight:700}
#turnitin-summary{padding-bottom:5px;font-style:italic}
#cv-additional{display:none}
#generation-time{margin-right:5px;text-align:right;font-style:italic}
#cv-chain-container{margin:0 4px}
#heading{width:100%}
/* Form and result tables: fixed layouts and column widths. */
#cv-form-outer{table-layout: fixed;width: 100%;max-width: 60em;}
.cv-form-inner{table-layout: fixed;width:100%;border-spacing:0;}
#cv-result-sources{width:100%;border-spacing:0 4px;table-layout:fixed}
#cv-result-sources col:first-child{width:80%}
#cv-result-sources col:nth-child(2),#cv-result-sources col:nth-child(3){width:10%}
#cv-result-sources th{text-align:left}
#cv-result-sources tr:nth-child(2n){background-color:#e0e0e0}
#cv-result-sources td:first-child{overflow:hidden;word-wrap:break-word}
#cv-result-head-table{width:100%;text-align:center;table-layout:fixed;border-spacing:0}
#cv-result-head-table col:nth-child(odd){width:42.5%}
#cv-result-head-table col:nth-child(2){width:15%}
#cv-result-head-table td:nth-child(odd){font-size:1.25em;font-weight:700;overflow:hidden;word-wrap:break-word}
#cv-result-head-table td:nth-child(2) div:first-child{font-weight:700}
#cv-result-head-table td:nth-child(2) div:nth-child(2){font-size:2.5em;font-weight:700;line-height:1}
#cv-result-head-table td:nth-child(2) div:nth-child(3){font-size:.8em}
#cv-chain-table{margin-bottom:10px}
#cv-chain-table,#turnitin-table{width:100%;border-spacing:0;table-layout:fixed}
#turnitin-table{word-wrap:break-word}
#source-row-selected{background-color:#cfcfcf!important}
#head-settings{text-align:right}
#cv-col1{width:15%}
#cv-col2{width:55%}
#cv-col3{text-align:center}
#cv-col3,#cv-col4{width:15%}
/* NOTE(review): the three inner column widths below add up to 102% — confirm this is intended. */
#cv-inner-col1{width:4%}
#cv-inner-col2{width:22%}
#cv-inner-col3{width:76%}
#cv-result-header{margin:0}
#redirected-from{font-size:.75em;font-weight:400}
#redirected-from,#result-head-no-sources{font-style:italic}
#source-selected{font-weight:700}
/* #cv-cached: its child span is hidden by default and shown on hover (see the :hover rule further down). */
#cv-cached{position:relative}
#cv-cached span{display:none;position:absolute;top:20px;left:-50px;width:500px;padding:5px;z-index:1;background:#f3f3f3;border:1px solid #aaa;color:#000;font-style:normal;text-align:left}
/* Colored boxes. */
.green-box{background-color:#efe;border:1px solid #7f7}
.yellow-box{background-color:#ffd;border:1px solid #ee5}
.red-box{background-color:#fee;border:1px solid #f77}
.gray-box{background-color:#eee;border:1px solid #aaa}
.indentable{white-space:pre-wrap}
.cv-source-footer{padding-bottom:5px;font-style:italic}
.cv-source-footer a{font-style:normal}
.cv-chain-detail{padding:0 10px;background-color:#fff;border:1px solid #bbb}
.cv-chain-cell{vertical-align:top;word-wrap:break-word}
.cv-chain-cell:first-child{padding-right:5px}
.cv-chain-cell:last-child{padding-left:5px}
.turnitin-table-cell{padding:.5em 0 .3em}
.turnitin-table-cell ul{margin:.2em 0 0;line-height:1.4}
.cv-text{width:100%;}
.cv-search{margin-left:0;margin-right:5px}
.cv-search~.cv-search{margin-left:20px}
/* Highlight backgrounds; the -in/-out variants fade via a gradient, with a flat color declared first as a fallback. */
.cv-hl{background:#faa}
.cv-hl-in{background:#fcc;background:linear-gradient(270deg,#faa,#fff)}
.cv-hl-out{background:#fcc;background:linear-gradient(90deg,#faa,#fff)}
/* Text utility classes and source-status colors. */
.mono{font-family:monospace}
.light{color:#ccc}
.medium{color:#aaa}
.source-similarity{font-weight:700}
.source-suspect{color:#900}
.source-possible{color:#990}
.source-novio{color:#090}
.source-excluded,.source-skipped{font-style:italic}
/* Link colors and interaction states. */
a:link,a:visited{color: #002bb8;text-decoration:none;}
/* a:hover{ color:#040 } */
a:active,a:hover{text-decoration:underline}
a:active{color:#404}
#a-home:link,#a-home:visited{margin-right: 1em;font-size: 2em;/* font-weight: bold; */color: #54595d;}
#a-home:hover{color:#555}
#a-home:active{color:#333}
/* #a-settings:link,#a-settings:visited{color: #54595d;} */
/* #a-settings:hover{} */
#a-settings:active{color:#666}
#cv-cached:active{color:#040}
#cv-cached:active,#cv-cached:hover{text-decoration:none}
#cv-cached:hover span{display:block}
.source-url:link,.source-url:visited{color:#357}
.source-url:hover{color:#035}
.source-url:active{color:#404}

+ 37
- 0
templates/base.html View File

@@ -0,0 +1,37 @@
<!DOCTYPE html>
{# Shared page layout: head, header navigation, an overridable `content`
   block, and the footer. Reads `title` and `background` (`image_url`,
   `source_url`) from the template context. The doctype above keeps browsers
   out of quirks mode. #}
<html lang="en">
    <head>
        <meta charset="utf-8">
        <title>
            {% if title %}
                {{ title }} -
            {% endif %}
            Earwig's Copyvio Detector
        </title>
        <link rel="stylesheet" href="/static/style.css"><!-- TODO: add hash -->
        <style>
            /* TODO: add dynamic background style */
        </style>
    </head>
    <body style="background-image: url('{{ background.image_url }}');">
        <div id="container">
            <div id="container-inner">
                <header>
                    <a id="a-home" href="/">
                        Earwig's <strong>Copyvio Detector</strong></a>
                    <a id="a-language" href="/language">Language</a>
                    <a id="a-settings" href="/settings">Settings</a>
                </header>
                {% block content %}
                {% endblock %}
            </div>
        </div>
        <footer>
            <ul>
                <li>Maintained by <a href="https://en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a></li>
                <li><a href="/api">API</a></li>
                <li><a href="https://github.com/earwig/copyvios">Source code</a></li>
                <li><a href="{{ background.source_url }}">Background image</a></li>
            </ul>
        </footer>
    </body>
</html>

+ 114
- 0
templates/index.html View File

@@ -0,0 +1,114 @@
<!DOCTYPE html>
{# Index page: introduction text and the copyvio search/compare form. Reads
   `background` (`image_url`, `source_url`) from the template context. The
   doctype above keeps browsers out of quirks mode.
   NOTE(review): the head, header and footer repeat templates/base.html;
   consider extending that template once it can be shared. #}
<html lang="en">
    <head>
        <meta charset="utf-8">
        <title>Earwig's Copyvio Detector</title>
        <link rel="stylesheet" href="/static/style.css"><!-- TODO: add hash -->
        <style>
            /* TODO: add dynamic background style */
        </style>
    </head>
    <body style="background-image: url('{{ background.image_url }}');">
        <div id="container">
            <div id="container-inner">
                <header>
                    <a id="a-home" href="/">
                        Earwig's <strong>Copyvio Detector</strong></a>
                    <a id="a-language" href="/language">Language</a>
                    <a id="a-settings" href="/settings">Settings</a>
                </header>
                <p>This tool attempts to detect <a href="https://en.wikipedia.org/wiki/WP:COPYVIO">copyright violations</a> in Wikipedia articles. In <i>search mode</i>, it will check for similar content elsewhere on the web using <a href="https://developers.google.com/custom-search/">Google</a>, external links present in the page, or <a href="https://en.wikipedia.org/wiki/Wikipedia:Turnitin">Turnitin</a> (via <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>), depending on the selected options. In <i>comparison mode</i>, the tool will compare the article to a specific webpage without making any searches, like the <a href="https://dupdet.toolforge.org/">Duplication Detector</a>.</p>
                <p>Be aware that other websites can copy from Wikipedia, so check the results carefully, especially for older or well-developed articles. Specific websites can be skipped by being added to the <a href="https://en.wikipedia.org/wiki/User:EarwigBot/Copyvios/Exclusions">excluded URL list</a>.</p>
                <form id="cv-form" action="/" method="get">
                    <table id="cv-form-outer">
                        <tr>
                            <td style="width: 4em;">Site:</td>
                            <td colspan="3">
                                <select name="lang" style="width: 10em;">
                                    <option value="en" selected="selected">en (English)</option>
                                </select>
                                <select name="project" style="margin-left: 1em;">
                                    <option value="wikipedia" selected="selected">Wikipedia</option>
                                </select>
                            </td>
                        </tr>
                        <tr>
                            <td id="cv-col1">Page:</td>
                            <td id="cv-col2" colspan="3">
                                <table class="cv-form-inner">
                                    <tr>
                                        <td style="width: 8em;">
                                            <label for="cv-page-title">Title:</label>
                                        </td>
                                        <td>
                                            <input class="cv-text" type="text" name="title" id="cv-page-title">
                                        </td>
                                    </tr>
                                    <tr>
                                        <td>
                                            <label for="cv-rev-id">or revision ID:</label>
                                        </td>
                                        <td>
                                            <input class="cv-text" type="text" name="oldid" id="cv-rev-id" style="width: 10em;">
                                        </td>
                                    </tr>
                                </table>
                            </td>
                        </tr>
                        <tr>
                            <td>Action:</td>
                            <td colspan="3">
                                <table class="cv-form-inner">
                                    <tr>
                                        <td style="width: 12em;">
                                            <input id="action-search" type="radio" name="action" value="search" checked="checked" style="vertical-align: middle;">
                                            <label for="action-search" style="vertical-align: middle;">Copyvio search:</label>
                                        </td>
                                        <td id="cv-tmp1">
                                            {# Each checkbox is preceded by a hidden input of the same name with value "0". #}
                                            <div class="cv-tmp2">
                                                <input type="hidden" name="use_engine" value="0">
                                                <input id="cv-cb-engine" class="cv-search" type="checkbox" name="use_engine" value="1" checked="checked">
                                                <label for="cv-cb-engine">Use search engine</label>
                                            </div>
                                            <div class="cv-tmp2">
                                                <input type="hidden" name="use_links" value="0">
                                                <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" checked="checked">
                                                <label for="cv-cb-links">Use links in page</label>
                                            </div>
                                            <div class="cv-tmp2">
                                                <input type="hidden" name="turnitin" value="0">
                                                <input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1">
                                                <label for="cv-cb-turnitin">Use Turnitin</label>
                                            </div>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td>
                                            <input id="action-compare" type="radio" name="action" value="compare" style="vertical-align: middle;">
                                            <label for="action-compare" style="vertical-align: middle;">URL comparison:</label>
                                        </td>
                                        <td>
                                            <input class="cv-compare cv-text" type="text" name="url" disabled="">
                                        </td>
                                    </tr>
                                </table>
                            </td>
                        </tr>
                        <tr>
                            <td colspan="4">
                                <input type="submit">
                            </td>
                        </tr>
                    </table>
                </form>
            </div>
        </div>
        <footer>
            <ul>
                <li>Maintained by <a href="https://en.wikipedia.org/wiki/User:The_Earwig">Ben Kurtovic</a></li>
                <li><a href="/api">API</a></li>
                <li><a href="https://github.com/earwig/copyvios">Source code</a></li>
                <li><a href="{{ background.source_url }}">Background image</a></li>
            </ul>
        </footer>
    </body>
</html>

Loading…
Cancel
Save