Merge pull request #24 from fhocutt/master

Add integration with turnitin/plagiabot/EranBot
8 lat temu · d31e24fc62
--- a/app.py
+++ b/app.py
@@ -103,7 +103,8 @@ def index():
    update_sites()
    query = do_check()
    return render_template(
        "index.mako", notice=notice, query=query, result=query.result)
        "index.mako", notice=notice, query=query, result=query.result,
        turnitin_result=query.turnitin_result)

@app.route("/settings", methods=["GET", "POST"])
@catch_errors
--- a/copyvios/checker.py
+++ b/copyvios/checker.py
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

 from .misc import Query, get_db
 from .sites import get_site
 from .turnitin import search_turnitin

 __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
        conn = get_db()
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 0 if query.turnitin in ("0", "false") else 1
        if not use_engine and not use_links:
            query.error = "no search method"
            return

        # Handle the turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(query.title, query.lang)

        # Handle the copyvio check
        mode = "{0}:{1}:".format(use_engine, use_links)
        if not _coerce_bool(query.nocache):
            query.result = _get_cached_results(
--- a/copyvios/misc.py
+++ b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8  -*-

 import datetime
 from os.path import expanduser

 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
        url = url[len("http:"):]
    return url

 def parse_wiki_timestamp(timestamp):
    """Convert a wiki timestamp string into a naive datetime object.

    The string must be in ``YYYYMMDDHHMMSS`` form; ``strptime`` raises
    ValueError when it is not.
    """
    wiki_format = '%Y%m%d%H%M%S'
    parsed = datetime.datetime.strptime(timestamp, wiki_format)
    return parsed

 def urlstrip(context, url):
    if url.startswith("http://"):
        url = url[7:]
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -0,0 +1,104 @@
 # -*- coding: utf-8 -*-
 from ast import literal_eval
 import re

 import requests

 from .misc import parse_wiki_timestamp

 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

 # Plagiabot (EranBot) API endpoint. Requested over HTTPS so that page titles
 # and report contents are not transferred in cleartext.
 TURNITIN_API_ENDPOINT = 'https://tools.wmflabs.org/eranbot/plagiabot/api.py'

 def search_turnitin(page_title, lang):
    """ Look up Turnitin reports for a page in the Plagiabot database.

    Keyword arguments:
    page_title -- string containing the page title
    lang       -- string containing the page's project language code

    Return a TurnitinResult built from the API response (its ``reports``
    list holds one TurnitinReport per item in that response).
    """
    api_response = _make_api_request(page_title, lang)
    return TurnitinResult(api_response)

 def _make_api_request(page_title, lang, timeout=10):
    """ Query the plagiabot API for Turnitin reports for a given page.

    Keyword arguments:
    page_title -- page title; spaces are replaced with underscores
    lang       -- language code, passed through as the 'lang' parameter
    timeout    -- seconds to wait on the API before requests gives up
                  (default 10). Without a timeout a stalled server would
                  block the caller indefinitely.

    Return the response body evaluated as a Python literal.

    Raises a requests exception on network failure or timeout, and
    ValueError/SyntaxError if the body is not a valid Python literal.
    """
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': page_title.replace(' ', '_'),
                      'lang': lang,
                      'report': 1}

    response = requests.get(
        TURNITIN_API_ENDPOINT, params=api_parameters, timeout=timeout)
    # literal_eval accepts only Python literals, so the response text is
    # parsed without executing any code it might contain.
    return literal_eval(response.text)

 class TurnitinResult:
    """ Collection of the TurnitinReports found for one page.

    A page may have no reports at all, or several when plagiarism was
    detected for more than one revision.

    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
    """
    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- plagiabot API result: an iterable of mappings with
                         'diff_timestamp', 'diff' and 'report' keys
        """
        self.reports = [
            TurnitinReport(
                entry['diff_timestamp'], entry['diff'], entry['report'])
            for entry in turnitin_data]

    def __repr__(self):
        # Show the instance attributes as a plain dict.
        return str(vars(self))

 class TurnitinReport:
    """ Contains data for each Turnitin report (one on each potentially
    plagiarized revision).

    TurnitinReport.reportid  -- Turnitin report ID, taken from plagiabot;
                                None if the report text carries no ID
    TurnitinReport.diffid    -- diff ID from Wikipedia database
    TurnitinReport.time_posted -- datetime of the time the diff posted
    TurnitinReport.sources   -- list of dicts with information on:
        percent -- percent of revision found in source as well
        words   -- number of words found in both source and revision
        url     -- url for the possibly-plagiarized source
    The percent and words values are the digit strings captured from the
    report text; they are not converted to numbers.
    """
    # Compiled once when the class is created instead of on every parse.
    _REPORT_ID_PATTERN = re.compile(r'\?rid=(\d*)')
    _SOURCE_INFO_PATTERN = re.compile(
        r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')

    def __init__(self, timestamp, diffid, report):
        """
        Keyword argument:
        timestamp  -- diff timestamp from Wikipedia database
        diffid     -- diff ID from Wikipedia database
        report     -- Turnitin report from the plagiabot database
        """
        self.report_data = self._parse_report(report)
        self.reportid = self.report_data[0]
        self.diffid = diffid
        self.time_posted = parse_wiki_timestamp(timestamp)

        self.sources = [
            {'percent': percent, 'words': words, 'url': url}
            for percent, words, url in self.report_data[1]]

    def __repr__(self):
        return str(self.__dict__)

    def _parse_report(self, report_text):
        """ Extract the report ID and the per-source matches from a report.

        Return a (report_id, results) tuple. report_id is the digits that
        follow '?rid=' in the text, or None when the text has no such
        marker. results is a list of (percent, words, url) string tuples,
        one per source line found.
        """
        # A report without an ID must not abort the whole page with an
        # AttributeError on a failed match, so fall back to None.
        id_match = self._REPORT_ID_PATTERN.search(report_text)
        report_id = id_match.group(1) if id_match else None

        # extract percent match, words, and URL for each source in the report
        results = self._SOURCE_INFO_PATTERN.findall(report_text)

        return (report_id, results)
--- a/static/style.css
+++ b/static/style.css
@@ -63,6 +63,17 @@ div#info-box {
    margin: 10px 5px;
 }

 /* Box that wraps the Turnitin results section. */
 div#turnitin-container {
    margin: 15px 5px 10px;
    padding: 5px 10px;
 }

 /* Centered, bold heading shown at the top of the Turnitin box. */
 div#turnitin-title {
    font-weight: bold;
    text-align: center;
    margin-bottom: -5px;
 }

 div#cv-result {
    padding: 5px;
    margin: 10px 5px;
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -113,6 +113,9 @@
                            <input class="cv-search" type="hidden" name="use_links" value="0" />
                            <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
                            <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
                            ## Hidden fallback so that unchecking the box still submits turnitin=0.
                            <input class="cv-search" type="hidden" name="turnitin" value="0" />
                            <span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
                            <label for="cv-cb-turnitin">Use&nbsp;Turnitin&nbsp;database</label></span>
                        </td>
                    </tr>
                    <tr>
@@ -146,6 +149,7 @@
        </tr>
    </table>
 </form>

 % if result:
    <div id="generation-time">
        Results
@@ -160,6 +164,29 @@
        % endif
        <a href="${request.script_root | h}?lang=${query.lang | h}&amp;project=${query.project | h}&amp;oldid=${query.oldid or query.page.lastrevid | h}&amp;action=${query.action | h}&amp;${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a>
    </div>

    ## Render the Turnitin box only when a lookup produced a result object.
    ## query.turnitin is the raw form value, so "0" would still be truthy here.
    % if query.turnitin_result:
        <div id="turnitin-container" class="${'red' if query.turnitin_result.reports else 'green'}-box">
            <div id="turnitin-title">Turnitin Results</div>
            % if query.turnitin_result.reports:
                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>

                <table id="turnitin-table"><tbody>
                ## Values are escaped (h = HTML, u = URL) because titles and
                ## source URLs originate outside this application.
                % for report in query.turnitin_result.reports:
                    <tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid | u}">Turnitin report ${report.reportid | h}</a> for text added <a href="https://${query.lang | h}.wikipedia.org/w/index.php?title=${query.title | u}&amp;diff=${report.diffid | u}"> at ${report.time_posted | h}</a>:
                    <ul>
                    % for source in report.sources:
                          <li> ${source['percent'] | h}% of revision text (${source['words'] | h} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
                    % endfor
                    </ul></td></tr>
                % endfor
                </tbody></table>
            % else:
                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
            % endif
        </div>
    % endif

    <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
        <table id="cv-result-head-table">
            <colgroup>