Browse Source

Check for preexisting copyvio tags; more prep for trial.

pull/15/head
Ben Kurtovic 11 years ago
parent
commit
162dde589c
1 changed files with 32 additions and 4 deletions
  1. +32
    -4
      tasks/afc_copyvios.py

+ 32
- 4
tasks/afc_copyvios.py View File

@@ -25,6 +25,7 @@ from os.path import expanduser
from threading import Lock from threading import Lock
from urllib import quote from urllib import quote


import mwparserfromhell
import oursql import oursql


from earwigbot.tasks import Task from earwigbot.tasks import Task
@@ -45,6 +46,9 @@ class AFCCopyvios(Task):
self.cache_results = cfg.get("cacheResults", False) self.cache_results = cfg.get("cacheResults", False)
default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}." default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}."
self.summary = self.make_summary(cfg.get("summary", default_summary)) self.summary = self.make_summary(cfg.get("summary", default_summary))
default_tags = [
"Db-g12", "Db-copyvio", "Copyvio", "Copyviocore" "Copypaste"]
self.tags = default_tags + cfg.get("tags", [])


# Connection data for our SQL database: # Connection data for our SQL database:
kwargs = cfg.get("sql", {}) kwargs = cfg.get("sql", {})
@@ -81,6 +85,10 @@ class AFCCopyvios(Task):
msg = u"Skipping check on already processed page [[{0}]]" msg = u"Skipping check on already processed page [[{0}]]"
self.logger.info(msg.format(title)) self.logger.info(msg.format(title))
return return
elif self.is_tagged(page.get()):
msg = u"Skipping check on already tagged page [[{0}]]"
self.logger.info(msg.format(title))
return


self.logger.info(u"Checking [[{0}]]".format(title)) self.logger.info(u"Checking [[{0}]]".format(title))
result = page.copyvio_check(self.min_confidence, self.max_queries, result = page.copyvio_check(self.min_confidence, self.max_queries,
@@ -92,12 +100,20 @@ class AFCCopyvios(Task):
# Things can change in the minute that it takes to do a check. # Things can change in the minute that it takes to do a check.
# Confirm that a violation still holds true: # Confirm that a violation still holds true:
page.load() page.load()
if self.is_tagged(page.get()):
msg = u"A violation was detected in [[{0}]], but it was tagged"
msg += " by someone else while checking (best: {1} at {2} confidence)"
self.logger.info(msg.format(title, url, orig_conf))
self._trial_reporter(title, False, url, orig_conf, result.queries, result.time, msg)
self.log_processed(pageid)
return
confirm = page.copyvio_compare(url, self.min_confidence) confirm = page.copyvio_compare(url, self.min_confidence)
new_conf = "{0}%".format(round(confirm.confidence * 100, 2)) new_conf = "{0}%".format(round(confirm.confidence * 100, 2))
if not confirm.violation: if not confirm.violation:
msg = u"A violation was detected in [[{0}]], but couldn't be confirmed." msg = u"A violation was detected in [[{0}]], but couldn't be confirmed."
msg += u" It may have just been edited (best: {1} at {2} -> {3} confidence)" msg += u" It may have just been edited (best: {1} at {2} -> {3} confidence)"
self.logger.info(msg.format(title, url, orig_conf, new_conf)) self.logger.info(msg.format(title, url, orig_conf, new_conf))
self._trial_reporter(title, False, url, orig_conf, result.queries, result.time, msg)
self.log_processed(pageid) self.log_processed(pageid)
return return


@@ -112,27 +128,39 @@ class AFCCopyvios(Task):
# page.edit(newtext, self.summary) # page.edit(newtext, self.summary)
msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)"
self.logger.info(msg.format(title, url, new_conf)) self.logger.info(msg.format(title, url, new_conf))
self._trial_reporter(True, url, new_conf, result.queries, result.time)
self._trial_reporter(title, True, url, new_conf, result.queries, result.time, msg)
else: else:
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)"
self.logger.info(msg.format(title, url, orig_conf)) self.logger.info(msg.format(title, url, orig_conf))
self._trial_reporter(False, url, orig_conf, result.queries, result.time)
self._trial_reporter(title, False, url, orig_conf, result.queries, result.time, msg)


self.log_processed(pageid) self.log_processed(pageid)
if self.cache_results: if self.cache_results:
self.cache_result(page, result) self.cache_result(page, result)


def _trial_reporter(self, violation, url, conf, queries, time):
def _trial_reporter(self, title, violation, url, conf, queries, time, msg):
from datetime import datetime
date = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
data = u"\n" + ("-" * 80) + u"\n" data = u"\n" + ("-" * 80) + u"\n"
data += u"[[{0}]]\n".format(title)
data += u"[[{0}]] at {1}\n".format(title, date)
data += u"Violation? {0}\n".format("***YES***" if violation else "No") data += u"Violation? {0}\n".format("***YES***" if violation else "No")
data += u"Confidence: {0}\n".format(conf) data += u"Confidence: {0}\n".format(conf)
data += u"Best match: {0}\n".format(url) data += u"Best match: {0}\n".format(url)
data += u"Num queries: {0}\n".format(queries) data += u"Num queries: {0}\n".format(queries)
data += u"Time: {0}\n".format(time) data += u"Time: {0}\n".format(time)
data += u"Log message: {0}\n".format(msg)
with open("/data/project/earwigbot/public_html/copyvio_bot_trial.txt", "a") as fp: with open("/data/project/earwigbot/public_html/copyvio_bot_trial.txt", "a") as fp:
fp.write(data.encode("utf8")) fp.write(data.encode("utf8"))


def is_tagged(self, text):
"""Return whether the text contains a copyvio check template."""
code = mwparserfromhell.parse(text)
for template in code.ifilter_templates():
for tag in self.tags:
if template.name.matches(tag):
return True
return False

def has_been_processed(self, pageid): def has_been_processed(self, pageid):
"""Returns True if pageid was processed before, otherwise False.""" """Returns True if pageid was processed before, otherwise False."""
query = "SELECT 1 FROM processed WHERE page_id = ?" query = "SELECT 1 FROM processed WHERE page_id = ?"


Loading…
Cancel
Save