Ver código fonte

More work on copyright violation detection code.

tags/v0.1^2
Ben Kurtovic 13 anos atrás
pai
commit
56e6140284
2 arquivos alterados com 98 adições e 11 exclusões
  1. +25
    -7
      earwigbot/tasks/afc_copyvios.py
  2. +73
    -4
      earwigbot/wiki/copyright.py

+ 25
- 7
earwigbot/tasks/afc_copyvios.py Ver arquivo

@@ -42,6 +42,8 @@ class Task(BaseTask):
cfg = config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", [])
self.min_confidence = cfg.get("minConfidence", 0.75)
self.max_queries = cfg.get("maxQueries", 10)
default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
self.summary = self.make_summary(cfg.get("summary", default_summary))

@@ -74,28 +76,40 @@ class Task(BaseTask):

def process(self, page):
"""Detect copyvios in 'page' and add a note if any are found."""
title = page.title()
if title in self.ignore_list:
msg = "Skipping page in ignore list: [[{0}]]"
self.logger.info(msg.format(title))
return

pageid = page.pageid()
if self.has_been_processed(pageid):
msg = "Skipping check on already processed page [[{0}]]"
self.logger.info(msg.format(page.title()))
self.logger.info(msg.format(title))
return

self.logger.info("Checking [[{0}]]".format(page.title()))
self.logger.info("Checking [[{0}]]".format(title))
content = page.get()
result = page.copyvio_check(self.engine, self.credentials)
if result:
result = page.copyvio_check(self.engine, self.credentials,
self.min_confidence, self.max_queries)
if result.url:
url = result.url
content = page.get()
template = "\{\{{0}|url={1}\}\}".format(self.template, result)
template = "\{\{{0}|url={1}\}\}".format(self.template, url)
newtext = "\n".join((template, content))
page.edit(newtext, self.summary.format(url=result))
if "{url}" in self.summary:
page.edit(newtext, self.summary.format(url=url))
else:
page.edit(newtext, self.summary)
msg = "Found violation: [[{0}]] -> {1}"
self.logger.info(msg.format(page.title(), result))
self.logger.warn(msg.format(title, url))
else:
self.logger.debug("No violations detected")

self.log_processed(pageid)

def has_been_processed(self, pageid):
"""Returns True if pageid was processed before, otherwise False."""
query = "SELECT 1 FROM processed WHERE page_id = ?"
with self.conn.cursor() as cursor:
cursor.execute(query, (pageid,))
@@ -105,6 +119,10 @@ class Task(BaseTask):
return False

def log_processed(self, pageid):
"""Adds pageid to our database of processed pages.

Raises an exception if the page has already been processed.
"""
query = "INSERT INTO processed VALUES (?)"
with self.conn.cursor() as cursor:
cursor.execute(query, (pageid,))

+ 73
- 4
earwigbot/wiki/copyright.py Ver arquivo

@@ -21,6 +21,7 @@
# SOFTWARE.

from json import loads
from time import sleep, time
from urllib import quote_plus, urlencode

try:
@@ -30,6 +31,17 @@ except ImportError:

from earwigbot.wiki.exceptions import *

class CopyvioCheckResult(object):
def __init__(self, confidence, url, queries):
self.confidence = confidence
self.url = url
self.queries = queries

def __repr__(self):
r = "CopyvioCheckResult(confidence={0!r}, url={1!r}, queries={2|r})"
return r.format(self.confidence, self.url, self.queries)


class CopyrightMixin(object):
"""
EarwigBot's Wiki Toolset: Copyright Violation Mixin
@@ -45,7 +57,8 @@ class CopyrightMixin(object):
determined by Yahoo). Raises SearchQueryError() on errors.
"""
base_url = "http://yboss.yahooapis.com/ysearch/web"
params = {"q": quote_plus(query), "style": "raw", "format": "json"}
query = quote_plus(query.join('"', '"'))
params = {"q": query, "style": "raw", "format": "json"}
url = "{0}?{1}".format(base_url, urlencode(params))

consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
@@ -68,8 +81,40 @@ class CopyrightMixin(object):
return []
return [result["url"] for result in results]

def copyvio_check(self, engine, credentials, force=False):
"""Check the page for copyright violations."""
def _copyvio_strip_content(self, content):
return content

def _copyvio_explode_content(self, content):
return content

def _copyvio_compare_content(self, content, url):
return 0

def copyvio_check(self, engine, credentials, min_confidence=0.5,
max_queries=-1, interquery_sleep=1, force=False):
"""Check the page for copyright violations.

Returns a CopyvioCheckResult object, with three useful attributes:
"confidence", "url", and "queries". "confidence" is a number between
0 and 1; if it is less than min_confidence, we could not find any
indication of a violation (so "url" will be None), otherwise it
indicates the relative faith in our results, and "url" will be the
place the article is suspected of being copied from. "queries" is the
number of queries used to determine the results.

"max_queries" is self-explanatory; we will never make more than this
number of queries in a given check. If it's less than 0, we will not
limit our number of queries.

"interquery_sleep" is the minimum amount of time we will sleep between
search engine queries, in seconds.

"force" is simply passed to page.get() - it has the same behavior there
as it does here.

Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors.
"""
if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
@@ -77,5 +122,29 @@ class CopyrightMixin(object):
querier = self._yahoo_boss_query
else:
raise UnknownSearchEngineError(engine)

handled_urls = []
best_confidence = 0
best_match = None
num_queries = 0
content = self.get(force)
return querier(content, credentials)
clean = self._copyvio_strip_content(content)
fragments = self._copyvio_explode_content(clean)
last_query = time()

while (fragments and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)):
urls = querier(fragments.pop(0), credentials)
urls = [url for url in urls if url not in handled_urls]
for url in urls:
confidence = self._copyvio_compare_content(content, url)
if confidence > best_confidence:
best_confidence = confidence
best_match = url
num_queries += 1
diff = time() - last_query
if diff < interquery_sleep:
sleep(interquery_sleep - diff)
last_query = time()

return CopyvioCheckResult(best_confidence, best_match, num_queries)

Carregando…
Cancelar
Salvar