Преглед на файлове

Some more work on copyvio detection code

Also removed the hardcoded version in user-agent strings.
tags/v0.1^2
Ben Kurtovic преди 13 години
родител
ревизия
24f7eabb77
променени са 6 файла, в които са добавени 103 реда и са изтрити 36 реда
  1. +4
    -2
      earwigbot/commands/ctcp.py
  2. +10
    -7
      earwigbot/tasks/afc_copyvios.py
  3. +2
    -1
      earwigbot/wiki/constants.py
  4. +83
    -25
      earwigbot/wiki/copyright.py
  5. +3
    -1
      earwigbot/wiki/functions.py
  6. +1
    -0
      earwigbot/wiki/page.py

+ 4
- 2
earwigbot/commands/ctcp.py Целия файл

@@ -23,6 +23,7 @@
import platform
import time

import earwigbot
from earwigbot.classes import BaseCommand
from earwigbot.config import config

@@ -61,7 +62,8 @@ class Command(BaseCommand):
self.connection.notice(target, "\x01TIME {0}\x01".format(ts))

elif command == "VERSION":
default = "EarwigBot - 0.1-dev - Python/$1 https://github.com/earwig/earwigbot"
default = "EarwigBot - $1 - Python/$2 https://github.com/earwig/earwigbot"
vers = config.irc.get("version", default)
vers = vers.replace("$1", platform.python_version())
vers = vers.replace("$1", earwigbot.__version__)
vers = vers.replace("$2", platform.python_version())
self.connection.notice(target, "\x01VERSION {0}\x01".format(vers))

+ 10
- 7
earwigbot/tasks/afc_copyvios.py Целия файл

@@ -89,22 +89,25 @@ class Task(BaseTask):
return

self.logger.info("Checking [[{0}]]".format(title))
content = page.get()
result = page.copyvio_check(self.engine, self.credentials,
self.min_confidence, self.max_queries)
if result.url:
url = result.url
url = result.url
confidence = "{0}%".format(round(result.confidence * 100, 2))

if result.violation:
content = page.get()
# Literal braces in a str.format() template are written by doubling them
# ("{{" and "}}"); backslashes do not escape them. Four braces on each side
# therefore render as the "{{" and "}}" that delimit the wiki template.
template = "{{{{{0}|url={1}|confidence={2}}}}}"
template = template.format(self.template, url, confidence)
newtext = "\n".join((template, content))
if "{url}" in self.summary:
page.edit(newtext, self.summary.format(url=url))
else:
page.edit(newtext, self.summary)
msg = "Found violation: [[{0}]] -> {1}"
self.logger.warn(msg.format(title, url))
msg = "Found violation: [[{0}]] -> {1} ({2} confidence)"
self.logger.warn(msg.format(title, url, confidence))
else:
# Positional fields are zero-based: two arguments are passed to format(),
# so the placeholders must be {0} and {1}.
msg = "No violations detected (best: {0} at {1} confidence)"
self.logger.debug(msg.format(url, confidence))

self.log_processed(pageid)



+ 2
- 1
earwigbot/wiki/constants.py Целия файл

@@ -31,8 +31,9 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const
"""

# Default User Agent when making API queries:
from earwigbot import __version__ as _v
from platform import python_version as _p
USER_AGENT = "EarwigBot/0.1-dev (Python/{0}; https://github.com/earwig/earwigbot)".format(_p())
USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p())

# Default namespace IDs:
NS_MAIN = 0


+ 83
- 25
earwigbot/wiki/copyright.py Целия файл

@@ -20,9 +20,13 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from functools import partial
from gzip import GzipFile
from json import loads
from StringIO import StringIO
from time import sleep, time
from urllib import quote_plus, urlencode
from urllib2 import build_opener, URLError

try:
import oauth2 as oauth
@@ -32,14 +36,15 @@ except ImportError:
from earwigbot.wiki.exceptions import *

class CopyvioCheckResult(object):
    """Container for the outcome of a copyright-violation check.

    The four constructor arguments are stored unchanged as attributes of
    the same name: "violation", "confidence", "url", and "queries".
    """

    def __init__(self, violation, confidence, url, queries):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries

    def __repr__(self):
        """Return a constructor-style representation of the result."""
        # "!r" is the repr() conversion flag; a "|" in its place is parsed as
        # part of the field name and makes format() raise.
        r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return r.format(self.violation, self.confidence, self.url, self.queries)


class CopyrightMixin(object):
@@ -50,7 +55,57 @@ class CopyrightMixin(object):
checks the page for copyright violations using a search engine API. The
API keys must be provided to the method as arguments.
"""
def _yahoo_boss_query(self, query, cred):
def __init__(self):
    """Create a private URL opener that sends the same headers as the site.

    Reads self._site, so that attribute must already be assigned when this
    initializer runs.
    """
    self._opener = build_opener()
    site_opener = self._site._opener
    self._opener.addheaders = site_opener.addheaders

def _open_url_ignoring_errors(self, url):
    """Open a URL using self._opener and return its content, or None.

    Will decompress the content if the headers contain "gzip" as its
    content encoding, and will return None if URLError is raised while
    opening the URL. IOErrors while gunzipping a compressed response are
    ignored, and the original content is returned.

    The response object is always closed once its body and headers have
    been read, so the underlying connection is not left open.
    """
    try:
        response = self._opener.open(url)
    except URLError:
        return None

    try:
        result = response.read()
        # Grab the header before closing so nothing is read from a closed
        # response object afterwards.
        encoding = response.headers.get("Content-Encoding")
    finally:
        response.close()

    if encoding == "gzip":
        gzipper = GzipFile(fileobj=StringIO(result))
        try:
            result = gzipper.read()
        except IOError:
            # Deliberately best-effort: a mislabelled or corrupt gzip body
            # falls back to the raw content.
            pass

    return result

def _select_search_engine(self, engine, credentials):
    """Build the callable used to run web searches for 'engine'.

    The result is a functools.partial with 'credentials' already bound as
    the first argument of the engine's query method, so callers pass only
    the query. Currently the only recognized engine is "Yahoo! BOSS",
    which is served by self._yahoo_boss_query.

    Raises UnknownSearchEngineError if 'engine' is not recognized, and
    UnsupportedSearchEngineError if the engine is recognized but the
    'oauth2' package it needs could not be imported.
    """
    if engine != "Yahoo! BOSS":
        raise UnknownSearchEngineError(engine)

    if not oauth:
        e = "The package 'oauth2' could not be imported"
        raise UnsupportedSearchEngineError(e)

    return partial(self._yahoo_boss_query, credentials)

def _yahoo_boss_query(self, cred, query):
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

Returns a list of URLs, no more than fifty, ranked by relevance (as
@@ -84,21 +139,27 @@ class CopyrightMixin(object):
def _copyvio_strip_content(self, content):
return content

def _copyvio_explode_content(self, content):
return content
def _copyvio_chunk_content(self, content):
return [content]

def _copyvio_compare_content(self, content, url):
    """Return a confidence value that 'content' was copied from 'url'.

    Fetches the URL with self._open_url_ignoring_errors and returns 0 when
    nothing could be retrieved. No comparison is implemented yet, so the
    returned confidence is currently always 0.
    """
    html = self._open_url_ignoring_errors(url)
    if not html:
        return 0

    # Placeholder until the fetched HTML is actually compared to 'content'.
    confidence = 0
    return confidence

def copyvio_check(self, engine, credentials, min_confidence=0.5,
def copyvio_check(self, engine, credentials, min_confidence=0.75,
max_queries=-1, interquery_sleep=1, force=False):
"""Check the page for copyright violations.

Returns a CopyvioCheckResult object, with three useful attributes:
"confidence", "url", and "queries". "confidence" is a number between
0 and 1; if it is less than min_confidence, we could not find any
indication of a violation (so "url" will be None), otherwise it
indicates the relative faith in our results, and "url" will be the
Returns a CopyvioCheckResult object, with four useful attributes:
"violation", "confidence", "url", and "queries". "confidence" is a
number between 0 and 1; if it is less than "min_confidence", we could
not find any indication of a violation (so "violation" will be False
and "url" may or may not be None), otherwise it indicates the relative
faith in our results, "violation" will be True, and "url" will be the
place the article is suspected of being copied from. "queries" is the
number of queries used to determine the results.

@@ -115,26 +176,19 @@ class CopyrightMixin(object):
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors.
"""
if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
raise UnsupportedSearchEngineError(e)
querier = self._yahoo_boss_query
else:
raise UnknownSearchEngineError(engine)

search = self._select_search_engine(engine, credentials)
handled_urls = []
best_confidence = 0
best_match = None
num_queries = 0
content = self.get(force)
clean = self._copyvio_strip_content(content)
fragments = self._copyvio_explode_content(clean)
chunks = self._copyvio_chunk_content(clean)
last_query = time()

while (fragments and best_confidence < min_confidence and
while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)):
urls = querier(fragments.pop(0), credentials)
urls = search(chunks.pop(0))
urls = [url for url in urls if url not in handled_urls]
for url in urls:
confidence = self._copyvio_compare_content(content, url)
@@ -147,4 +201,8 @@ class CopyrightMixin(object):
sleep(interquery_sleep - diff)
last_query = time()

return CopyvioCheckResult(best_confidence, best_match, num_queries)
if best_confidence >= min_confidence: # violation?
vi = True
else:
vi = False
return CopyvioCheckResult(vi, best_confidence, best_match, num_queries)

+ 3
- 1
earwigbot/wiki/functions.py Целия файл

@@ -37,6 +37,7 @@ from os import chmod, path
import platform
import stat

import earwigbot
from earwigbot.config import config
from earwigbot.wiki.exceptions import SiteNotFoundError
from earwigbot.wiki.site import Site
@@ -111,7 +112,8 @@ def _get_site_object_from_dict(name, d):
maxlag = config.wiki.get("maxlag")

if user_agent:
user_agent = user_agent.replace("$1", platform.python_version())
user_agent = user_agent.replace("$1", earwigbot.__version__)
user_agent = user_agent.replace("$2", platform.python_version())

for key, value in namespaces.items(): # Convert string keys to integers
del namespaces[key]


+ 1
- 0
earwigbot/wiki/page.py Целия файл

@@ -69,6 +69,7 @@ class Page(CopyrightMixin):
__init__ will not do any API queries, but it will use basic namespace
logic to determine our namespace ID and if we are a talkpage.
"""
# CopyrightMixin.__init__ copies headers from self._site._opener, so the
# site has to be stored before the superclass initializer is invoked.
self._site = site
super(Page, self).__init__()
self._title = title.strip()
self._follow_redirects = self._keep_following = follow_redirects


Зареждане…
Отказ
Запис