Browse Source

Adjust mirror hints to include direct links back to the article.

tags/v0.3
Ben Kurtovic 8 years ago
parent
commit
69cdb41d07
2 changed files with 15 additions and 13 deletions
  1. +3
    -3
      earwigbot/wiki/copyvios/__init__.py
  2. +12
    -10
      earwigbot/wiki/copyvios/exclusions.py

+ 3
- 3
earwigbot/wiki/copyvios/__init__.py View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
# #
# Permission is hereby granted, free of charge, to any person obtaining a copy # Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal # of this software and associated documentation files (the "Software"), to deal
@@ -124,8 +124,8 @@ class CopyvioMixIn(object):
if self._exclusions_db: if self._exclusions_db:
self._exclusions_db.sync(self.site.name) self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u) exclude = lambda u: self._exclusions_db.check(self.site.name, u)
parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(
self.site.name)
parser_args["mirror_hints"] = \
self._exclusions_db.get_mirror_hints(self)
else: else:
exclude = None exclude = None




+ 12
- 10
earwigbot/wiki/copyvios/exclusions.py View File

@@ -122,7 +122,7 @@ class ExclusionsDB(object):
site = self._sitesdb.get_site("enwiki") site = self._sitesdb.get_site("enwiki")
else: else:
site = self._sitesdb.get_site(sitename) site = self._sitesdb.get_site(sitename)
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
urls = set() urls = set()
for (source,) in conn.execute(query1, (sitename,)): for (source,) in conn.execute(query1, (sitename,)):
urls |= self._load_source(site, source) urls |= self._load_source(site, source)
@@ -140,7 +140,7 @@ class ExclusionsDB(object):
def _get_last_update(self, sitename): def _get_last_update(self, sitename):
"""Return the UNIX timestamp of the last time the db was updated.""" """Return the UNIX timestamp of the last time the db was updated."""
query = "SELECT update_time FROM updates WHERE update_sitename = ?" query = "SELECT update_time FROM updates WHERE update_sitename = ?"
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
try: try:
result = conn.execute(query, (sitename,)).fetchone() result = conn.execute(query, (sitename,)).fetchone()
except sqlite.OperationalError: except sqlite.OperationalError:
@@ -176,7 +176,7 @@ class ExclusionsDB(object):
normalized = re.sub(r"^https?://(www\.)?", "", url.lower()) normalized = re.sub(r"^https?://(www\.)?", "", url.lower())
query = """SELECT exclusion_url FROM exclusions query = """SELECT exclusion_url FROM exclusions
WHERE exclusion_sitename = ? OR exclusion_sitename = ?""" WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
for (excl,) in conn.execute(query, (sitename, "all")): for (excl,) in conn.execute(query, (sitename, "all")):
if excl.startswith("*."): if excl.startswith("*."):
parsed = urlparse(url.lower()) parsed = urlparse(url.lower())
@@ -200,21 +200,23 @@ class ExclusionsDB(object):
self._logger.debug(log) self._logger.debug(log)
return False return False


def get_mirror_hints(self, sitename, try_mobile=True):
def get_mirror_hints(self, page, try_mobile=True):
"""Return a list of strings that indicate the existence of a mirror. """Return a list of strings that indicate the existence of a mirror.


The source parser checks for the presence of these strings inside of The source parser checks for the presence of these strings inside of
certain HTML tag attributes (``"href"`` and ``"src"``). certain HTML tag attributes (``"href"`` and ``"src"``).
""" """
site = self._sitesdb.get_site(sitename)
base = site.domain + site._script_path
roots = [base]
site = page.site
path = urlparse(page.url).path
roots = [site.domain]
scripts = ["index.php", "load.php", "api.php"] scripts = ["index.php", "load.php", "api.php"]


if try_mobile: if try_mobile:
fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain)
if fragments: if fragments:
mobile = "{0}.m.{1}.{2}".format(*fragments.groups())
roots.append(mobile + site._script_path)
roots.append("{0}.m.{1}.{2}".format(*fragments.groups()))


return [root + "/" + script for root in roots for script in scripts]
general = [root + site._script_path + "/" + script
for root in roots for script in scripts]
specific = [root + path for root in roots]
return general + specific

Loading…
Cancel
Save