Parcourir la source

Finish mirror hinting algorithm.

tags/v0.2
Ben Kurtovic il y a 9 ans
Parent
révision
108eca13ac
2 fichiers modifiés avec 27 ajouts et 3 suppressions
  1. +8
    -3
      earwigbot/wiki/copyvios/__init__.py
  2. +19
    -0
      earwigbot/wiki/copyvios/exclusions.py

+ 8
- 3
earwigbot/wiki/copyvios/__init__.py Voir le fichier

@@ -116,15 +116,20 @@ class CopyvioMixIn(object):
searcher = self._get_search_engine()
parser = ArticleTextParser(self.get())
article = MarkovChain(parser.strip())
workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
parser_args = {}

if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
exclude = lambda u: self._exclusions_db.check(self.site.name, u)
parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(
self.site.name)
else:
exclude = None

workspace = CopyvioWorkspace(
article, min_confidence, max_time, self._logger, self._addheaders,
short_circuit=short_circuit, parser_args=parser_args)

if article.size < 20: # Auto-fail very small articles
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))


+ 19
- 0
earwigbot/wiki/copyvios/exclusions.py Voir le fichier

@@ -196,3 +196,22 @@ class ExclusionsDB(object):
log = u"No exclusions in {0} for {1}".format(sitename, url)
self._logger.debug(log)
return False

def get_mirror_hints(self, sitename, try_mobile=True):
    """Return a list of strings that indicate the existence of a mirror.

    The source parser checks for the presence of these strings inside of
    certain HTML tag attributes (``"href"`` and ``"src"``).

    Each hint is the site's domain followed by its script path and one of
    the script names ``index.php``, ``load.php`` or ``api.php``. If
    *try_mobile* is true and the domain consists of exactly three
    dot-separated labels (``a.b.c``), hints for the corresponding ``.m.``
    domain (``a.m.b.c``) are appended after the regular ones.

    *sitename* is passed to ``self._sitesdb.get_site()`` to look up the
    site whose domain and script path are used.
    """
    site = self._sitesdb.get_site(sitename)
    base = site.domain + site._script_path
    roots = [base]
    scripts = ["index.php", "load.php", "api.php"]

    if try_mobile:
        # Both separators must be literal dots. An unescaped "." matches
        # any character, which would let a two-label domain such as
        # "example.org" match and produce a bogus "example.m.o.g" root.
        fragments = re.search(r"^(\w+)\.(\w+)\.(\w+)$", site.domain)
        if fragments:
            mobile = "{0}.m.{1}.{2}".format(*fragments.groups())
            roots.append(mobile + site._script_path)

    return [root + "/" + script for root in roots for script in scripts]

Chargement…
Annuler
Enregistrer