From 108eca13ac8cb7715a5456d01be423272d9d2bec Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 29 Sep 2015 19:44:32 -0500
Subject: [PATCH] Finish mirror hinting algorithm.

---
 earwigbot/wiki/copyvios/__init__.py   | 11 ++++++++---
 earwigbot/wiki/copyvios/exclusions.py | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 23e1d2a..dbe8efa 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -116,15 +116,20 @@ class CopyvioMixIn(object):
         searcher = self._get_search_engine()
         parser = ArticleTextParser(self.get())
         article = MarkovChain(parser.strip())
-        workspace = CopyvioWorkspace(
-            article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]})
+        parser_args = {}
+
         if self._exclusions_db:
             self._exclusions_db.sync(self.site.name)
             exclude = lambda u: self._exclusions_db.check(self.site.name, u)
+            parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(
+                self.site.name)
         else:
             exclude = None
 
+        workspace = CopyvioWorkspace(
+            article, min_confidence, max_time, self._logger, self._addheaders,
+            short_circuit=short_circuit, parser_args=parser_args)
+
         if article.size < 20:  # Auto-fail very small articles
             result = workspace.get_result()
             self._logger.info(result.get_log_message(self.title))
diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index 3c88011..59dc124 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -196,3 +196,30 @@ class ExclusionsDB(object):
         log = u"No exclusions in {0} for {1}".format(sitename, url)
         self._logger.debug(log)
         return False
+
+    def get_mirror_hints(self, sitename, try_mobile=True):
+        """Return a list of strings that indicate the existence of a mirror.
+
+        The source parser checks for the presence of these strings inside of
+        certain HTML tag attributes (``"href"`` and ``"src"``).
+
+        Each hint is the site's domain plus script path, a slash, and one of
+        ``index.php``, ``load.php``, or ``api.php``. If *try_mobile* is true
+        and the domain consists of exactly three ``\w+`` labels (``a.b.c``),
+        the same hints are also generated for the mobile domain ``a.m.b.c``.
+        """
+        site = self._sitesdb.get_site(sitename)
+        base = site.domain + site._script_path
+        roots = [base]
+        scripts = ["index.php", "load.php", "api.php"]
+
+        if try_mobile:
+            # Both dots must be escaped: a bare "." matches any character, so
+            # a two-part domain like "example.org" would otherwise match and
+            # produce a bogus mobile root.
+            fragments = re.search(r"^([\w]+)\.([\w]+)\.([\w]+)$", site.domain)
+            if fragments:
+                mobile = "{0}.m.{1}.{2}".format(*fragments.groups())
+                roots.append(mobile + site._script_path)
+
+        return [root + "/" + script for root in roots for script in scripts]