From 1cdc0a5a4c2d7ee33aedddbc8f57ac3e98ea9e28 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic <ben.kurtovic@gmail.com> Date: Thu, 20 Jun 2019 23:32:04 -0400 Subject: [PATCH] Improve excluded URL list parsing --- CHANGELOG | 2 +- earwigbot/wiki/copyvios/exclusions.py | 37 ++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 349c9aa..a6cdfb0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ v0.4 (unreleased): -- ... +- Copyvio detector: improved parsing of excluded URL lists. v0.3 (released March 24, 2019): diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3e86def..c566107 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -36,13 +36,15 @@ DEFAULT_SOURCES = { "User:EranBot/Copyright/Blacklist" ], "enwiki": [ - "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", - "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", - "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", - "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" + "Wikipedia:Mirrors and forks/ABC", "Wikipedia:Mirrors and forks/DEF", + "Wikipedia:Mirrors and forks/GHI", "Wikipedia:Mirrors and forks/JKL", + "Wikipedia:Mirrors and forks/MNO", "Wikipedia:Mirrors and forks/PQR", + "Wikipedia:Mirrors and forks/STU", "Wikipedia:Mirrors and forks/VWXYZ" ] } +_RE_STRIP_PREFIX = r"^https?://(www\.)?" 
+ class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions Database Manager** @@ -87,10 +89,19 @@ class ExclusionsDB(object): """Load from a specific source and return a set of URLs.""" urls = set() try: - data = site.get_page(source).get() + data = site.get_page(source, follow_redirects=True).get() except exceptions.PageNotFoundError: return urls + if source == "User:EarwigBot/Copyvios/Exclusions": + for line in data.splitlines(): + match = re.match(r"^\s*url\s*=\s*(?:\<nowiki\>\s*)?(.+?)\s*(?:\</nowiki\>\s*)?$", line) + if match: + url = re.sub(_RE_STRIP_PREFIX, "", match.group(1)) + if url: + urls.add(url) + return urls + if source == "User:EranBot/Copyright/Blacklist": for line in data.splitlines()[1:]: line = re.sub(r"(#|==).*$", "", line).strip() @@ -98,14 +109,12 @@ class ExclusionsDB(object): urls.add("re:" + line) return urls - regexes = [ - r"url\s*=\s*(?:\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\</nowiki\>.*?)?\s*$", - r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$" - ] - for regex in regexes: - for url in re.findall(regex, data, re.I|re.M): - if url.strip(): - urls.add(url.lower().strip()) + for line in data.splitlines(): + if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line): + for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line): + url = re.sub(_RE_STRIP_PREFIX, "", url) + if url: + urls.add(url) return urls def _update(self, sitename): @@ -173,7 +182,7 @@ class ExclusionsDB(object): Return ``True`` if the URL is in the database, or ``False`` otherwise. """ - normalized = re.sub(r"^https?://(www\.)?", "", url.lower()) + normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower()) query = """SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ? OR exclusion_sitename = ?""" with self._db_access_lock, sqlite.connect(self._dbfile) as conn: