
Improve excluded URL list parsing

Branch: legacy-python2
Ben Kurtovic committed 4 years ago
commit 1cdc0a5a4c

2 changed files with 24 additions and 15 deletions:
  1. CHANGELOG (+1, -1)
  2. earwigbot/wiki/copyvios/exclusions.py (+23, -14)

CHANGELOG (+1, -1)

@@ -1,6 +1,6 @@
 v0.4 (unreleased):
 
-- ...
+- Copyvio detector: improved parsing of excluded URL lists.
 
 v0.3 (released March 24, 2019):


earwigbot/wiki/copyvios/exclusions.py (+23, -14)

@@ -36,13 +36,15 @@ DEFAULT_SOURCES = {
         "User:EranBot/Copyright/Blacklist"
     ],
     "enwiki": [
-        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
-        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
-        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
-        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
+        "Wikipedia:Mirrors and forks/ABC", "Wikipedia:Mirrors and forks/DEF",
+        "Wikipedia:Mirrors and forks/GHI", "Wikipedia:Mirrors and forks/JKL",
+        "Wikipedia:Mirrors and forks/MNO", "Wikipedia:Mirrors and forks/PQR",
+        "Wikipedia:Mirrors and forks/STU", "Wikipedia:Mirrors and forks/VWXYZ"
     ]
 }
 
+_RE_STRIP_PREFIX = r"^https?://(www\.)?"
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
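
For reference, a minimal sketch (not part of the commit) of what the new
_RE_STRIP_PREFIX constant does when applied with re.sub: it drops the URL
scheme and an optional "www." prefix so stored URLs compare consistently.
The sample URLs are invented.

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    print(re.sub(_RE_STRIP_PREFIX, "", "https://www.example.com/wiki/Page"))
    # -> example.com/wiki/Page
    print(re.sub(_RE_STRIP_PREFIX, "", "http://example.org/mirror"))
    # -> example.org/mirror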
@@ -87,10 +89,19 @@ class ExclusionsDB(object):
         """Load from a specific source and return a set of URLs."""
         urls = set()
         try:
-            data = site.get_page(source).get()
+            data = site.get_page(source, follow_redirects=True).get()
         except exceptions.PageNotFoundError:
             return urls
 
+        if source == "User:EarwigBot/Copyvios/Exclusions":
+            for line in data.splitlines():
+                match = re.match(r"^\s*url\s*=\s*(?:\<nowiki\>\s*)?(.+?)\s*(?:\</nowiki\>\s*)?$", line)
+                if match:
+                    url = re.sub(_RE_STRIP_PREFIX, "", match.group(1))
+                    if url:
+                        urls.add(url)
+            return urls
+
         if source == "User:EranBot/Copyright/Blacklist":
             for line in data.splitlines()[1:]:
                 line = re.sub(r"(#|==).*$", "", line).strip()
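
A rough illustration (not from the commit) of the new dedicated branch for
User:EarwigBot/Copyvios/Exclusions: each "url = ..." line is matched, an
optional <nowiki> wrapper is peeled off, and the scheme is stripped before
the URL is stored. The sample line is invented.

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"
    pattern = r"^\s*url\s*=\s*(?:\<nowiki\>\s*)?(.+?)\s*(?:\</nowiki\>\s*)?$"

    line = "url = <nowiki>http://www.example.com/mirror</nowiki>"
    match = re.match(pattern, line)
    if match:
        print(re.sub(_RE_STRIP_PREFIX, "", match.group(1)))
        # -> example.com/mirror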
@@ -98,14 +109,12 @@ class ExclusionsDB(object):
                 urls.add("re:" + line)
             return urls
 
-        regexes = [
-            r"url\s*=\s*(?:\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\</nowiki\>.*?)?\s*$",
-            r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
-        ]
-        for regex in regexes:
-            for url in re.findall(regex, data, re.I|re.M):
-                if url.strip():
-                    urls.add(url.lower().strip())
+        for line in data.splitlines():
+            if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line):
+                for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line):
+                    url = re.sub(_RE_STRIP_PREFIX, "", url)
+                    if url:
+                        urls.add(url)
         return urls
 
     def _update(self, sitename):
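
A small sketch (not from the commit) of how the replacement generic parser
behaves: instead of running one multiline regex per list format over the
whole page, it gates each line on a "url =" or "Site:" marker and extracts
every http(s) URL on that line up to the next delimiter. Both sample lines
are invented.

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    lines = [
        "| url = http://www.example.com/page and some trailing text",
        "* Site: [http://mirror.example.org/wiki Example mirror]",
    ]
    urls = set()
    for line in lines:
        if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line):
            for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line):
                url = re.sub(_RE_STRIP_PREFIX, "", url)
                if url:
                    urls.add(url)
    print(sorted(urls))
    # -> ['example.com/page', 'mirror.example.org/wiki']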
@@ -173,7 +182,7 @@ class ExclusionsDB(object):
 
         Return ``True`` if the URL is in the database, or ``False`` otherwise.
         """
-        normalized = re.sub(r"^https?://(www\.)?", "", url.lower())
+        normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower())
         query = """SELECT exclusion_url FROM exclusions
                    WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
         with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
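
A hypothetical helper (not in the codebase) showing the lookup-side
normalization above, which now reuses _RE_STRIP_PREFIX instead of
duplicating the pattern inline: the URL is lowercased, then the scheme and
optional "www." are stripped before the database query.

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    def normalize(url):
        # Same steps as the check above: lowercase first, then strip the
        # scheme prefix so the value matches what _load_source stored.
        return re.sub(_RE_STRIP_PREFIX, "", url.lower())

    print(normalize("HTTPS://WWW.Example.COM/Wiki/Page"))
    # -> example.com/wiki/page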

