
Improve excluded URL list parsing

tags/v0.4
Ben Kurtovic committed 5 years ago
parent
commit 1cdc0a5a4c
2 changed files with 24 additions and 15 deletions
  1. +1 -1   CHANGELOG
  2. +23 -14  earwigbot/wiki/copyvios/exclusions.py

+1 -1  CHANGELOG

@@ -1,6 +1,6 @@
 v0.4 (unreleased):
 
-- ...
+- Copyvio detector: improved parsing of excluded URL lists.
 
 v0.3 (released March 24, 2019):
 

+23 -14  earwigbot/wiki/copyvios/exclusions.py

@@ -36,13 +36,15 @@ DEFAULT_SOURCES = {
         "User:EranBot/Copyright/Blacklist"
     ],
     "enwiki": [
         "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
         "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
         "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
         "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
     ]
 }
 
+_RE_STRIP_PREFIX = r"^https?://(www\.)?"
+
 class ExclusionsDB(object):
     """
     **EarwigBot: Wiki Toolset: Exclusions Database Manager**
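The new _RE_STRIP_PREFIX constant factors out the scheme and "www." stripping that check() previously did with an inline pattern. A minimal sketch of the normalization it performs; the sample URLs below are illustrative, not taken from any exclusion list:

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    # The scheme and an optional leading "www." are dropped; the rest is untouched.
    for url in ("https://www.example.com/mirror", "http://example.com/mirror"):
        print(re.sub(_RE_STRIP_PREFIX, "", url))
    # both print: example.com/mirror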
@@ -87,10 +89,19 @@ class ExclusionsDB(object):
         """Load from a specific source and return a set of URLs."""
         urls = set()
         try:
-            data = site.get_page(source).get()
+            data = site.get_page(source, follow_redirects=True).get()
         except exceptions.PageNotFoundError:
             return urls
 
+        if source == "User:EarwigBot/Copyvios/Exclusions":
+            for line in data.splitlines():
+                match = re.match(r"^\s*url\s*=\s*(?:\<nowiki\>\s*)?(.+?)\s*(?:\</nowiki\>\s*)?$", line)
+                if match:
+                    url = re.sub(_RE_STRIP_PREFIX, "", match.group(1))
+                    if url:
+                        urls.add(url)
+            return urls
+
         if source == "User:EranBot/Copyright/Blacklist":
             for line in data.splitlines()[1:]:
                 line = re.sub(r"(#|==).*$", "", line).strip()
@@ -98,14 +109,12 @@ class ExclusionsDB(object):
                     urls.add("re:" + line)
             return urls
 
-        regexes = [
-            r"url\s*=\s*(?:\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\</nowiki\>.*?)?\s*$",
-            r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
-        ]
-        for regex in regexes:
-            for url in re.findall(regex, data, re.I|re.M):
-                if url.strip():
-                    urls.add(url.lower().strip())
+        for line in data.splitlines():
+            if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line):
+                for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line):
+                    url = re.sub(_RE_STRIP_PREFIX, "", url)
+                    if url:
+                        urls.add(url)
         return urls
 
     def _update(self, sitename):
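Instead of running two multi-line regexes over the whole page, the parser now scans line by line: only lines that look like a "|url =" parameter or a "* Site:" bullet are considered, every explicit http(s) URL on such a line is pulled out, and the shared prefix pattern normalizes it. A sketch on invented wikitext; the sample lines are illustrative assumptions:

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    # Hypothetical mirror-list lines; only the first two pass the line filter.
    sample = [
        "* Site: [http://www.example.com/ Example mirror]",
        "|url = https://example.org/fork <small>(dead)</small>",
        "Prose mentioning http://ignored.example.net/ is skipped entirely.",
    ]

    urls = set()
    for line in sample:
        if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line):
            for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line):
                url = re.sub(_RE_STRIP_PREFIX, "", url)
                if url:
                    urls.add(url)

    print(sorted(urls))  # ['example.com/', 'example.org/fork']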
@@ -173,7 +182,7 @@ class ExclusionsDB(object):
 
         Return ``True`` if the URL is in the database, or ``False`` otherwise.
         """
-        normalized = re.sub(r"^https?://(www\.)?", "", url.lower())
+        normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower())
         query = """SELECT exclusion_url FROM exclusions
                    WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
         with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
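Because check() now reuses _RE_STRIP_PREFIX, a URL being looked up is normalized the same way as the entries _load_source() stored, so scheme and "www." differences no longer cause misses. A tiny illustration with a hypothetical in-memory set standing in for the SQLite table:

    import re

    _RE_STRIP_PREFIX = r"^https?://(www\.)?"

    # Stand-in for the exclusions table: one stored, already-normalized entry.
    stored = {re.sub(_RE_STRIP_PREFIX, "", "http://www.example.com/mirror")}

    # An incoming URL is lowercased and stripped the same way before the lookup.
    incoming = re.sub(_RE_STRIP_PREFIX, "", "HTTPS://example.com/mirror".lower())

    print(incoming in stored)  # True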

