A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot

# Copyright (C) 2009-2016 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time
from urllib.parse import urlparse

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

DEFAULT_SOURCES = {
    "all": [  # Applies to all, but located on enwiki
        "User:EarwigBot/Copyvios/Exclusions",
        "User:EranBot/Copyright/Blacklist",
    ],
    "enwiki": [
        "Wikipedia:Mirrors and forks/ABC",
        "Wikipedia:Mirrors and forks/DEF",
        "Wikipedia:Mirrors and forks/GHI",
        "Wikipedia:Mirrors and forks/JKL",
        "Wikipedia:Mirrors and forks/MNO",
        "Wikipedia:Mirrors and forks/PQR",
        "Wikipedia:Mirrors and forks/STU",
        "Wikipedia:Mirrors and forks/VWXYZ",
    ],
}

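# Strip the scheme and an optional leading "www." so stored exclusions and
# checked URLs compare on the same normalized form.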
_RE_STRIP_PREFIX = r"^https?://(www\.)?"


class ExclusionsDB:
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        self._db_access_lock = Lock()

    def __repr__(self):
        """Return the canonical string representation of the ExclusionsDB."""
        res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
        return res.format(self._sitesdb, self._dbfile, self._logger)

    def __str__(self):
        """Return a nice string representation of the ExclusionsDB."""
        return f"<ExclusionsDB at {self._dbfile}>"

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        sources = []
        for sitename, pages in DEFAULT_SOURCES.items():
            for page in pages:
                sources.append((sitename, page))

        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source and return a set of URLs."""
        urls = set()
        try:
            data = site.get_page(source, follow_redirects=True).get()
        except exceptions.PageNotFoundError:
            return urls

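        # Each source page uses its own format: EarwigBot's exclusion list
        # holds "url = ..." template parameters, EranBot's blacklist is one
        # regular expression per line, and the "Mirrors and forks" pages list
        # bare URLs after "url =" or "Site:" markers.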
        if source == "User:EarwigBot/Copyvios/Exclusions":
            for line in data.splitlines():
                match = re.match(
                    r"^\s*url\s*=\s*(?:\<nowiki\>\s*)?(.+?)\s*(?:\</nowiki\>\s*)?(?:#.*?)?$",
                    line,
                )
                if match:
                    url = re.sub(_RE_STRIP_PREFIX, "", match.group(1))
                    if url:
                        urls.add(url)
            return urls

        if source == "User:EranBot/Copyright/Blacklist":
            for line in data.splitlines()[1:]:
                line = re.sub(r"(#|==).*$", "", line).strip()
                if line:
                    urls.add("re:" + line)
            return urls

        for line in data.splitlines():
            if re.match(r"^(\s*\|?\s*url\s*=)|(\*?\s*Site:)", line):
                for url in re.findall(r"(https?://.+?)(?:[ [\]<>{}()]|$)", line):
                    url = re.sub(_RE_STRIP_PREFIX, "", url)
                    if url:
                        urls.add(url)
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index."""
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = (
            "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        )
        query4 = "INSERT INTO exclusions VALUES (?, ?)"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?"
        query7 = "INSERT INTO updates VALUES (?, ?)"

        if sitename == "all":
            site = self._sitesdb.get_site("enwiki")
        else:
            site = self._sitesdb.get_site(sitename)

        with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
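            # Reconcile with what is already stored: anything both on-wiki and
            # in the table is dropped from the pending set, anything only in
            # the table is deleted, and whatever remains is newly inserted.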
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)
                else:
                    conn.execute(query3, (sitename, url))
            conn.executemany(query4, [(sitename, url) for url in urls])

            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (int(time()), sitename))
            else:
                conn.execute(query7, (sitename, int(time())))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated."""
        query = "SELECT update_time FROM updates WHERE update_sitename = ?"
        with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
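                # The updates table doesn't exist yet, so the database has
                # never been built; create the schema and report "never
                # updated" so the caller refreshes it.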
                self._create()
                return 0
        return result[0] if result else 0

    def sync(self, sitename, force=False):
        """Update the database if it hasn't been updated recently.

        This updates the exclusions database for the site *sitename* and "all".

        Site-specific lists are considered stale after 48 hours; global lists
        after 12 hours.
        """
        max_staleness = 60 * 60 * (12 if sitename == "all" else 48)
        time_since_update = int(time() - self._get_last_update(sitename))
        if force or time_since_update > max_staleness:
            log = "Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

        if sitename != "all":
            self.sync("all", force=force)

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        """
        normalized = re.sub(_RE_STRIP_PREFIX, "", url.lower())
        query = """SELECT exclusion_url FROM exclusions
                   WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
        with self._db_access_lock, sqlite.connect(self._dbfile) as conn:
            for (excl,) in conn.execute(query, (sitename, "all")):
                excl = excl.lower()
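                # Entries come in three forms: "*.domain" matches against the
                # URL's host (optionally with a path), "re:..." is treated as a
                # regular expression against the normalized URL, and anything
                # else is a plain prefix match.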
                if excl.startswith("*."):
                    parsed = urlparse(url.lower())
                    matches = excl[2:] in parsed.netloc
                    if matches and "/" in excl:
                        excl_path = excl[excl.index("/") + 1]
                        matches = excl_path.startswith(parsed.path)
                elif excl.startswith("re:"):
                    try:
                        matches = re.match(excl[3:], normalized)
                    except re.error:
                        continue
                else:
                    matches = normalized.startswith(excl)

                if matches:
                    log = "Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = f"No exclusions in {sitename} for {url}"
        self._logger.debug(log)
        return False

    def get_mirror_hints(self, page, try_mobile=True):
        """Return a list of strings that indicate the existence of a mirror.

        The source parser checks for the presence of these strings inside of
        certain HTML tag attributes (``"href"`` and ``"src"``).
        """
        site = page.site
        path = urlparse(page.url).path
        roots = [site.domain]
        scripts = ["index.php", "load.php", "api.php"]

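        # Wikimedia wikis also serve a mobile host of the form
        # <lang>.m.<project>.<tld> (e.g. en.m.wikipedia.org), so include it as
        # an extra root that a mirror might link against.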
        if try_mobile:
            fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain)
            if fragments:
                roots.append("{}.m.{}.{}".format(*fragments.groups()))

        general = [
            root + site._script_path + "/" + script
            for root in roots
            for script in scripts
        ]
        specific = [root + path for root in roots]
        return general + specific
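

# A minimal usage sketch (hypothetical names; in practice the bot constructs
# this object itself, passing its own sites database, DB path, and logger):
#
#     exclusions = ExclusionsDB(sitesdb, "exclusions.db", logger)
#     exclusions.sync("enwiki")  # refresh the enwiki and "all" lists if stale
#     exclusions.check("enwiki", "https://www.example.com/wiki/Foo")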