A Python robot that edits Wikipedia and interacts with people over IRC: https://en.wikipedia.org/wiki/User:EarwigBot

# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time
from urlparse import urlparse

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

DEFAULT_SOURCES = {
    "all": [  # Applies to all, but located on enwiki
        "User:EarwigBot/Copyvios/Exclusions",
        "User:EranBot/Copyright/Blacklist"
    ],
    "enwiki": [
        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
    ]
}


class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        self._db_access_lock = Lock()

    def __repr__(self):
        """Return the canonical string representation of the ExclusionsDB."""
        res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
        return res.format(self._sitesdb, self._dbfile, self._logger)

    def __str__(self):
        """Return a nice string representation of the ExclusionsDB."""
        return "<ExclusionsDB at {0}>".format(self._dbfile)

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        sources = []
        for sitename, pages in DEFAULT_SOURCES.iteritems():
            for page in pages:
                sources.append((sitename, page))

        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)
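
    # After _create() runs, the sources table holds one (sitename, page) row
    # per entry in DEFAULT_SOURCES, e.g. ("enwiki", "Wikipedia:Mirrors and
    # forks/Abc"); the exclusions and updates tables start out empty and are
    # populated by _update() below.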

    def _load_source(self, site, source):
        """Load from a specific source and return a set of URLs."""
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls

        if source == "User:EranBot/Copyright/Blacklist":
            for line in data.splitlines()[1:]:
                line = re.sub(r"(#|==).*$", "", line).strip()
                if line:
                    urls.add("re:" + line)
            return urls

        regexes = [
            r"url\s*=\s*(?:\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\</nowiki\>.*?)?\s*$",
            r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$"
        ]
        for regex in regexes:
            for url in re.findall(regex, data, re.I|re.M):
                if url.strip():
                    urls.add(url.lower().strip())
        return urls
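
    # A worked example of the extraction above (the input lines are
    # hypothetical wikitext in the style of the mirror-listing pages, not
    # real entries):
    #
    #     "| url = http://Example.COM/wiki/"        -> "example.com/wiki/"
    #     "* Site: [http://example.com/w] a mirror" -> "example.com/w"
    #
    # Lines from EranBot's blacklist are instead treated as regular
    # expressions and stored with an "re:" prefix so check() knows to try
    # re.match() on them.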

    def _update(self, sitename):
        """Update the database from listed sources in the index."""
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?)"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?"
        query7 = "INSERT INTO updates VALUES (?, ?)"

        if sitename == "all":
            site = self._sitesdb.get_site("enwiki")
        else:
            site = self._sitesdb.get_site(sitename)

        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)
                else:
                    conn.execute(query3, (sitename, url))
            conn.executemany(query4, [(sitename, url) for url in urls])
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (int(time()), sitename))
            else:
                conn.execute(query7, (sitename, int(time())))
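
    # The loop above computes a set difference between the freshly scraped
    # URLs and what is already stored. A small worked example (values are
    # hypothetical): if the database holds {"a.com", "b.com"} and the sources
    # now yield {"b.com", "c.com"}, then "a.com" is deleted, "b.com" is left
    # untouched, and only "c.com" is inserted.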

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated."""
        query = "SELECT update_time FROM updates WHERE update_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                self._create()
                return 0
        return result[0] if result else 0
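
    # If the tables do not exist yet (a brand-new database file), the
    # OperationalError branch above bootstraps the schema via _create() and
    # reports the database as never updated, which forces a full refresh the
    # next time sync() runs.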

    def sync(self, sitename, force=False):
        """Update the database if it hasn't been updated in the past day.

        This updates the exclusions database for the site *sitename* and "all".
        """
        max_staleness = 60 * 60 * 24
        time_since_update = int(time() - self._get_last_update(sitename))
        if force or time_since_update > max_staleness:
            log = u"Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = u"Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))
        if sitename != "all":
            self.sync("all", force=force)
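
    # A typical call sequence (a hedged sketch; "exclusions_db" is just an
    # illustrative name for however the surrounding toolset wires up this
    # object):
    #
    #     exclusions_db.sync("enwiki")              # refresh if over a day old
    #     exclusions_db.sync("enwiki", force=True)  # refresh unconditionally
    #
    # Note that syncing any single site also syncs the shared "all" list.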

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        """
        normalized = re.sub(r"^https?://(www\.)?", "", url.lower())
        query = """SELECT exclusion_url FROM exclusions
                   WHERE exclusion_sitename = ? OR exclusion_sitename = ?"""
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for (excl,) in conn.execute(query, (sitename, "all")):
                if excl.startswith("*."):
                    # Wildcard domain: match against the URL's netloc, and
                    # against its path if the exclusion carries one.
                    parsed = urlparse(url.lower())
                    matches = excl[2:] in parsed.netloc
                    if matches and "/" in excl:
                        excl_path = excl[excl.index("/") + 1:]
                        matches = parsed.path.lstrip("/").startswith(excl_path)
                elif excl.startswith("re:"):
                    # Regular-expression entry, as stored by _load_source().
                    try:
                        matches = re.match(excl[3:], normalized)
                    except re.error:
                        continue
                else:
                    # Plain entry: a simple prefix match on the normalized URL.
                    matches = normalized.startswith(excl)
                if matches:
                    log = u"Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = u"No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False
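
# A usage sketch for check() (the entries and URLs are made up for
# illustration; real exclusions come from the wiki pages listed in
# DEFAULT_SOURCES):
#
#     exclusions_db.sync("enwiki")
#     exclusions_db.check("enwiki", "http://www.mirror-site.com/wiki/Foo")
#
# A stored plain entry "mirror-site.com" makes this return True; a stored
# "*.mirror-site.com" would also match any subdomain, and an entry like
# "re:^mirror-" is matched against the URL with its scheme and leading
# "www." stripped.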