A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

439 lines
20 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from collections import OrderedDict
  23. from cookielib import LWPCookieJar, LoadError
  24. import errno
  25. from os import chmod, path
  26. from platform import python_version
  27. import stat
  28. import sqlite3 as sqlite
  29. from earwigbot import __version__
  30. from earwigbot.exceptions import SiteNotFoundError
  31. from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
  32. from earwigbot.wiki.site import Site
  33. __all__ = ["SitesDB"]
  34. class SitesDB(object):
  35. """
  36. **EarwigBot: Wiki Toolset: Sites Database Manager**
  37. This class controls the :file:`sites.db` file, which stores information
  38. about all wiki sites known to the bot. Three public methods act as bridges
  39. between the bot's config files and :py:class:`~earwigbot.wiki.site.Site`
  40. objects:
  41. - :py:meth:`get_site`: returns a Site object corresponding to a site
  42. - :py:meth:`add_site`: stores a site in the database
  43. - :py:meth:`remove_site`: removes a site from the database
  44. There's usually no need to use this class directly. All public methods
  45. here are available as :py:meth:`bot.wiki.get_site`,
  46. :py:meth:`bot.wiki.add_site`, and :py:meth:`bot.wiki.remove_site`, which
  47. use a :file:`sites.db` file located in the same directory as our
  48. :file:`config.yml` file. Lower-level access can be achieved by importing
  49. the manager class (``from earwigbot.wiki import SitesDB``).
  50. """
  51. def __init__(self, bot):
  52. """Set up the manager with an attribute for the base Bot object."""
  53. self.config = bot.config
  54. self._logger = bot.logger.getChild("wiki")
  55. self._sites = {} # Internal site cache
  56. self._sitesdb = path.join(bot.config.root_dir, "sites.db")
  57. self._cookie_file = path.join(bot.config.root_dir, ".cookies")
  58. self._cookiejar = None
  59. excl_db = path.join(bot.config.root_dir, "exclusions.db")
  60. excl_logger = self._logger.getChild("exclusionsdb")
  61. self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)
  62. def __repr__(self):
  63. """Return the canonical string representation of the SitesDB."""
  64. res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
  65. return res.format(self.config, self._sitesdb, self._cookie_file)
  66. def __str__(self):
  67. """Return a nice string representation of the SitesDB."""
  68. return "<SitesDB at {0}>".format(self._sitesdb)
  69. def _get_cookiejar(self):
  70. """Return a LWPCookieJar object loaded from our .cookies file.
  71. The same .cookies file is returned every time, located in the project
  72. root, same directory as config.yml and bot.py. If it doesn't exist, we
  73. will create the file and set it to be readable and writeable only by
  74. us. If it exists but the information inside is bogus, we'll ignore it.
  75. This is normally called by _make_site_object() (in turn called by
  76. get_site()), and the cookiejar is passed to our Site's constructor,
  77. used when it makes API queries. This way, we can easily preserve
  78. cookies between sites (e.g., for CentralAuth), making logins easier.
  79. """
  80. if self._cookiejar:
  81. return self._cookiejar
  82. self._cookiejar = LWPCookieJar(self._cookie_file)
  83. try:
  84. self._cookiejar.load()
  85. except LoadError:
  86. pass # File contains bad data, so ignore it completely
  87. except IOError as e:
  88. if e.errno == errno.ENOENT: # "No such file or directory"
  89. # Create the file and restrict reading/writing only to the
  90. # owner, so others can't peak at our cookies:
  91. open(self._cookie_file, "w").close()
  92. chmod(self._cookie_file, stat.S_IRUSR|stat.S_IWUSR)
  93. else:
  94. raise
  95. return self._cookiejar
  96. def _create_sitesdb(self):
  97. """Initialize the sitesdb file with its three necessary tables."""
  98. script = """
  99. CREATE TABLE sites (site_name, site_project, site_lang, site_base_url,
  100. site_article_path, site_script_path);
  101. CREATE TABLE sql_data (sql_site, sql_data_key, sql_data_value);
  102. CREATE TABLE namespaces (ns_site, ns_id, ns_name, ns_is_primary_name);
  103. """
  104. with sqlite.connect(self._sitesdb) as conn:
  105. conn.executescript(script)
  106. def _get_site_object(self, name):
  107. """Return the site from our cache, or create it if it doesn't exist.
  108. This is essentially just a wrapper around _make_site_object that
  109. returns the same object each time a specific site is asked for.
  110. """
  111. try:
  112. return self._sites[name]
  113. except KeyError:
  114. site = self._make_site_object(name)
  115. self._sites[name] = site
  116. return site
  117. def _load_site_from_sitesdb(self, name):
  118. """Return all information stored in the sitesdb relating to given site.
  119. The information will be returned as a tuple, containing the site's
  120. name, project, language, base URL, article path, script path, SQL
  121. connection data, and namespaces, in that order. If the site is not
  122. found in the database, SiteNotFoundError will be raised. An empty
  123. database will be created before the exception is raised if none exists.
  124. """
  125. query1 = "SELECT * FROM sites WHERE site_name = ?"
  126. query2 = "SELECT sql_data_key, sql_data_value FROM sql_data WHERE sql_site = ?"
  127. query3 = "SELECT ns_id, ns_name, ns_is_primary_name FROM namespaces WHERE ns_site = ?"
  128. error = "Site '{0}' not found in the sitesdb.".format(name)
  129. with sqlite.connect(self._sitesdb) as conn:
  130. try:
  131. site_data = conn.execute(query1, (name,)).fetchone()
  132. except sqlite.OperationalError:
  133. self._create_sitesdb()
  134. raise SiteNotFoundError(error)
  135. if not site_data:
  136. raise SiteNotFoundError(error)
  137. sql_data = conn.execute(query2, (name,)).fetchall()
  138. ns_data = conn.execute(query3, (name,)).fetchall()
  139. name, project, lang, base_url, article_path, script_path = site_data
  140. sql = dict(sql_data)
  141. namespaces = {}
  142. for ns_id, ns_name, ns_is_primary_name in ns_data:
  143. try:
  144. if ns_is_primary_name: # "Primary" name goes first in list
  145. namespaces[ns_id].insert(0, ns_name)
  146. else: # Ordering of the aliases doesn't matter
  147. namespaces[ns_id].append(ns_name)
  148. except KeyError:
  149. namespaces[ns_id] = [ns_name]
  150. return (name, project, lang, base_url, article_path, script_path, sql,
  151. namespaces)
  152. def _make_site_object(self, name):
  153. """Return a Site object associated with the site *name* in our sitesdb.
  154. This calls _load_site_from_sitesdb(), so SiteNotFoundError will be
  155. raised if the site is not in our sitesdb.
  156. """
  157. cookiejar = self._get_cookiejar()
  158. (name, project, lang, base_url, article_path, script_path, sql,
  159. namespaces) = self._load_site_from_sitesdb(name)
  160. config = self.config
  161. login = (config.wiki.get("username"), config.wiki.get("password"))
  162. user_agent = config.wiki.get("userAgent")
  163. use_https = config.wiki.get("useHTTPS", False)
  164. assert_edit = config.wiki.get("assert")
  165. maxlag = config.wiki.get("maxlag")
  166. wait_between_queries = config.wiki.get("waitTime", 2)
  167. logger = self._logger.getChild(name)
  168. search_config = config.wiki.get("search", OrderedDict()).copy()
  169. if user_agent:
  170. user_agent = user_agent.replace("$1", __version__)
  171. user_agent = user_agent.replace("$2", python_version())
  172. if search_config:
  173. nltk_dir = path.join(self.config.root_dir, ".nltk")
  174. search_config["nltk_dir"] = nltk_dir
  175. search_config["exclusions_db"] = self._exclusions_db
  176. if not sql:
  177. sql = config.wiki.get("sql", OrderedDict()).copy()
  178. for key, value in sql.iteritems():
  179. if isinstance(value, basestring) and "$1" in value:
  180. sql[key] = value.replace("$1", name)
  181. return Site(name=name, project=project, lang=lang, base_url=base_url,
  182. article_path=article_path, script_path=script_path,
  183. sql=sql, namespaces=namespaces, login=login,
  184. cookiejar=cookiejar, user_agent=user_agent,
  185. use_https=use_https, assert_edit=assert_edit,
  186. maxlag=maxlag, wait_between_queries=wait_between_queries,
  187. logger=logger, search_config=search_config)
  188. def _get_site_name_from_sitesdb(self, project, lang):
  189. """Return the name of the first site with the given project and lang.
  190. If we can't find the site with the given information, we'll also try
  191. searching for a site whose base_url contains "{lang}.{project}". There
  192. are a few sites, like the French Wikipedia, that set their project to
  193. something other than the expected "wikipedia" ("wikipédia" in this
  194. case), but we should correctly find them when doing get_site(lang="fr",
  195. project="wikipedia").
  196. If the site is not found, return None. An empty sitesdb will be created
  197. if none exists.
  198. """
  199. query1 = "SELECT site_name FROM sites WHERE site_project = ? and site_lang = ?"
  200. query2 = "SELECT site_name FROM sites WHERE site_base_url LIKE ?"
  201. with sqlite.connect(self._sitesdb) as conn:
  202. try:
  203. site = conn.execute(query1, (project, lang)).fetchone()
  204. if site:
  205. return site[0]
  206. else:
  207. url = "%{0}.{1}%".format(lang, project)
  208. site = conn.execute(query2, (url,)).fetchone()
  209. return site[0] if site else None
  210. except sqlite.OperationalError:
  211. self._create_sitesdb()
  212. def _add_site_to_sitesdb(self, site):
  213. """Extract relevant info from a Site object and add it to the sitesdb.
  214. Works like a reverse _load_site_from_sitesdb(); the site's project,
  215. language, base URL, article path, script path, SQL connection data, and
  216. namespaces are extracted from the site and inserted into the sites
  217. database. If the sitesdb doesn't exist, we'll create it first.
  218. """
  219. name = site.name
  220. sites_data = (name, site.project, site.lang, site._base_url,
  221. site._article_path, site._script_path)
  222. sql_data = [(name, key, val) for key, val in site._sql_data.iteritems()]
  223. ns_data = []
  224. for ns_id, ns_names in site._namespaces.iteritems():
  225. ns_data.append((name, ns_id, ns_names.pop(0), True))
  226. for ns_name in ns_names:
  227. ns_data.append((name, ns_id, ns_name, False))
  228. with sqlite.connect(self._sitesdb) as conn:
  229. check_exists = "SELECT 1 FROM sites WHERE site_name = ?"
  230. try:
  231. exists = conn.execute(check_exists, (name,)).fetchone()
  232. except sqlite.OperationalError:
  233. self._create_sitesdb()
  234. else:
  235. if exists:
  236. conn.execute("DELETE FROM sites WHERE site_name = ?", (name,))
  237. conn.execute("DELETE FROM sql_data WHERE sql_site = ?", (name,))
  238. conn.execute("DELETE FROM namespaces WHERE ns_site = ?", (name,))
  239. conn.execute("INSERT INTO sites VALUES (?, ?, ?, ?, ?, ?)", sites_data)
  240. conn.executemany("INSERT INTO sql_data VALUES (?, ?, ?)", sql_data)
  241. conn.executemany("INSERT INTO namespaces VALUES (?, ?, ?, ?)", ns_data)
  242. def _remove_site_from_sitesdb(self, name):
  243. """Remove a site by name from the sitesdb and the internal cache."""
  244. try:
  245. del self._sites[name]
  246. except KeyError:
  247. pass
  248. with sqlite.connect(self._sitesdb) as conn:
  249. cursor = conn.execute("DELETE FROM sites WHERE site_name = ?", (name,))
  250. if cursor.rowcount == 0:
  251. return False
  252. else:
  253. conn.execute("DELETE FROM sql_data WHERE sql_site = ?", (name,))
  254. conn.execute("DELETE FROM namespaces WHERE ns_site = ?", (name,))
  255. self._logger.info("Removed site '{0}'".format(name))
  256. return True
  257. def get_site(self, name=None, project=None, lang=None):
  258. """Return a Site instance based on information from the sitesdb.
  259. With no arguments, return the default site as specified by our config
  260. file. This is ``config.wiki["defaultSite"]``.
  261. With *name* specified, return the site with that name. This is
  262. equivalent to the site's ``wikiid`` in the API, like *enwiki*.
  263. With *project* and *lang* specified, return the site whose project and
  264. language match these values. If there are multiple sites with the same
  265. values (unlikely), this is not a reliable way of loading a site. Call
  266. the function with an explicit *name* in that case.
  267. We will attempt to login to the site automatically using
  268. ``config.wiki["username"]`` and ``config.wiki["password"]`` if both are
  269. defined.
  270. Specifying a project without a lang or a lang without a project will
  271. raise :py:exc:`TypeError`. If all three args are specified, *name* will
  272. be first tried, then *project* and *lang* if *name* doesn't work. If a
  273. site cannot be found in the sitesdb,
  274. :py:exc:`~earwigbot.exceptions.SiteNotFoundError` will be raised. An
  275. empty sitesdb will be created if none is found.
  276. """
  277. # Someone specified a project without a lang, or vice versa:
  278. if (project and not lang) or (not project and lang):
  279. e = "Keyword arguments 'lang' and 'project' must be specified together."
  280. raise TypeError(e)
  281. # No args given, so return our default site:
  282. if not name and not project and not lang:
  283. try:
  284. default = self.config.wiki["defaultSite"]
  285. except KeyError:
  286. e = "Default site is not specified in config."
  287. raise SiteNotFoundError(e)
  288. return self._get_site_object(default)
  289. # Name arg given, but don't look at others unless `name` isn't found:
  290. if name:
  291. try:
  292. return self._get_site_object(name)
  293. except SiteNotFoundError:
  294. if project and lang:
  295. name = self._get_site_name_from_sitesdb(project, lang)
  296. if name:
  297. return self._get_site_object(name)
  298. raise
  299. # If we end up here, then project and lang are the only args given:
  300. name = self._get_site_name_from_sitesdb(project, lang)
  301. if name:
  302. return self._get_site_object(name)
  303. e = "Site '{0}:{1}' not found in the sitesdb.".format(project, lang)
  304. raise SiteNotFoundError(e)
  305. def add_site(self, project=None, lang=None, base_url=None,
  306. script_path="/w", sql=None):
  307. """Add a site to the sitesdb so it can be retrieved with get_site().
  308. If only a project and a lang are given, we'll guess the *base_url* as
  309. ``"//{lang}.{project}.org"`` (which is protocol-relative, becoming
  310. ``"https"`` if *useHTTPS* is ``True`` in config otherwise ``"http"``).
  311. If this is wrong, provide the correct *base_url* as an argument (in
  312. which case project and lang are ignored). Most wikis use ``"/w"`` as
  313. the script path (meaning the API is located at
  314. ``"{base_url}{script_path}/api.php"`` ->
  315. ``"//{lang}.{project}.org/w/api.php"``), so this is the default. If
  316. your wiki is different, provide the script_path as an argument. SQL
  317. connection settings are guessed automatically using config's template
  318. value. If this is wrong or not specified, provide a dict of kwargs as
  319. *sql* and Site will pass it to :py:func:`oursql.connect(**sql)
  320. <oursql.connect>`, allowing you to make queries with
  321. :py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
  322. Returns ``True`` if the site was added successfully or ``False`` if the
  323. site is already in our sitesdb (this can be done purposefully to update
  324. old site info). Raises :py:exc:`~earwigbot.exception.SiteNotFoundError`
  325. if not enough information has been provided to identify the site (e.g.
  326. a *project* but not a *lang*).
  327. """
  328. if not base_url:
  329. if not project or not lang:
  330. e = "Without a base_url, both a project and a lang must be given."
  331. raise SiteNotFoundError(e)
  332. base_url = "//{0}.{1}.org".format(lang, project)
  333. cookiejar = self._get_cookiejar()
  334. config = self.config
  335. login = (config.wiki.get("username"), config.wiki.get("password"))
  336. user_agent = config.wiki.get("userAgent")
  337. use_https = config.wiki.get("useHTTPS", True)
  338. assert_edit = config.wiki.get("assert")
  339. maxlag = config.wiki.get("maxlag")
  340. wait_between_queries = config.wiki.get("waitTime", 2)
  341. if user_agent:
  342. user_agent = user_agent.replace("$1", __version__)
  343. user_agent = user_agent.replace("$2", python_version())
  344. # Create a Site object to log in and load the other attributes:
  345. site = Site(base_url=base_url, script_path=script_path, sql=sql,
  346. login=login, cookiejar=cookiejar, user_agent=user_agent,
  347. use_https=use_https, assert_edit=assert_edit,
  348. maxlag=maxlag, wait_between_queries=wait_between_queries)
  349. self._logger.info("Added site '{0}'".format(site.name))
  350. self._add_site_to_sitesdb(site)
  351. return self._get_site_object(site.name)
  352. def remove_site(self, name=None, project=None, lang=None):
  353. """Remove a site from the sitesdb.
  354. Returns ``True`` if the site was removed successfully or ``False`` if
  355. the site was not in our sitesdb originally. If all three args (*name*,
  356. *project*, and *lang*) are given, we'll first try *name* and then try
  357. the latter two if *name* wasn't found in the database. Raises
  358. :py:exc:`TypeError` if a project was given but not a language, or vice
  359. versa. Will create an empty sitesdb if none was found.
  360. """
  361. # Someone specified a project without a lang, or vice versa:
  362. if (project and not lang) or (not project and lang):
  363. e = "Keyword arguments 'lang' and 'project' must be specified together."
  364. raise TypeError(e)
  365. if name:
  366. was_removed = self._remove_site_from_sitesdb(name)
  367. if not was_removed:
  368. if project and lang:
  369. name = self._get_site_name_from_sitesdb(project, lang)
  370. if name:
  371. return self._remove_site_from_sitesdb(name)
  372. return was_removed
  373. if project and lang:
  374. name = self._get_site_name_from_sitesdb(project, lang)
  375. if name:
  376. return self._remove_site_from_sitesdb(name)
  377. return False