A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot


# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

default_sources = {
    "enwiki": [
        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
    ]
}
class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`.exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        self._db_access_lock = Lock()

    def __repr__(self):
        """Return the canonical string representation of the ExclusionsDB."""
        res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
        return res.format(self._sitesdb, self._dbfile, self._logger)

    def __str__(self):
        """Return a nice string representation of the ExclusionsDB."""
        return "<ExclusionsDB at {0}>".format(self._dbfile)

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        sources = []
        for sitename, pages in default_sources.iteritems():
            for page in pages:
                sources.append((sitename, page))

        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source and return a set of URLs."""
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls

        # Extract URLs from the two forms used on the source pages: a
        # "url = <nowiki>...</nowiki>" template parameter and a "* Site: ..."
        # list entry, in both cases dropping any "http(s):" and "//" prefix.
        regexes = [
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
        ]
        for regex in regexes:
            for url in re.findall(regex, data, re.I):
                urls.add(url.lower())
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index."""
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?);"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
        query7 = "INSERT INTO updates VALUES (?, ?);"

        site = self._sitesdb.get_site(sitename)
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            # Collect the current set of excluded URLs from every source page.
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            # Diff against the stored exclusions: keep rows still listed,
            # delete rows that are not, and leave only new URLs in *urls*.
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)
                else:
                    conn.execute(query3, (sitename, url))
            conn.executemany(query4, [(sitename, url) for url in urls])
            # Record the update time, inserting a row on the first update.
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (time(), sitename))
            else:
                conn.execute(query7, (sitename, time()))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated."""
        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                self._create()
                return 0
        return result[0] if result else 0

    def sync(self, sitename):
        """Update the database if it hasn't been updated in the past month.

        This only updates the exclusions database for the *sitename* site.
        """
        max_staleness = 60 * 60 * 24 * 30
        time_since_update = int(time() - self._get_last_update(sitename))
        if time_since_update > max_staleness:
            log = u"Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = u"Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        """
        # Stored exclusions are lowercased with the scheme stripped, so
        # normalize the URL the same way and do a simple prefix match.
        normalized = re.sub("https?://", "", url.lower())
        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for row in conn.execute(query, (sitename,)):
                if normalized.startswith(row[0]):
                    log = u"Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = u"No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False
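
# Illustrative example (not part of the original module, continuing the sketch
# above): check() lowercases the URL and strips the scheme, and _load_source()
# stores exclusion URLs already lowercased and scheme-free, so a prefix test is
# enough. With a stored exclusion of "example.com/wiki/" for "enwiki",
#
#     exclusions.check("enwiki", "HTTPS://Example.com/wiki/Some_Page")
#
# normalizes the URL to "example.com/wiki/some_page", which starts with the
# stored entry, so the call returns True.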