A Python robot that edits Wikipedia and interacts with people over IRC https://en.wikipedia.org/wiki/User:EarwigBot
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

176 lines
6.8 KiB

  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22. from threading import Event
  23. from time import time
  24. from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
  25. __all__ = ["CopyvioSource", "CopyvioCheckResult"]
  26. class CopyvioSource:
  27. """
  28. **EarwigBot: Wiki Toolset: Copyvio Source**
  29. A class that represents a single possible source of a copyright violation,
  30. i.e., a URL.
  31. *Attributes:*
  32. - :py:attr:`url`: the URL of the source
  33. - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
  34. - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
  35. - :py:attr:`skipped`: whether this URL was skipped during the check
  36. - :py:attr:`excluded`: whether this URL was in the exclusions list
  37. """
  38. def __init__(self, workspace, url, headers=None, timeout=5,
  39. parser_args=None, search_config=None):
  40. self.workspace = workspace
  41. self.url = url
  42. self.headers = headers
  43. self.timeout = timeout
  44. self.parser_args = parser_args
  45. self.search_config = search_config
  46. self.confidence = 0.0
  47. self.chains = (EMPTY, EMPTY_INTERSECTION)
  48. self.skipped = False
  49. self.excluded = False
  50. self._event1 = Event()
  51. self._event2 = Event()
  52. self._event2.set()
  53. def __repr__(self):
  54. """Return the canonical string representation of the source."""
  55. res = ("CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
  56. "excluded={3!r})")
  57. return res.format(
  58. self.url, self.confidence, self.skipped, self.excluded)
  59. def __str__(self):
  60. """Return a nice string representation of the source."""
  61. if self.excluded:
  62. return "<CopyvioSource ({0}, excluded)>".format(self.url)
  63. if self.skipped:
  64. return "<CopyvioSource ({0}, skipped)>".format(self.url)
  65. res = "<CopyvioSource ({0} with {1} conf)>"
  66. return res.format(self.url, self.confidence)
  67. def start_work(self):
  68. """Mark this source as being worked on right now."""
  69. self._event2.clear()
  70. self._event1.set()
  71. def update(self, confidence, source_chain, delta_chain):
  72. """Fill out the confidence and chain information inside this source."""
  73. self.confidence = confidence
  74. self.chains = (source_chain, delta_chain)
  75. def finish_work(self):
  76. """Mark this source as finished."""
  77. self._event2.set()
  78. def skip(self):
  79. """Deactivate this source without filling in the relevant data."""
  80. if self._event1.is_set():
  81. return
  82. self.skipped = True
  83. self._event1.set()
  84. def join(self, until):
  85. """Block until this violation result is filled out."""
  86. for event in [self._event1, self._event2]:
  87. if until:
  88. timeout = until - time()
  89. if timeout <= 0:
  90. return
  91. event.wait(timeout)
  92. else:
  93. event.wait()
  94. class CopyvioCheckResult:
  95. """
  96. **EarwigBot: Wiki Toolset: Copyvio Check Result**
  97. A class holding information about the results of a copyvio check.
  98. *Attributes:*
  99. - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
  100. - :py:attr:`sources`: a list of CopyvioSources, sorted by confidence
  101. - :py:attr:`best`: the best matching CopyvioSource, or ``None``
  102. - :py:attr:`confidence`: the best matching source's confidence, or 0
  103. - :py:attr:`url`: the best matching source's URL, or ``None``
  104. - :py:attr:`queries`: the number of queries used to reach a result
  105. - :py:attr:`time`: the amount of time the check took to complete
  106. - :py:attr:`article_chain`: the MarkovChain of the article text
  107. - :py:attr:`possible_miss`: whether some URLs might have been missed
  108. """
  109. def __init__(self, violation, sources, queries, check_time, article_chain,
  110. possible_miss):
  111. self.violation = violation
  112. self.sources = sources
  113. self.queries = queries
  114. self.time = check_time
  115. self.article_chain = article_chain
  116. self.possible_miss = possible_miss
  117. def __repr__(self):
  118. """Return the canonical string representation of the result."""
  119. res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
  120. return res.format(self.violation, self.sources, self.queries,
  121. self.time)
  122. def __str__(self):
  123. """Return a nice string representation of the result."""
  124. res = "<CopyvioCheckResult ({0} with best {1})>"
  125. return res.format(self.violation, self.best)
  126. @property
  127. def best(self):
  128. """The best known source, or None if no sources exist."""
  129. return self.sources[0] if self.sources else None
  130. @property
  131. def confidence(self):
  132. """The confidence of the best source, or 0 if no sources exist."""
  133. return self.best.confidence if self.best else 0.0
  134. @property
  135. def url(self):
  136. """The URL of the best source, or None if no sources exist."""
  137. return self.best.url if self.best else None
  138. def get_log_message(self, title):
  139. """Build a relevant log message for this copyvio check result."""
  140. if not self.sources:
  141. log = "No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
  142. return log.format(title, self.queries, self.time)
  143. log = "{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)"
  144. is_vio = "Violation detected" if self.violation else "No violation"
  145. return log.format(is_vio, title, self.url, self.confidence,
  146. len(self.sources), self.queries, self.time)