@@ -59,10 +59,6 @@ requires = ["setuptools>=61.0"] | |||
build-backend = "setuptools.build_meta" | |||
[tool.pyright] | |||
exclude = [ | |||
# TODO | |||
"src/earwigbot/wiki/copyvios" | |||
] | |||
pythonVersion = "3.11" | |||
venvPath = "." | |||
venv = "venv" | |||
@@ -18,208 +18,142 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = [ | |||
"DEFAULT_DEGREE", | |||
"CopyvioChecker", | |||
"CopyvioCheckResult", | |||
"globalize", | |||
"localize", | |||
] | |||
import functools | |||
import logging | |||
import time | |||
from urllib.request import build_opener | |||
from collections.abc import Callable | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import MarkovChain | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser | |||
from earwigbot.wiki.copyvios.search import SEARCH_ENGINES | |||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||
from earwigbot.wiki.copyvios.markov import DEFAULT_DEGREE, MarkovChain | |||
from earwigbot.wiki.copyvios.parsers import ArticleParser, ParserArgs | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||
from earwigbot.wiki.copyvios.search import SearchEngine, get_search_engine | |||
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize | |||
from earwigbot.wiki.page import Page | |||
__all__ = ["CopyvioMixIn", "globalize", "localize"] | |||
class CopyvioMixIn: | |||
class CopyvioChecker: | |||
""" | |||
**EarwigBot: Wiki Toolset: Copyright Violation MixIn** | |||
Manages the lifecycle of a copyvio check or comparison. | |||
This is a mixin that provides two public methods, :py:meth:`copyvio_check` | |||
and :py:meth:`copyvio_compare`. The former checks the page for copyright | |||
violations using a search engine API, and the latter compares the page | |||
against a given URL. Credentials for the search engine API are stored in | |||
the :py:class:`~earwigbot.wiki.site.Site`'s config. | |||
Created by :py:class:`~earwigbot.wiki.page.Page` and handles the implementation | |||
details of running a check. | |||
""" | |||
def __init__(self, site): | |||
self._search_config = site._search_config | |||
self._exclusions_db = self._search_config.get("exclusions_db") | |||
self._addheaders = [ | |||
("User-Agent", site.user_agent), | |||
def __init__( | |||
self, | |||
page: Page, | |||
*, | |||
min_confidence: float = 0.75, | |||
max_time: float = 30, | |||
degree: int = DEFAULT_DEGREE, | |||
logger: logging.Logger | None = None, | |||
) -> None: | |||
self._page = page | |||
self._site = page.site | |||
self._config = page.site._search_config | |||
self._min_confidence = min_confidence | |||
self._max_time = max_time | |||
self._degree = degree | |||
self._logger = logger or logging.getLogger("earwigbot.wiki") | |||
self._headers = [ | |||
("User-Agent", page.site.user_agent), | |||
("Accept-Encoding", "gzip"), | |||
] | |||
def _get_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The function takes one argument, a search query, and returns a list of | |||
URLs, ranked by importance. The underlying logic depends on the | |||
*engine* argument within our config; for example, if *engine* is | |||
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||
required package or module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine = self._search_config["engine"] | |||
if engine not in SEARCH_ENGINES: | |||
raise exceptions.UnknownSearchEngineError(engine) | |||
klass = SEARCH_ENGINES[engine] | |||
credentials = self._search_config["credentials"] | |||
opener = build_opener() | |||
opener.addheaders = self._addheaders | |||
for dep in klass.requirements(): | |||
try: | |||
__import__(dep).__name__ | |||
except (ModuleNotFoundError, AttributeError): | |||
e = "Missing a required dependency ({}) for the {} engine" | |||
e = e.format(dep, engine) | |||
raise exceptions.UnsupportedSearchEngineError(e) | |||
return klass(credentials, opener) | |||
def copyvio_check( | |||
self, | |||
min_confidence=0.75, | |||
max_queries=15, | |||
max_time=-1, | |||
no_searches=False, | |||
no_links=False, | |||
short_circuit=True, | |||
degree=5, | |||
): | |||
"""Check the page for copyright violations. | |||
Returns a :class:`.CopyvioCheckResult` object with information on the | |||
results of the check. | |||
*min_confidence* is the minimum amount of confidence we must have in | |||
the similarity between a source text and the article in order for us to | |||
consider it a suspected violation. This is a number between 0 and 1. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. | |||
*max_time* can be set to prevent copyvio checks from taking longer than | |||
a set amount of time (generally around a minute), which can be useful | |||
if checks are called through a web server with timeouts. We will stop | |||
checking new URLs as soon as this limit is reached. | |||
Setting *no_searches* to ``True`` will cause only URLs in the wikitext | |||
of the page to be checked; no search engine queries will be made. | |||
Setting *no_links* to ``True`` will cause the opposite to happen: URLs | |||
in the wikitext will be ignored; search engine queries will be made | |||
only. Setting both of these to ``True`` is pointless. | |||
Normally, the checker will short-circuit if it finds a URL that meets | |||
*min_confidence*. This behavior causes it to skip any
remaining URLs and web queries, but setting *short_circuit* to | |||
``False`` will prevent this. | |||
Raises :exc:`.CopyvioCheckError` or subclasses | |||
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on | |||
errors. | |||
""" | |||
log = "Starting copyvio check for [[{0}]]" | |||
self._logger.info(log.format(self.title)) | |||
searcher = self._get_search_engine() | |||
parser = ArticleTextParser( | |||
self.get(), | |||
args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang}, | |||
self._parser = ArticleParser( | |||
self._page.get(), | |||
lang=self._site.lang, | |||
nltk_dir=self._config["nltk_dir"], | |||
) | |||
article = MarkovChain(parser.strip(), degree=degree) | |||
parser_args = {} | |||
self._article = MarkovChain(self._parser.strip(), degree=self._degree) | |||
if self._exclusions_db: | |||
self._exclusions_db.sync(self.site.name) | |||
@functools.cached_property | |||
def _searcher(self) -> SearchEngine: | |||
return get_search_engine(self._config, self._headers) | |||
def exclude(u): | |||
return self._exclusions_db.check(self.site.name, u) | |||
@property | |||
def _exclusions_db(self) -> ExclusionsDB | None: | |||
return self._config.get("exclusions_db") | |||
parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(self) | |||
else: | |||
exclude = None | |||
def _get_exclusion_callback(self) -> Callable[[str], bool] | None: | |||
if not self._exclusions_db: | |||
return None | |||
return functools.partial(self._exclusions_db.check, self._site.name) | |||
def run_check( | |||
self, | |||
*, | |||
max_queries: int = 15, | |||
no_searches: bool = False, | |||
no_links: bool = False, | |||
short_circuit: bool = True, | |||
) -> CopyvioCheckResult: | |||
parser_args: ParserArgs = {} | |||
if self._exclusions_db: | |||
self._exclusions_db.sync(self._site.name) | |||
mirror_hints = self._exclusions_db.get_mirror_hints(self._page) | |||
parser_args["mirror_hints"] = mirror_hints | |||
workspace = CopyvioWorkspace( | |||
article, | |||
min_confidence, | |||
max_time, | |||
self._logger, | |||
self._addheaders, | |||
self._article, | |||
min_confidence=self._min_confidence, | |||
max_time=self._max_time, | |||
logger=self._logger, | |||
headers=self._headers, | |||
short_circuit=short_circuit, | |||
parser_args=parser_args, | |||
exclude_check=exclude, | |||
config=self._search_config, | |||
degree=degree, | |||
exclusion_callback=self._get_exclusion_callback(), | |||
config=self._config, | |||
degree=self._degree, | |||
) | |||
if article.size < 20: # Auto-fail very small articles | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
if self._article.size < 20: # Auto-fail very small articles | |||
return workspace.get_result() | |||
if not no_links: | |||
workspace.enqueue(parser.get_links()) | |||
workspace.enqueue(self._parser.get_links()) | |||
num_queries = 0 | |||
if not no_searches: | |||
chunks = parser.chunk(max_queries) | |||
chunks = self._parser.chunk(max_queries) | |||
for chunk in chunks: | |||
if short_circuit and workspace.finished: | |||
workspace.possible_miss = True | |||
break | |||
log = "[[{0}]] -> querying {1} for {2!r}" | |||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||
workspace.enqueue(searcher.search(chunk)) | |||
self._logger.debug( | |||
f"[[{self._page.title}]] -> querying {self._searcher.name} " | |||
f"for {chunk!r}" | |||
) | |||
workspace.enqueue(self._searcher.search(chunk)) | |||
num_queries += 1 | |||
time.sleep(1) | |||
time.sleep(1) # TODO: Check whether this is needed | |||
workspace.wait() | |||
result = workspace.get_result(num_queries) | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||
This is essentially a reduced version of :meth:`copyvio_check` - a | |||
copyvio comparison is made using Markov chains and the result is
returned in a :class:`.CopyvioCheckResult` object - but without using a | |||
search engine, since the suspected "violated" URL is supplied from the | |||
start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Tool Labs site. After a | |||
search is done, the resulting URL is stored in a cache for 72 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | |||
nor :exc:`.SearchQueryError` will be raised. | |||
""" | |||
if not isinstance(urls, list): | |||
urls = [urls] | |||
log = "Starting copyvio compare for [[{0}]] against {1}" | |||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||
return workspace.get_result(num_queries) | |||
def run_compare(self, urls: list[str]) -> CopyvioCheckResult: | |||
workspace = CopyvioWorkspace( | |||
article, | |||
min_confidence, | |||
max_time, | |||
self._logger, | |||
self._addheaders, | |||
max_time, | |||
self._article, | |||
min_confidence=self._min_confidence, | |||
max_time=self._max_time, | |||
logger=self._logger, | |||
headers=self._headers, | |||
url_timeout=self._max_time, | |||
num_workers=min(len(urls), 8), | |||
short_circuit=False, | |||
config=self._search_config, | |||
degree=degree, | |||
config=self._config, | |||
degree=self._degree, | |||
) | |||
workspace.enqueue(urls) | |||
workspace.wait() | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
return workspace.get_result() |
@@ -18,15 +18,24 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import annotations | |||
__all__ = ["ExclusionsDB"] | |||
import logging | |||
import re | |||
import sqlite3 | |||
import threading | |||
import time | |||
import typing | |||
import urllib.parse | |||
from earwigbot import exceptions | |||
__all__ = ["ExclusionsDB"] | |||
if typing.TYPE_CHECKING: | |||
from earwigbot.wiki.page import Page | |||
from earwigbot.wiki.site import Site | |||
from earwigbot.wiki.sitesdb import SitesDB | |||
DEFAULT_SOURCES = { | |||
"all": [ # Applies to all, but located on enwiki | |||
@@ -52,26 +61,28 @@ class ExclusionsDB: | |||
""" | |||
**EarwigBot: Wiki Toolset: Exclusions Database Manager** | |||
Controls the :file:`exclusions.db` file, which stores URLs excluded from | |||
copyright violation checks on account of being known mirrors, for example. | |||
Controls the :file:`exclusions.db` file, which stores URLs excluded from copyright | |||
violation checks on account of being known mirrors, for example. | |||
""" | |||
def __init__(self, sitesdb, dbfile, logger): | |||
def __init__(self, sitesdb: SitesDB, dbfile: str, logger: logging.Logger) -> None: | |||
self._sitesdb = sitesdb | |||
self._dbfile = dbfile | |||
self._logger = logger | |||
self._db_access_lock = threading.Lock() | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the ExclusionsDB.""" | |||
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})" | |||
return res.format(self._sitesdb, self._dbfile, self._logger) | |||
return ( | |||
f"ExclusionsDB(sitesdb={self._sitesdb!r}, dbfile={self._dbfile!r}, " | |||
f"logger={self._logger!r})" | |||
) | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the ExclusionsDB.""" | |||
return f"<ExclusionsDB at {self._dbfile}>" | |||
def _create(self): | |||
def _create(self) -> None: | |||
"""Initialize the exclusions database with its necessary tables.""" | |||
script = """ | |||
CREATE TABLE sources (source_sitename, source_page); | |||
@@ -79,7 +90,7 @@ class ExclusionsDB: | |||
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | |||
""" | |||
query = "INSERT INTO sources VALUES (?, ?);" | |||
sources = [] | |||
sources: list[tuple[str, str]] = [] | |||
for sitename, pages in DEFAULT_SOURCES.items(): | |||
for page in pages: | |||
sources.append((sitename, page)) | |||
@@ -88,9 +99,9 @@ class ExclusionsDB: | |||
conn.executescript(script) | |||
conn.executemany(query, sources) | |||
def _load_source(self, site, source): | |||
def _load_source(self, site: Site, source: str) -> set[str]: | |||
"""Load from a specific source and return a set of URLs.""" | |||
urls = set() | |||
urls: set[str] = set() | |||
try: | |||
data = site.get_page(source, follow_redirects=True).get() | |||
except exceptions.PageNotFoundError: | |||
@@ -123,7 +134,7 @@ class ExclusionsDB: | |||
urls.add(url) | |||
return urls | |||
def _update(self, sitename): | |||
def _update(self, sitename: str) -> None: | |||
"""Update the database from listed sources in the index.""" | |||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?" | |||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
@@ -140,7 +151,7 @@ class ExclusionsDB: | |||
else: | |||
site = self._sitesdb.get_site(sitename) | |||
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | |||
urls = set() | |||
urls: set[str] = set() | |||
for (source,) in conn.execute(query1, (sitename,)): | |||
urls |= self._load_source(site, source) | |||
for (url,) in conn.execute(query2, (sitename,)): | |||
@@ -154,7 +165,7 @@ class ExclusionsDB: | |||
else: | |||
conn.execute(query7, (sitename, int(time.time()))) | |||
def _get_last_update(self, sitename): | |||
def _get_last_update(self, sitename: str) -> int: | |||
"""Return the UNIX timestamp of the last time the db was updated.""" | |||
query = "SELECT update_time FROM updates WHERE update_sitename = ?" | |||
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | |||
@@ -165,28 +176,34 @@ class ExclusionsDB: | |||
return 0 | |||
return result[0] if result else 0 | |||
def sync(self, sitename, force=False): | |||
"""Update the database if it hasn't been updated recently. | |||
def sync(self, sitename: str, force: bool = False) -> None: | |||
""" | |||
Update the database if it hasn't been updated recently. | |||
This updates the exclusions database for the site *sitename* and "all". | |||
Site-specific lists are considered stale after 48 hours; global lists | |||
after 12 hours. | |||
Site-specific lists are considered stale after 48 hours; global lists after | |||
12 hours. | |||
""" | |||
max_staleness = 60 * 60 * (12 if sitename == "all" else 48) | |||
time_since_update = int(time.time() - self._get_last_update(sitename)) | |||
if force or time_since_update > max_staleness: | |||
log = "Updating stale database: {0} (last updated {1} seconds ago)" | |||
self._logger.info(log.format(sitename, time_since_update)) | |||
self._logger.info( | |||
f"Updating stale database: {sitename} (last updated " | |||
f"{time_since_update} seconds ago)" | |||
) | |||
self._update(sitename) | |||
else: | |||
log = "Database for {0} is still fresh (last updated {1} seconds ago)" | |||
self._logger.debug(log.format(sitename, time_since_update)) | |||
self._logger.debug( | |||
f"Database for {sitename} is still fresh (last updated " | |||
f"{time_since_update} seconds ago)" | |||
) | |||
if sitename != "all": | |||
self.sync("all", force=force) | |||
def check(self, sitename, url): | |||
"""Check whether a given URL is in the exclusions database. | |||
def check(self, sitename: str, url: str) -> bool: | |||
""" | |||
Check whether a given URL is in the exclusions database. | |||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | |||
""" | |||
@@ -216,19 +233,18 @@ class ExclusionsDB: | |||
else: | |||
matches = normalized.startswith(excl) | |||
if matches: | |||
log = "Exclusion detected in {0} for {1}" | |||
self._logger.debug(log.format(sitename, url)) | |||
self._logger.debug(f"Exclusion detected in {sitename} for {url}") | |||
return True | |||
log = f"No exclusions in {sitename} for {url}" | |||
self._logger.debug(log) | |||
self._logger.debug(f"No exclusions in {sitename} for {url}") | |||
return False | |||
def get_mirror_hints(self, page, try_mobile=True): | |||
"""Return a list of strings that indicate the existence of a mirror. | |||
def get_mirror_hints(self, page: Page, try_mobile: bool = True) -> list[str]: | |||
""" | |||
Return a list of strings that indicate the existence of a mirror. | |||
The source parser checks for the presence of these strings inside of | |||
certain HTML tag attributes (``"href"`` and ``"src"``). | |||
The source parser checks for the presence of these strings inside of certain | |||
HTML tag attributes (``"href"`` and ``"src"``). | |||
""" | |||
site = page.site | |||
path = urllib.parse.urlparse(page.url).path | |||
@@ -238,10 +254,10 @@ class ExclusionsDB: | |||
if try_mobile: | |||
fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) | |||
if fragments: | |||
roots.append("{}.m.{}.{}".format(*fragments.groups())) | |||
roots.append(f"{fragments[1]}.m.{fragments[2]}.{fragments[3]}") | |||
general = [ | |||
root + site._script_path + "/" + script | |||
root + site.script_path + "/" + script | |||
for root in roots | |||
for script in scripts | |||
] | |||
@@ -18,29 +18,44 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = [ | |||
"DEFAULT_DEGREE", | |||
"EMPTY", | |||
"EMPTY_INTERSECTION", | |||
"MarkovChain", | |||
"MarkovChainIntersection", | |||
] | |||
import re | |||
from collections.abc import Iterable | |||
from enum import Enum | |||
__all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", "MarkovChainIntersection"] | |||
DEFAULT_DEGREE = 5 | |||
class MarkovChain: | |||
"""Implements a basic ngram Markov chain of words.""" | |||
class Sentinel(Enum): | |||
START = -1 | |||
END = -2 | |||
def __init__(self, text, degree=5): | |||
RawChain = dict[tuple[str | Sentinel, ...], int] | |||
class MarkovChain: | |||
"""Implements a basic ngram Markov chain of words.""" | |||
def __init__(self, text: str, degree: int = DEFAULT_DEGREE) -> None: | |||
self.text = text | |||
self.degree = degree # 2 for bigrams, 3 for trigrams, etc. | |||
self.chain = self._build() | |||
self.size = self._get_size() | |||
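# Worked example (illustration only) of the chain built from a tiny input with
# degree=2 (bigrams); START/END refer to the Sentinel members above:
#
#     mc = MarkovChain("The cat. The cat", degree=2)
#     # Lowercasing and stripping punctuation gives the padded word list
#     # [START, "the", "cat", "the", "cat", END], so mc.chain is
#     # {(START, "the"): 1, ("the", "cat"): 2, ("cat", "the"): 1, ("cat", END): 1}
#     # and mc.size == 5 (the sum of the counts).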
def _build(self): | |||
def _build(self) -> RawChain: | |||
"""Build and return the Markov chain from the input text.""" | |||
padding = self.degree - 1 | |||
words = re.sub(r"[^\w\s-]", "", self.text.lower(), flags=re.UNICODE).split() | |||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||
chain = {} | |||
words = re.sub(r"[^\w\s-]", "", self.text.lower()).split() | |||
words = ([Sentinel.START] * padding) + words + ([Sentinel.END] * padding) | |||
chain: RawChain = {} | |||
for i in range(len(words) - self.degree + 1): | |||
phrase = tuple(words[i : i + self.degree]) | |||
@@ -50,15 +65,15 @@ class MarkovChain: | |||
chain[phrase] = 1 | |||
return chain | |||
def _get_size(self): | |||
def _get_size(self) -> int: | |||
"""Return the size of the Markov chain: the total number of nodes.""" | |||
return sum(self.chain.values()) | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the MarkovChain.""" | |||
return f"MarkovChain(text={self.text!r})" | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the MarkovChain.""" | |||
return f"<MarkovChain of size {self.size}>" | |||
@@ -66,61 +81,60 @@ class MarkovChain: | |||
class MarkovChainIntersection(MarkovChain): | |||
"""Implements the intersection of two chains (i.e., their shared nodes).""" | |||
def __init__(self, mc1, mc2): | |||
def __init__(self, mc1: MarkovChain, mc2: MarkovChain) -> None: | |||
self.mc1, self.mc2 = mc1, mc2 | |||
self.chain = self._build() | |||
self.size = self._get_size() | |||
def _build(self): | |||
def _build(self) -> RawChain: | |||
"""Build and return the Markov chain from the input chains.""" | |||
c1 = self.mc1.chain | |||
c2 = self.mc2.chain | |||
chain = {} | |||
chain: RawChain = {} | |||
for phrase in c1: | |||
if phrase in c2: | |||
chain[phrase] = min(c1[phrase], c2[phrase]) | |||
return chain | |||
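# Continuing the worked example from MarkovChain above (illustrative): the
# intersection keeps only shared n-grams, each at the smaller of the two counts.
#
#     other = MarkovChain("the cat", degree=2)
#     delta = MarkovChainIntersection(mc, other)
#     # delta.chain == {(START, "the"): 1, ("the", "cat"): 1, ("cat", END): 1}
#     # delta.size == 3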
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the intersection.""" | |||
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" | |||
return res.format(self.mc1, self.mc2) | |||
return f"MarkovChainIntersection(mc1={self.mc1!r}, mc2={self.mc2!r})" | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the intersection.""" | |||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||
return res.format(self.size, self.mc1, self.mc2) | |||
return ( | |||
f"<MarkovChainIntersection of size {self.size} ({self.mc1} ^ {self.mc2})>" | |||
) | |||
class MarkovChainUnion(MarkovChain): | |||
"""Implemented the union of multiple chains.""" | |||
def __init__(self, chains): | |||
def __init__(self, chains: Iterable[MarkovChain]) -> None: | |||
self.chains = list(chains) | |||
self.chain = self._build() | |||
self.size = self._get_size() | |||
def _build(self): | |||
def _build(self) -> RawChain: | |||
"""Build and return the Markov chain from the input chains.""" | |||
union = {} | |||
union: RawChain = {} | |||
for chain in self.chains: | |||
for phrase, count in chain.chain.iteritems(): | |||
for phrase, count in chain.chain.items(): | |||
if phrase in union: | |||
union[phrase] += count | |||
else: | |||
union[phrase] = count | |||
return union | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the union.""" | |||
res = "MarkovChainUnion(chains={!r})" | |||
return res.format(self.chains) | |||
return f"MarkovChainUnion(chains={self.chains!r})" | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the union.""" | |||
res = "<MarkovChainUnion of size {} ({})>" | |||
return res.format(self.size, "| ".join(str(chain) for chain in self.chains)) | |||
chains = " | ".join(str(chain) for chain in self.chains) | |||
return f"<MarkovChainUnion of size {self.size} ({chains})>" | |||
EMPTY = MarkovChain("") | |||
@@ -18,44 +18,34 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import annotations | |||
__all__ = ["ArticleParser", "get_parser"] | |||
import io | |||
import json | |||
import os.path | |||
import re | |||
import typing | |||
import urllib.parse | |||
import urllib.request | |||
from abc import ABC, abstractmethod | |||
from collections.abc import Callable | |||
from typing import Any, ClassVar, Literal, TypedDict | |||
import mwparserfromhell | |||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | |||
__all__ = ["ArticleTextParser", "get_parser"] | |||
class _BaseTextParser: | |||
"""Base class for a parser that handles text.""" | |||
TYPE = None | |||
def __init__(self, text, url=None, args=None): | |||
self.text = text | |||
self.url = url | |||
self._args = args or {} | |||
def __repr__(self): | |||
"""Return the canonical string representation of the text parser.""" | |||
return f"{self.__class__.__name__}(text={self.text!r})" | |||
if typing.TYPE_CHECKING: | |||
import bs4 | |||
def __str__(self): | |||
"""Return a nice string representation of the text parser.""" | |||
name = self.__class__.__name__ | |||
return f"<{name} of text with size {len(self.text)}>" | |||
from earwigbot.wiki.copyvios.workers import OpenedURL | |||
class ArticleTextParser(_BaseTextParser): | |||
class ArticleParser: | |||
"""A parser that can strip and chunk wikicode article text.""" | |||
TYPE = "Article" | |||
TEMPLATE_MERGE_THRESHOLD = 35 | |||
NLTK_DEFAULT = "english" | |||
NLTK_LANGS = { | |||
@@ -78,7 +68,18 @@ class ArticleTextParser(_BaseTextParser): | |||
"tr": "turkish", | |||
} | |||
def _merge_templates(self, code): | |||
def __init__(self, text: str, lang: str, nltk_dir: str) -> None: | |||
self.text = text | |||
self._lang = lang | |||
self._nltk_dir = nltk_dir | |||
def __repr__(self) -> str: | |||
return f"{self.__class__.__name__}(text={self.text!r})" | |||
def __str__(self) -> str: | |||
return f"<{self.__class__.__name__} of text with size {len(self.text)}>" | |||
def _merge_templates(self, code: mwparserfromhell.wikicode.Wikicode) -> None: | |||
"""Merge template contents in to wikicode when the values are long.""" | |||
for template in code.filter_templates(recursive=code.RECURSE_OTHERS): | |||
chunks = [] | |||
@@ -92,23 +93,25 @@ class ArticleTextParser(_BaseTextParser): | |||
else: | |||
code.remove(template) | |||
def _get_tokenizer(self): | |||
def _get_tokenizer(self) -> Any: | |||
"""Return a NLTK punctuation tokenizer for the article's language.""" | |||
import nltk | |||
def datafile(lang): | |||
def datafile(lang: str) -> str: | |||
return "file:" + os.path.join( | |||
self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle" | |||
self._nltk_dir, "tokenizers", "punkt", lang + ".pickle" | |||
) | |||
lang = self.NLTK_LANGS.get(self._args.get("lang"), self.NLTK_DEFAULT) | |||
lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT) | |||
try: | |||
nltk.data.load(datafile(self.NLTK_DEFAULT)) | |||
except LookupError: | |||
nltk.download("punkt", self._args["nltk_dir"]) | |||
nltk.download("punkt", self._nltk_dir) | |||
return nltk.data.load(datafile(lang)) | |||
def _get_sentences(self, min_query, max_query, split_thresh): | |||
def _get_sentences( | |||
self, min_query: int, max_query: int, split_thresh: int | |||
) -> list[str]: | |||
"""Split the article text into sentences of a certain length.""" | |||
def cut_sentence(words): | |||
@@ -138,24 +141,27 @@ class ArticleTextParser(_BaseTextParser): | |||
sentences.extend(cut_sentence(sentence.split())) | |||
return [sen for sen in sentences if len(sen) >= min_query] | |||
def strip(self): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
def strip(self) -> str: | |||
""" | |||
Clean the page's raw text by removing templates and formatting. | |||
Return the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, and references. It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes), original capitalization, and so forth. HTML entities are | |||
replaced by their unicode equivalents. | |||
Return the page's text with all HTML and wikicode formatting removed, including | |||
templates, tables, and references. It retains punctuation (spacing, paragraphs, | |||
periods, commas, (semi)-colons, parentheses, quotes), original capitalization, | |||
and so forth. HTML entities are replaced by their unicode equivalents. | |||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | |||
""" | |||
def remove(code, node): | |||
"""Remove a node from a code object, ignoring ValueError. | |||
def remove( | |||
code: mwparserfromhell.wikicode.Wikicode, node: mwparserfromhell.nodes.Node | |||
) -> None: | |||
""" | |||
Remove a node from a code object, ignoring ValueError. | |||
Sometimes we will remove a node that contains another node we wish | |||
to remove, and we fail when we try to remove the inner one. Easiest | |||
solution is to just ignore the exception. | |||
Sometimes we will remove a node that contains another node we wish to | |||
remove, and we fail when we try to remove the inner one. Easiest solution | |||
is to just ignore the exception. | |||
""" | |||
try: | |||
code.remove(node) | |||
@@ -181,26 +187,32 @@ class ArticleTextParser(_BaseTextParser): | |||
self.clean = re.sub(r"\n\n+", "\n", clean).strip() | |||
return self.clean | |||
def chunk(self, max_chunks, min_query=8, max_query=128, split_thresh=32): | |||
"""Convert the clean article text into a list of web-searchable chunks. | |||
No more than *max_chunks* chunks will be returned. Each chunk will only be
a sentence or two long at most (no more than *max_query*). The idea is | |||
to return a sample of the article text rather than the whole, so we'll | |||
pick and choose from parts of it, especially if the article is large | |||
and *max_chunks* is low, so we don't end up searching for just the
first paragraph. | |||
This is implemented using :py:mod:`nltk` (https://nltk.org/). A base | |||
directory (*nltk_dir*) is required to store nltk's punctuation | |||
database, and should be passed as an argument to the constructor. It is | |||
typically located in the bot's working directory. | |||
def chunk( | |||
self, | |||
max_chunks: int, | |||
min_query: int = 8, | |||
max_query: int = 128, | |||
split_thresh: int = 32, | |||
) -> list[str]: | |||
""" | |||
Convert the clean article text into a list of web-searchable chunks. | |||
No more than *max_chunks* chunks will be returned. Each chunk will only be a
sentence or two long at most (no more than *max_query*). The idea is to return | |||
a sample of the article text rather than the whole, so we'll pick and choose | |||
from parts of it, especially if the article is large and *max_chunks* is low, | |||
so we don't end up searching for just the first paragraph.
This is implemented using :py:mod:`nltk` (https://nltk.org/). A base directory | |||
(*nltk_dir*) is required to store nltk's punctuation database, and should be | |||
passed as an argument to the constructor. It is typically located in the bot's | |||
working directory. | |||
""" | |||
sentences = self._get_sentences(min_query, max_query, split_thresh) | |||
if len(sentences) <= max_chunks: | |||
return sentences | |||
chunks = [] | |||
chunks: list[str] = [] | |||
while len(chunks) < max_chunks: | |||
if len(chunks) % 5 == 0: | |||
chunk = sentences.pop(0) # Pop from beginning | |||
@@ -216,7 +228,8 @@ class ArticleTextParser(_BaseTextParser): | |||
return chunks | |||
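# Illustrative end-to-end use of the parser (the nltk_dir path is a placeholder;
# in practice it comes from the bot's search config):
#
#     parser = ArticleParser(page.get(), lang="en", nltk_dir="/path/to/nltk_data")
#     cleaned = parser.strip()               # wikicode and HTML removed
#     queries = parser.chunk(max_chunks=15)  # short sentences to send to a search engine
#     links = parser.get_links()             # external http/https links in the wikitext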
def get_links(self): | |||
"""Return a list of all external links in the article. | |||
""" | |||
Return a list of all external links in the article. | |||
The list is restricted to things that we suspect we can parse: i.e., | |||
those with schemes of ``http`` and ``https``. | |||
@@ -226,14 +239,42 @@ class ArticleTextParser(_BaseTextParser): | |||
return [str(link.url) for link in links if link.url.startswith(schemes)] | |||
class _HTMLParser(_BaseTextParser): | |||
class ParserArgs(TypedDict, total=False): | |||
mirror_hints: list[str] | |||
open_url: Callable[[str], OpenedURL | None] | |||
class SourceParser(ABC): | |||
"""Base class for a parser that handles text.""" | |||
TYPE: ClassVar[str] | |||
def __init__(self, text: bytes, url: str, args: ParserArgs | None = None) -> None: | |||
self.text = text | |||
self.url = url | |||
self._args = args or {} | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the text parser.""" | |||
return f"{self.__class__.__name__}(text={self.text!r})" | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the text parser.""" | |||
return f"<{self.__class__.__name__} of text with size {len(self.text)}>" | |||
@abstractmethod | |||
def parse(self) -> str: ... | |||
class HTMLParser(SourceParser): | |||
"""A parser that can extract the text from an HTML document.""" | |||
TYPE = "HTML" | |||
hidden_tags = ["script", "style"] | |||
def _fail_if_mirror(self, soup): | |||
"""Look for obvious signs that the given soup is a wiki mirror. | |||
def _fail_if_mirror(self, soup: bs4.BeautifulSoup) -> None: | |||
""" | |||
Look for obvious signs that the given soup is a wiki mirror. | |||
If so, raise ParserExclusionError, which is caught in the workers and | |||
causes this source to be excluded.
@@ -242,13 +283,14 @@ class _HTMLParser(_BaseTextParser): | |||
return | |||
def func(attr): | |||
assert "mirror_hints" in self._args | |||
return attr and any(hint in attr for hint in self._args["mirror_hints"]) | |||
if soup.find_all(href=func) or soup.find_all(src=func): | |||
raise ParserExclusionError() | |||
@staticmethod | |||
def _get_soup(text): | |||
def _get_soup(text: bytes) -> bs4.BeautifulSoup: | |||
"""Parse some text using BeautifulSoup.""" | |||
import bs4 | |||
@@ -257,11 +299,11 @@ class _HTMLParser(_BaseTextParser): | |||
except ValueError: | |||
return bs4.BeautifulSoup(text) | |||
def _clean_soup(self, soup): | |||
def _clean_soup(self, soup: bs4.element.Tag) -> str: | |||
"""Clean a BeautifulSoup tree of invisible tags.""" | |||
import bs4 | |||
def is_comment(text): | |||
def is_comment(text: bs4.element.Tag) -> bool: | |||
return isinstance(text, bs4.element.Comment) | |||
for comment in soup.find_all(text=is_comment): | |||
@@ -272,7 +314,7 @@ class _HTMLParser(_BaseTextParser): | |||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | |||
def _open(self, url, **kwargs): | |||
def _open(self, url: str, **kwargs: Any) -> bytes | None: | |||
"""Try to read a URL. Return None if it couldn't be read.""" | |||
opener = self._args.get("open_url") | |||
if not opener: | |||
@@ -280,13 +322,13 @@ class _HTMLParser(_BaseTextParser): | |||
result = opener(url, **kwargs) | |||
return result.content if result else None | |||
def _load_from_blogspot(self, url): | |||
def _load_from_blogspot(self, url: urllib.parse.ParseResult) -> str: | |||
"""Load dynamic content from Blogger Dynamic Views.""" | |||
match = re.search(r"'postId': '(\d+)'", self.text) | |||
match = re.search(rb"'postId': '(\d+)'", self.text) | |||
if not match: | |||
return "" | |||
post_id = match.group(1) | |||
url = f"https://{url.netloc}/feeds/posts/default/{post_id}?" | |||
feed_url = f"https://{url.netloc}/feeds/posts/default/{post_id}?" | |||
params = { | |||
"alt": "json", | |||
"v": "2", | |||
@@ -294,7 +336,7 @@ class _HTMLParser(_BaseTextParser): | |||
"rewriteforssl": "true", | |||
} | |||
raw = self._open( | |||
url + urllib.parse.urlencode(params), | |||
feed_url + urllib.parse.urlencode(params), | |||
allow_content_types=["application/json"], | |||
) | |||
if raw is None: | |||
@@ -308,19 +350,24 @@ class _HTMLParser(_BaseTextParser): | |||
except KeyError: | |||
return "" | |||
soup = self._get_soup(text) | |||
if not soup.body: | |||
return "" | |||
return self._clean_soup(soup.body) | |||
def parse(self): | |||
"""Return the actual text contained within an HTML document. | |||
def parse(self) -> str: | |||
""" | |||
Return the actual text contained within an HTML document. | |||
Implemented using :py:mod:`BeautifulSoup <bs4>` | |||
(https://www.crummy.com/software/BeautifulSoup/). | |||
(https://pypi.org/project/beautifulsoup4/). | |||
""" | |||
import bs4 | |||
url = urllib.parse.urlparse(self.url) if self.url else None | |||
soup = self._get_soup(self.text) | |||
if not soup.body: | |||
# No <body> tag present in HTML -> | |||
# no scrapable content (possibly JS or <iframe> magic): | |||
# No <body> tag present in HTML -> no scrapable content
# (possibly JS or <iframe> magic):
return "" | |||
self._fail_if_mirror(soup) | |||
@@ -328,7 +375,7 @@ class _HTMLParser(_BaseTextParser): | |||
if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"): | |||
playback = body.find(id="playback") | |||
if playback and "src" in playback.attrs: | |||
if isinstance(playback, bs4.element.Tag) and "src" in playback.attrs: | |||
raise ParserRedirectError(playback.attrs["src"]) | |||
content = self._clean_soup(body) | |||
@@ -339,7 +386,7 @@ class _HTMLParser(_BaseTextParser): | |||
return content | |||
class _PDFParser(_BaseTextParser): | |||
class PDFParser(SourceParser): | |||
"""A parser that can extract text from a PDF file.""" | |||
TYPE = "PDF" | |||
@@ -348,7 +395,7 @@ class _PDFParser(_BaseTextParser): | |||
("\u2022", " "), | |||
] | |||
def parse(self): | |||
def parse(self) -> str: | |||
"""Return extracted text from the PDF.""" | |||
from pdfminer import converter, pdfinterp, pdfpage | |||
@@ -358,7 +405,7 @@ class _PDFParser(_BaseTextParser): | |||
interp = pdfinterp.PDFPageInterpreter(manager, conv) | |||
try: | |||
pages = pdfpage.PDFPage.get_pages(io.StringIO(self.text)) | |||
pages = pdfpage.PDFPage.get_pages(io.BytesIO(self.text)) | |||
for page in pages: | |||
interp.process_page(page) | |||
except Exception: # pylint: disable=broad-except | |||
@@ -372,12 +419,12 @@ class _PDFParser(_BaseTextParser): | |||
return re.sub(r"\n\n+", "\n", value).strip() | |||
class _PlainTextParser(_BaseTextParser): | |||
class PlainTextParser(SourceParser): | |||
"""A parser that can unicode-ify and strip text from a plain text page.""" | |||
TYPE = "Text" | |||
def parse(self): | |||
def parse(self) -> str: | |||
"""Unicode-ify and strip whitespace from the plain text document.""" | |||
from bs4.dammit import UnicodeDammit | |||
@@ -385,15 +432,25 @@ class _PlainTextParser(_BaseTextParser): | |||
return converted.strip() if converted else "" | |||
_CONTENT_TYPES = { | |||
"text/html": _HTMLParser, | |||
"application/xhtml+xml": _HTMLParser, | |||
"application/pdf": _PDFParser, | |||
"application/x-pdf": _PDFParser, | |||
"text/plain": _PlainTextParser, | |||
_CONTENT_TYPES: dict[str, type[SourceParser]] = { | |||
"text/html": HTMLParser, | |||
"application/xhtml+xml": HTMLParser, | |||
"application/pdf": PDFParser, | |||
"application/x-pdf": PDFParser, | |||
"text/plain": PlainTextParser, | |||
} | |||
def get_parser(content_type): | |||
@typing.overload | |||
def get_parser(content_type: str) -> type[SourceParser] | None: ... | |||
@typing.overload | |||
def get_parser( | |||
content_type: Literal["text/plain"] = "text/plain", | |||
) -> type[SourceParser]: ... | |||
def get_parser(content_type: str = "text/plain") -> type[SourceParser] | None: | |||
"""Return the parser most able to handle a given content type, or None.""" | |||
return _CONTENT_TYPES.get(content_type) |
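# Hedged dispatch example (the content_type/raw/url/hints variables are assumed
# to come from an HTTP response elsewhere in the workers):
#
#     parser_class = get_parser(content_type or "text/plain")
#     if parser_class is None:
#         ...  # unhandled content type (e.g. an image); skip this source
#     else:
#         parser = parser_class(raw, url, args={"mirror_hints": hints})
#         text = parser.parse()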
@@ -18,13 +18,26 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import annotations | |||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||
import time | |||
import typing | |||
import urllib.parse | |||
from threading import Event | |||
from time import time | |||
from typing import Any | |||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | |||
from earwigbot.wiki.copyvios.markov import ( | |||
EMPTY, | |||
EMPTY_INTERSECTION, | |||
MarkovChain, | |||
MarkovChainIntersection, | |||
) | |||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||
if typing.TYPE_CHECKING: | |||
from earwigbot.wiki.copyvios.parsers import ParserArgs | |||
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace | |||
class CopyvioSource: | |||
@@ -45,13 +58,13 @@ class CopyvioSource: | |||
def __init__( | |||
self, | |||
workspace, | |||
url, | |||
headers=None, | |||
timeout=5, | |||
parser_args=None, | |||
search_config=None, | |||
): | |||
workspace: CopyvioWorkspace, | |||
url: str, | |||
headers: list[tuple[str, str]] | None = None, | |||
timeout: float = 5, | |||
parser_args: ParserArgs | None = None, | |||
search_config: dict[str, Any] | None = None, | |||
) -> None: | |||
self.workspace = workspace | |||
self.url = url | |||
self.headers = headers | |||
@@ -68,54 +81,57 @@ class CopyvioSource: | |||
self._event2 = Event() | |||
self._event2.set() | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the source.""" | |||
res = ( | |||
"CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, " | |||
"excluded={3!r})" | |||
return ( | |||
f"CopyvioSource(url={self.url!r}, confidence={self.confidence!r}, " | |||
f"skipped={self.skipped!r}, excluded={self.excluded!r})" | |||
) | |||
return res.format(self.url, self.confidence, self.skipped, self.excluded) | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the source.""" | |||
if self.excluded: | |||
return f"<CopyvioSource ({self.url}, excluded)>" | |||
if self.skipped: | |||
return f"<CopyvioSource ({self.url}, skipped)>" | |||
res = "<CopyvioSource ({0} with {1} conf)>" | |||
return res.format(self.url, self.confidence) | |||
return f"<CopyvioSource ({self.url} with {self.confidence} conf)>" | |||
@property | |||
def domain(self): | |||
def domain(self) -> str | None: | |||
"""The source URL's domain name, or None.""" | |||
return urllib.parse.urlparse(self.url).netloc or None | |||
def start_work(self): | |||
def start_work(self) -> None: | |||
"""Mark this source as being worked on right now.""" | |||
self._event2.clear() | |||
self._event1.set() | |||
def update(self, confidence, source_chain, delta_chain): | |||
def update( | |||
self, | |||
confidence: float, | |||
source_chain: MarkovChain, | |||
delta_chain: MarkovChainIntersection, | |||
) -> None: | |||
"""Fill out the confidence and chain information inside this source.""" | |||
self.confidence = confidence | |||
self.chains = (source_chain, delta_chain) | |||
def finish_work(self): | |||
def finish_work(self) -> None: | |||
"""Mark this source as finished.""" | |||
self._event2.set() | |||
def skip(self): | |||
def skip(self) -> None: | |||
"""Deactivate this source without filling in the relevant data.""" | |||
if self._event1.is_set(): | |||
return | |||
self.skipped = True | |||
self._event1.set() | |||
def join(self, until): | |||
def join(self, until: float | None = None) -> None: | |||
"""Block until this violation result is filled out.""" | |||
for event in [self._event1, self._event2]: | |||
if until: | |||
timeout = until - time() | |||
if until is not None: | |||
timeout = until - time.time() | |||
if timeout <= 0: | |||
return | |||
event.wait(timeout) | |||
@@ -144,16 +160,15 @@ class CopyvioCheckResult: | |||
def __init__( | |||
self, | |||
violation, | |||
sources, | |||
queries, | |||
check_time, | |||
article_chain, | |||
possible_miss, | |||
included_sources=None, | |||
unified_confidence=None, | |||
violation: bool, | |||
sources: list[CopyvioSource], | |||
queries: int, | |||
check_time: float, | |||
article_chain: MarkovChain, | |||
possible_miss: bool, | |||
included_sources: list[CopyvioSource] | None = None, | |||
unified_confidence: float | None = None, | |||
): | |||
assert isinstance(sources, list) | |||
self.violation = violation | |||
self.sources = sources | |||
self.queries = queries | |||
@@ -163,48 +178,47 @@ class CopyvioCheckResult: | |||
self.included_sources = included_sources if included_sources else [] | |||
self.unified_confidence = unified_confidence | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the result.""" | |||
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})" | |||
return res.format(self.violation, self.sources, self.queries, self.time) | |||
return ( | |||
f"CopyvioCheckResult(violation={self.violation!r}, " | |||
f"sources={self.sources!r}, queries={self.queries!r}, time={self.time!r})" | |||
) | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the result.""" | |||
res = "<CopyvioCheckResult ({0} with best {1})>" | |||
return res.format(self.violation, self.best) | |||
return f"<CopyvioCheckResult ({self.violation} with best {self.best})>" | |||
@property | |||
def best(self): | |||
def best(self) -> CopyvioSource | None: | |||
"""The best known source, or None if no sources exist.""" | |||
return self.sources[0] if self.sources else None | |||
@property | |||
def confidence(self): | |||
def confidence(self) -> float: | |||
"""The confidence of the best source, or 0 if no sources exist.""" | |||
if self.unified_confidence is not None: | |||
return self.unified_confidence | |||
if self.best: | |||
if self.best is not None: | |||
return self.best.confidence | |||
return 0.0 | |||
@property | |||
def url(self): | |||
def url(self) -> str | None: | |||
"""The URL of the best source, or None if no sources exist.""" | |||
return self.best.url if self.best else None | |||
def get_log_message(self, title): | |||
def get_log_message(self, title: str) -> str: | |||
"""Build a relevant log message for this copyvio check result.""" | |||
if not self.sources: | |||
log = "No violation for [[{0}]] (no sources; {1} queries; {2} seconds)" | |||
return log.format(title, self.queries, self.time) | |||
log = "{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)" | |||
return ( | |||
f"No violation for [[{title}]] (no sources; {self.queries} queries; " | |||
f"{self.time} seconds)" | |||
) | |||
is_vio = "Violation detected" if self.violation else "No violation" | |||
return log.format( | |||
is_vio, | |||
title, | |||
self.url, | |||
self.confidence, | |||
len(self.sources), | |||
self.queries, | |||
self.time, | |||
return ( | |||
f"{is_vio} for [[{title}]] (best: {self.url} ({self.confidence} " | |||
f"confidence); {len(self.sources)} sources; {self.queries} queries; " | |||
f"{self.time} seconds)" | |||
) |
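# Illustrative sketch of consuming a result (attribute names are from this
# class; `result`, `log`, and `page` are assumed placeholders):
#
#     if result.violation:
#         log.warning(result.get_log_message(page.title))
#         for source in result.included_sources:
#             print(source.url, source.confidence)
#     else:
#         print(f"No violation ({result.queries} queries, {result.time:.1f}s)")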
@@ -18,91 +18,101 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import re | |||
from gzip import GzipFile | |||
from io import StringIO | |||
from json import loads | |||
from urllib.error import URLError | |||
from urllib.parse import urlencode | |||
from earwigbot.exceptions import SearchQueryError | |||
__all__ = [ | |||
"BingSearchEngine", | |||
"GoogleSearchEngine", | |||
"SearchEngine", | |||
"YandexSearchEngine", | |||
"SEARCH_ENGINES", | |||
"get_search_engine", | |||
] | |||
import base64 | |||
import gzip | |||
import io | |||
import json | |||
import re | |||
import urllib.parse | |||
import urllib.request | |||
from abc import ABC, abstractmethod | |||
from typing import Any | |||
from urllib.error import URLError | |||
class _BaseSearchEngine: | |||
from earwigbot import exceptions | |||
class SearchEngine(ABC): | |||
"""Base class for a simple search engine interface.""" | |||
name = "Base" | |||
def __init__(self, cred, opener): | |||
def __init__( | |||
self, cred: dict[str, str], opener: urllib.request.OpenerDirector | |||
) -> None: | |||
"""Store credentials (*cred*) and *opener* for searching later on.""" | |||
self.cred = cred | |||
self.opener = opener | |||
self.count = 5 | |||
def __repr__(self): | |||
def __repr__(self) -> str: | |||
"""Return the canonical string representation of the search engine.""" | |||
return f"{self.__class__.__name__}()" | |||
def __str__(self): | |||
def __str__(self) -> str: | |||
"""Return a nice string representation of the search engine.""" | |||
return f"<{self.__class__.__name__}>" | |||
def _open(self, *args): | |||
def _open(self, url: str) -> bytes: | |||
"""Open a URL (like urlopen) and try to return its contents.""" | |||
try: | |||
response = self.opener.open(*args) | |||
response = self.opener.open(url) | |||
result = response.read() | |||
except (OSError, URLError) as exc: | |||
err = SearchQueryError(f"{self.name} Error: {exc}") | |||
err.cause = exc | |||
raise err | |||
raise exceptions.SearchQueryError(f"{self.name} Error: {exc}") | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
stream = io.BytesIO(result) | |||
gzipper = gzip.GzipFile(fileobj=stream) | |||
result = gzipper.read() | |||
code = response.getcode() | |||
if code != 200: | |||
err = "{0} Error: got response code '{1}':\n{2}'" | |||
raise SearchQueryError(err.format(self.name, code, result)) | |||
raise exceptions.SearchQueryError( | |||
f"{self.name} Error: got response code '{code}':\n{result}'" | |||
) | |||
return result | |||
@staticmethod | |||
def requirements(): | |||
def requirements() -> list[str]: | |||
"""Return a list of packages required by this search engine.""" | |||
return [] | |||
def search(self, query): | |||
"""Use this engine to search for *query*. | |||
@abstractmethod | |||
def search(self, query: str) -> list[str]: | |||
""" | |||
Use this engine to search for *query*. | |||
Not implemented in this base class; overridden in subclasses. | |||
""" | |||
raise NotImplementedError() | |||
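# Hedged sketch of what a new engine needs under this interface; the
# DuckDuckGo engine, endpoint, and response shape below are hypothetical,
# purely to illustrate the subclass contract (name, search, optional
# requirements), plus registration in SEARCH_ENGINES at the bottom.
#
#     class DuckDuckGoSearchEngine(SearchEngine):
#         name = "DuckDuckGo"
#
#         def search(self, query: str) -> list[str]:
#             params = urllib.parse.urlencode({"q": query, "count": self.count})
#             raw = self._open("https://api.example.invalid/search?" + params)
#             return [hit["url"] for hit in json.loads(raw)["results"]]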
class BingSearchEngine(_BaseSearchEngine): | |||
class BingSearchEngine(SearchEngine): | |||
"""A search engine interface with Bing Search (via Azure Marketplace).""" | |||
name = "Bing" | |||
def __init__(self, cred, opener): | |||
def __init__( | |||
self, cred: dict[str, str], opener: urllib.request.OpenerDirector | |||
) -> None: | |||
super().__init__(cred, opener) | |||
key = self.cred["key"] | |||
auth = (key + ":" + key).encode("base64").replace("\n", "") | |||
self.opener.addheaders.append(("Authorization", "Basic " + auth)) | |||
auth = base64.b64encode(f"{key}:{key}".encode()).decode() | |||
self.opener.addheaders.append(("Authorization", f"Basic {auth}")) | |||
def search(self, query: str) -> list[str]: | |||
"""Do a Bing web search for *query*. | |||
""" | |||
Do a Bing web search for *query*. | |||
Returns a list of URLs ranked by relevance (as determined by Bing). | |||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
@@ -112,20 +122,19 @@ class BingSearchEngine(_BaseSearchEngine): | |||
params = { | |||
"$format": "json", | |||
"$top": str(self.count), | |||
"Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'", | |||
"Query": "'\"" + query.replace('"', "") + "\"'", | |||
"Market": "'en-US'", | |||
"Adult": "'Off'", | |||
"Options": "'DisableLocationDetection'", | |||
"WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'", | |||
} | |||
result = self._open(url + urlencode(params)) | |||
result = self._open(url + urllib.parse.urlencode(params)) | |||
try: | |||
res = loads(result) | |||
res = json.loads(result) | |||
except ValueError: | |||
err = "Bing Error: JSON could not be decoded" | |||
raise SearchQueryError(err) | |||
raise exceptions.SearchQueryError("Bing Error: JSON could not be decoded") | |||
try: | |||
results = res["d"]["results"] | |||
@@ -134,13 +143,14 @@ class BingSearchEngine(_BaseSearchEngine): | |||
return [result["Url"] for result in results] | |||
class GoogleSearchEngine(_BaseSearchEngine): | |||
class GoogleSearchEngine(SearchEngine): | |||
"""A search engine interface with Google Search.""" | |||
name = "Google" | |||
def search(self, query: str) -> list[str]: | |||
"""Do a Google web search for *query*. | |||
""" | |||
Do a Google web search for *query*. | |||
Returns a list of URLs ranked by relevance (as determined by Google). | |||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
@@ -157,13 +167,13 @@ class GoogleSearchEngine(_BaseSearchEngine): | |||
"fields": "items(link)", | |||
} | |||
result = self._open(url + urlencode(params)) | |||
result = self._open(url + urllib.parse.urlencode(params)) | |||
try: | |||
res = loads(result) | |||
res = json.loads(result) | |||
except ValueError: | |||
err = "Google Error: JSON could not be decoded" | |||
raise SearchQueryError(err) | |||
raise exceptions.SearchQueryError(err) | |||
try: | |||
return [item["link"] for item in res["items"]] | |||
@@ -171,7 +181,7 @@ class GoogleSearchEngine(_BaseSearchEngine): | |||
return [] | |||
class YandexSearchEngine(_BaseSearchEngine): | |||
class YandexSearchEngine(SearchEngine): | |||
"""A search engine interface with Yandex Search.""" | |||
name = "Yandex" | |||
@@ -181,7 +191,8 @@ class YandexSearchEngine(_BaseSearchEngine): | |||
return ["lxml.etree"] | |||
def search(self, query: str) -> list[str]: | |||
"""Do a Yandex web search for *query*. | |||
""" | |||
Do a Yandex web search for *query*. | |||
Returns a list of URLs ranked by relevance (as determined by Yandex). | |||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
@@ -201,17 +212,51 @@ class YandexSearchEngine(_BaseSearchEngine): | |||
"groupby": f"mode=flat.groups-on-page={self.count}", | |||
} | |||
result = self._open(url + urlencode(params)) | |||
result = self._open(url + urllib.parse.urlencode(params)) | |||
try: | |||
data = lxml.etree.fromstring(result) # type: ignore | |||
data = lxml.etree.fromstring(result) | |||
return [elem.text for elem in data.xpath(".//url")] | |||
except lxml.etree.Error as exc: | |||
raise SearchQueryError("Yandex XML parse error: " + str(exc)) | |||
raise exceptions.SearchQueryError(f"Yandex XML parse error: {exc}") | |||
SEARCH_ENGINES = { | |||
SEARCH_ENGINES: dict[str, type[SearchEngine]] = { | |||
"Bing": BingSearchEngine, | |||
"Google": GoogleSearchEngine, | |||
"Yandex": YandexSearchEngine, | |||
} | |||
def get_search_engine( | |||
search_config: dict[str, Any], headers: list[tuple[str, str]] | |||
) -> SearchEngine: | |||
"""Return a function that can be called to do web searches. | |||
The function takes one argument, a search query, and returns a list of URLs, ranked | |||
by importance. The underlying logic depends on the *engine* argument within our | |||
config; for example, if *engine* is "Yahoo! BOSS", we'll use YahooBOSSSearchEngine | |||
for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to | |||
us, and UnsupportedSearchEngineError if we are missing a required package or | |||
module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine = search_config["engine"] | |||
if engine not in SEARCH_ENGINES: | |||
raise exceptions.UnknownSearchEngineError(engine) | |||
klass = SEARCH_ENGINES[engine] | |||
credentials = search_config["credentials"] | |||
opener = urllib.request.build_opener() | |||
opener.addheaders = headers | |||
for dep in klass.requirements(): | |||
try: | |||
__import__(dep).__name__ | |||
except (ModuleNotFoundError, AttributeError): | |||
e = "Missing a required dependency ({}) for the {} engine" | |||
e = e.format(dep, engine) | |||
raise exceptions.UnsupportedSearchEngineError(e) | |||
return klass(credentials, opener) |
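# Minimal usage sketch (the credential value is a placeholder; the headers
# mirror the ones CopyvioChecker builds):
#
#     config = {"engine": "Bing", "credentials": {"key": "..."}}
#     headers = [("User-Agent", "EarwigBot"), ("Accept-Encoding", "gzip")]
#     searcher = get_search_engine(config, headers)
#     urls = searcher.search("a distinctive sentence from the article")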
@@ -18,59 +18,61 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import annotations | |||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||
import base64 | |||
import collections | |||
import dataclasses | |||
import functools | |||
import gzip | |||
import io | |||
import logging | |||
import math | |||
import queue | |||
import struct | |||
import threading | |||
import time | |||
import urllib.parse | |||
from collections import deque | |||
from gzip import GzipFile | |||
import urllib.request | |||
from collections.abc import Callable, Container | |||
from dataclasses import dataclass | |||
from http.client import HTTPException | |||
from io import StringIO | |||
from logging import getLogger | |||
from math import log | |||
from queue import Empty, Queue | |||
from struct import error as struct_error | |||
from threading import Lock, Thread | |||
from typing import Any | |||
from urllib.error import URLError | |||
from urllib.request import Request, build_opener | |||
from earwigbot import importer | |||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | |||
from earwigbot.wiki.copyvios.markov import ( | |||
DEFAULT_DEGREE, | |||
MarkovChain, | |||
MarkovChainIntersection, | |||
MarkovChainUnion, | |||
) | |||
from earwigbot.wiki.copyvios.parsers import get_parser | |||
from earwigbot.wiki.copyvios.parsers import ParserArgs, SourceParser, get_parser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
tldextract = importer.new("tldextract") | |||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||
INCLUDE_THRESHOLD = 0.15 | |||
_MAX_REDIRECTS = 3 | |||
_MAX_RAW_SIZE = 20 * 1024**2 | |||
_is_globalized = False | |||
_global_queues = None | |||
_global_workers = [] | |||
_global_queues: _CopyvioQueues | None = None | |||
_global_workers: list[_CopyvioWorker] = [] | |||
_OpenedURL = collections.namedtuple("_OpenedURL", ["content", "parser_class"]) | |||
def globalize(num_workers: int = 8) -> None: | |||
""" | |||
Cause all copyvio checks to be done by one global set of workers. | |||
def globalize(num_workers=8): | |||
"""Cause all copyvio checks to be done by one global set of workers. | |||
This is useful when checks are being done through a web interface where | |||
large numbers of simultaneous requests could be problematic. The global | |||
workers are spawned when the function is called, run continuously, and | |||
intelligently handle multiple checks. | |||
This is useful when checks are being done through a web interface where large | |||
numbers of simultaneous requests could be problematic. The global workers are | |||
spawned when the function is called, run continuously, and intelligently handle | |||
multiple checks. | |||
This function is not thread-safe and should only be called when no checks | |||
are being done. It has no effect if it has already been called. | |||
This function is not thread-safe and should only be called when no checks are being | |||
done. It has no effect if it has already been called. | |||
""" | |||
global _is_globalized, _global_queues | |||
if _is_globalized: | |||
@@ -84,19 +86,20 @@ def globalize(num_workers=8): | |||
_is_globalized = True | |||
def localize(): | |||
def localize() -> None: | |||
"""Return to using page-specific workers for copyvio checks. | |||
This disables changes made by :func:`globalize`, including stopping the | |||
global worker threads. | |||
This disables changes made by :func:`globalize`, including stopping the global | |||
worker threads. | |||
This function is not thread-safe and should only be called when no checks | |||
are being done. | |||
This function is not thread-safe and should only be called when no checks are | |||
being done. | |||
""" | |||
global _is_globalized, _global_queues, _global_workers | |||
if not _is_globalized: | |||
return | |||
assert _global_queues is not None | |||
for i in range(len(_global_workers)): | |||
_global_queues.unassigned.put((StopIteration, None)) | |||
_global_queues = None | |||
@@ -104,30 +107,50 @@ def localize(): | |||
_is_globalized = False | |||
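# --- Editor's usage sketch (not part of the diff): the intended globalize()/localize()
# lifecycle for a long-running web frontend; the worker count and the serving loop are
# illustrative.
from earwigbot.wiki.copyvios import globalize, localize

globalize(num_workers=8)   # spawn the shared pool once, before any checks start
try:
    ...                    # serve copyvio checks; the same workers handle every check
finally:
    localize()             # stop the global workers once no checks are running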
@dataclass(frozen=True) | |||
class OpenedURL: | |||
content: bytes | |||
parser_class: type[SourceParser] | |||
SourceQueue = collections.deque[CopyvioSource] | |||
UnassignedQueue = queue.Queue[ | |||
tuple[str, SourceQueue] | tuple[type[StopIteration], None] | |||
] | |||
@dataclass(frozen=True) | |||
class _CopyvioQueues: | |||
"""Stores data necessary to maintain the various queues during a check.""" | |||
def __init__(self): | |||
self.lock = Lock() | |||
self.sites = {} | |||
self.unassigned = Queue() | |||
lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) | |||
sites: dict[str, SourceQueue] = dataclasses.field(default_factory=dict) | |||
unassigned: UnassignedQueue = dataclasses.field(default_factory=queue.Queue) | |||
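# --- Editor's note (not part of the diff): items on `unassigned` are either a
# ("registered.domain", deque-of-CopyvioSource) pair that a worker claims and drains,
# or the (StopIteration, None) sentinel that localize() and CopyvioWorkspace.wait()
# push to shut a worker down.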
class _CopyvioWorker: | |||
"""A multithreaded URL opener/parser instance.""" | |||
def __init__(self, name, queues, until=None): | |||
def __init__( | |||
self, name: str, queues: _CopyvioQueues, until: float | None = None | |||
) -> None: | |||
self._name = name | |||
self._queues = queues | |||
self._until = until | |||
self._site = None | |||
self._queue = None | |||
self._search_config = None | |||
self._opener = build_opener() | |||
self._logger = getLogger("earwigbot.wiki.cvworker." + name) | |||
self._site: str | None = None | |||
self._queue: SourceQueue | None = None | |||
self._search_config: dict[str, Any] | None = None | |||
self._opener = urllib.request.build_opener() | |||
self._logger = logging.getLogger("earwigbot.wiki.cvworker." + name) | |||
def _try_map_proxy_url(self, url, parsed, extra_headers, is_error=False): | |||
def _try_map_proxy_url( | |||
self, | |||
url: str, | |||
parsed: urllib.parse.ParseResult, | |||
extra_headers: dict[str, str], | |||
is_error: bool = False, | |||
) -> tuple[str, bool]: | |||
if not self._search_config or "proxies" not in self._search_config: | |||
return url, False | |||
for proxy_info in self._search_config["proxies"]: | |||
@@ -152,17 +175,20 @@ class _CopyvioWorker: | |||
return url, True | |||
return url, False | |||
def _open_url_raw(self, url, timeout=5, allow_content_types=None): | |||
def _open_url_raw( | |||
self, | |||
url: str, | |||
timeout: float = 5, | |||
allow_content_types: Container[str] | None = None, | |||
) -> OpenedURL | None: | |||
"""Open a URL, without parsing it. | |||
None will be returned for URLs that cannot be read for whatever reason. | |||
""" | |||
parsed = urllib.parse.urlparse(url) | |||
if not isinstance(url, str): | |||
url = url.encode("utf8") | |||
extra_headers = {} | |||
extra_headers: dict[str, str] = {} | |||
url, _ = self._try_map_proxy_url(url, parsed, extra_headers) | |||
request = Request(url, headers=extra_headers) | |||
request = urllib.request.Request(url, headers=extra_headers) | |||
try: | |||
response = self._opener.open(request, timeout=timeout) | |||
except (OSError, URLError, HTTPException, ValueError): | |||
@@ -170,14 +196,14 @@ class _CopyvioWorker: | |||
url, parsed, extra_headers, is_error=True | |||
) | |||
if not remapped: | |||
self._logger.exception("Failed to fetch URL: %s", url) | |||
self._logger.exception(f"Failed to fetch URL: {url}") | |||
return None | |||
self._logger.info("Failed to fetch URL, trying proxy remap: %s", url) | |||
request = Request(url, headers=extra_headers) | |||
self._logger.info(f"Failed to fetch URL, trying proxy remap: {url}") | |||
request = urllib.request.Request(url, headers=extra_headers) | |||
try: | |||
response = self._opener.open(request, timeout=timeout) | |||
except (OSError, URLError, HTTPException, ValueError): | |||
self._logger.exception("Failed to fetch URL after proxy remap: %s", url) | |||
self._logger.exception(f"Failed to fetch URL after proxy remap: {url}") | |||
return None | |||
try: | |||
@@ -193,7 +219,7 @@ class _CopyvioWorker: | |||
): | |||
return None | |||
if not parser_class: | |||
parser_class = get_parser("text/plain") | |||
parser_class = get_parser() | |||
if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024**2: | |||
return None | |||
@@ -207,28 +233,27 @@ class _CopyvioWorker: | |||
return None | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(content) | |||
gzipper = GzipFile(fileobj=stream) | |||
stream = io.BytesIO(content) | |||
gzipper = gzip.GzipFile(fileobj=stream) | |||
try: | |||
content = gzipper.read() | |||
except (OSError, struct_error): | |||
except (OSError, struct.error): | |||
return None | |||
if len(content) > _MAX_RAW_SIZE: | |||
return None | |||
return _OpenedURL(content, parser_class) | |||
return OpenedURL(content, parser_class) | |||
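# --- Editor's sketch (not part of the diff): the gzip branch above in isolation, using
# the standard-library round trip.
import gzip
import io

compressed = gzip.compress(b"hello copyvio worker")
assert gzip.GzipFile(fileobj=io.BytesIO(compressed)).read() == b"hello copyvio worker"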
def _open_url(self, source, redirects=0): | |||
def _open_url(self, source: CopyvioSource, redirects: int = 0) -> str | None: | |||
"""Open a URL and return its parsed content, or None. | |||
First, we will decompress the content if the headers contain "gzip" as | |||
its content encoding. Then, we will return the content stripped using | |||
an HTML parser if the headers indicate it is HTML, or return the | |||
content directly if it is plain text. If we don't understand the | |||
content type, we'll return None. | |||
First, we will decompress the content if the headers contain "gzip" as its | |||
content encoding. Then, we will return the content stripped using an HTML | |||
parser if the headers indicate it is HTML, or return the content directly if it | |||
is plain text. If we don't understand the content type, we'll return None. | |||
If a URLError was raised while opening the URL or an IOError was raised | |||
while decompressing, None will be returned. | |||
If a URLError was raised while opening the URL or an IOError was raised while | |||
decompressing, None will be returned. | |||
""" | |||
self._search_config = source.search_config | |||
if source.headers: | |||
@@ -238,9 +263,9 @@ class _CopyvioWorker: | |||
if result is None: | |||
return None | |||
args = source.parser_args.copy() if source.parser_args else {} | |||
args: ParserArgs = source.parser_args.copy() if source.parser_args else {} | |||
args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) | |||
parser = result.parser_class(result.content, url=source.url, args=args) | |||
parser = result.parser_class(result.content, source.url, args=args) | |||
try: | |||
return parser.parse() | |||
except ParserRedirectError as exc: | |||
@@ -249,30 +274,31 @@ class _CopyvioWorker: | |||
source.url = exc.url.decode("utf8") | |||
return self._open_url(source, redirects=redirects + 1) | |||
def _acquire_new_site(self): | |||
def _acquire_new_site(self) -> None: | |||
"""Block for a new unassigned site queue.""" | |||
if self._until: | |||
timeout = self._until - time.time() | |||
if timeout <= 0: | |||
raise Empty | |||
raise queue.Empty() | |||
else: | |||
timeout = None | |||
self._logger.debug("Waiting for new site queue") | |||
site, queue = self._queues.unassigned.get(timeout=timeout) | |||
if site is StopIteration: | |||
site, q = self._queues.unassigned.get(timeout=timeout) | |||
if isinstance(site, type) and issubclass(site, StopIteration): | |||
raise StopIteration | |||
self._logger.debug(f"Acquired new site queue: {site}") | |||
self._site = site | |||
self._queue = queue | |||
self._queue = q | |||
def _dequeue(self): | |||
def _dequeue(self) -> CopyvioSource: | |||
"""Remove a source from one of the queues.""" | |||
if not self._site: | |||
self._acquire_new_site() | |||
assert self._site is not None | |||
assert self._queue is not None | |||
logmsg = "Fetching source URL from queue {0}" | |||
self._logger.debug(logmsg.format(self._site)) | |||
self._logger.debug(f"Fetching source URL from queue {self._site}") | |||
self._queues.lock.acquire() | |||
try: | |||
source = self._queue.popleft() | |||
@@ -294,11 +320,11 @@ class _CopyvioWorker: | |||
self._queues.lock.release() | |||
return source | |||
def _handle_once(self): | |||
"""Handle a single source from one of the queues.""" | |||
def _handle_once(self) -> bool: | |||
"""Handle a single source from one of the queues. Return if we should exit.""" | |||
try: | |||
source = self._dequeue() | |||
except Empty: | |||
except queue.Empty: | |||
self._logger.debug("Exiting: queue timed out") | |||
return False | |||
except StopIteration: | |||
@@ -320,12 +346,11 @@ class _CopyvioWorker: | |||
source.workspace.compare(source, chain) | |||
return True | |||
def _run(self): | |||
def _run(self) -> None: | |||
"""Main entry point for the worker thread. | |||
We will keep fetching URLs from the queues and handling them until | |||
either we run out of time, or we get an exit signal that the queue is | |||
now empty. | |||
We will keep fetching URLs from the queues and handling them until either we | |||
run out of time, or we get an exit signal that the queue is now empty. | |||
""" | |||
while True: | |||
try: | |||
@@ -335,9 +360,9 @@ class _CopyvioWorker: | |||
self._logger.exception("Uncaught exception in worker") | |||
time.sleep(5) # Delay if we get stuck in a busy loop | |||
def start(self): | |||
def start(self) -> None: | |||
"""Start the copyvio worker in a new thread.""" | |||
thread = Thread(target=self._run, name="cvworker-" + self._name) | |||
thread = threading.Thread(target=self._run, name="cvworker-" + self._name) | |||
thread.daemon = True | |||
thread.start() | |||
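# --- Editor's note (not part of the diff): daemon=True means worker threads never keep
# the interpreter alive on shutdown; the (StopIteration, None) sentinel above is what
# stops a worker cleanly while the process keeps running.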
@@ -347,20 +372,20 @@ class CopyvioWorkspace: | |||
def __init__( | |||
self, | |||
article, | |||
min_confidence, | |||
max_time, | |||
logger, | |||
headers, | |||
url_timeout=5, | |||
num_workers=8, | |||
short_circuit=True, | |||
parser_args=None, | |||
exclude_check=None, | |||
config=None, | |||
degree=5, | |||
): | |||
self.sources = [] | |||
article: MarkovChain, | |||
min_confidence: float, | |||
max_time: float, | |||
logger: logging.Logger, | |||
headers: list[tuple[str, str]], | |||
url_timeout: float = 5, | |||
num_workers: int = 8, | |||
short_circuit: bool = True, | |||
parser_args: ParserArgs | None = None, | |||
exclusion_callback: Callable[[str], bool] | None = None, | |||
config: dict[str, Any] | None = None, | |||
degree: int = DEFAULT_DEGREE, | |||
) -> None: | |||
self.sources: list[CopyvioSource] = [] | |||
self.finished = False | |||
self.possible_miss = False | |||
@@ -369,8 +394,8 @@ class CopyvioWorkspace: | |||
self._min_confidence = min_confidence | |||
self._start_time = time.time() | |||
self._until = (self._start_time + max_time) if max_time > 0 else None | |||
self._handled_urls = set() | |||
self._finish_lock = Lock() | |||
self._handled_urls: set[str] = set() | |||
self._finish_lock = threading.Lock() | |||
self._short_circuit = short_circuit | |||
self._source_args = { | |||
"workspace": self, | |||
@@ -379,10 +404,11 @@ class CopyvioWorkspace: | |||
"parser_args": parser_args, | |||
"search_config": config, | |||
} | |||
self._exclude_check = exclude_check | |||
self._exclusion_callback = exclusion_callback | |||
self._degree = degree | |||
if _is_globalized: | |||
assert _global_queues is not None | |||
self._queues = _global_queues | |||
else: | |||
self._queues = _CopyvioQueues() | |||
@@ -391,28 +417,27 @@ class CopyvioWorkspace: | |||
name = f"local-{id(self) % 10000:04}.{i}" | |||
_CopyvioWorker(name, self._queues, self._until).start() | |||
def _calculate_confidence(self, delta): | |||
def _calculate_confidence(self, delta: MarkovChainIntersection) -> float: | |||
"""Return the confidence of a violation as a float between 0 and 1.""" | |||
def conf_with_article_and_delta(article, delta): | |||
def conf_with_article_and_delta(article: float, delta: float) -> float: | |||
"""Calculate confidence using the article and delta chain sizes.""" | |||
# This piecewise function exhibits exponential growth until it | |||
# reaches the default "suspect" confidence threshold, at which | |||
# point it transitions to polynomial growth with a limit of 1 as | |||
# (delta / article) approaches 1. | |||
# This piecewise function exhibits exponential growth until it reaches the | |||
# default "suspect" confidence threshold, at which point it transitions to | |||
# polynomial growth with a limit of 1 as (delta / article) approaches 1. | |||
# A graph can be viewed here: https://goo.gl/mKPhvr | |||
ratio = delta / article | |||
if ratio <= 0.52763: | |||
return -log(1 - ratio) | |||
return -math.log(1 - ratio) | |||
else: | |||
return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009 | |||
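# --- Editor's worked example (not part of the diff): the two branches above at sample
# ratios, using the same constants.
import math
assert abs(-math.log(1 - 0.25) - 0.2877) < 1e-3                            # exponential branch
assert abs((-0.8939 * 0.75**2 + 1.8948 * 0.75 - 0.0009) - 0.9174) < 1e-3   # polynomial branch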
def conf_with_delta(delta): | |||
def conf_with_delta(delta: float) -> float: | |||
"""Calculate confidence using just the delta chain size.""" | |||
# This piecewise function was derived from experimental data using | |||
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9), | |||
# and (1000, 0.95), with a limit of 1 as delta approaches infinity. | |||
# A graph can be viewed here: https://goo.gl/lVl7or | |||
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9), and | |||
# (1000, 0.95), with a limit of 1 as delta approaches infinity. A graph can | |||
# be viewed here: https://goo.gl/lVl7or | |||
if delta <= 100: | |||
return delta / (delta + 100) | |||
elif delta <= 250: | |||
@@ -430,7 +455,7 @@ class CopyvioWorkspace: | |||
) | |||
) | |||
def _finish_early(self): | |||
def _finish_early(self) -> None: | |||
"""Finish handling links prematurely (if we've hit min_confidence).""" | |||
self._logger.debug("Confidence threshold met; skipping remaining sources") | |||
with self._queues.lock: | |||
@@ -438,7 +463,7 @@ class CopyvioWorkspace: | |||
source.skip() | |||
self.finished = True | |||
def enqueue(self, urls): | |||
def enqueue(self, urls: list[str]) -> None: | |||
"""Put a list of URLs into the various worker queues.""" | |||
for url in urls: | |||
with self._queues.lock: | |||
@@ -449,7 +474,7 @@ class CopyvioWorkspace: | |||
source = CopyvioSource(url=url, **self._source_args) | |||
self.sources.append(source) | |||
if self._exclude_check and self._exclude_check(url): | |||
if self._exclusion_callback and self._exclusion_callback(url): | |||
self._logger.debug(f"enqueue(): exclude {url}") | |||
source.excluded = True | |||
source.skip() | |||
@@ -460,32 +485,37 @@ class CopyvioWorkspace: | |||
continue | |||
try: | |||
import tldextract | |||
key = tldextract.extract(url).registered_domain | |||
except ImportError: # Fall back on very naive method | |||
except ModuleNotFoundError: # Fall back on very naive method | |||
from urllib.parse import urlparse | |||
key = ".".join(urlparse(url).netloc.split(".")[-2:]) | |||
logmsg = "enqueue(): {0} {1} -> {2}" | |||
logmsg = f"enqueue(): %s {key} -> {url}" | |||
if key in self._queues.sites: | |||
self._logger.debug(logmsg.format("append", key, url)) | |||
self._logger.debug(logmsg % "append") | |||
self._queues.sites[key].append(source) | |||
else: | |||
self._logger.debug(logmsg.format("new", key, url)) | |||
self._queues.sites[key] = queue = deque() | |||
queue.append(source) | |||
self._queues.unassigned.put((key, queue)) | |||
self._logger.debug(logmsg % "new") | |||
q: SourceQueue = collections.deque() | |||
q.append(source) | |||
self._queues.sites[key] = q | |||
self._queues.unassigned.put((key, q)) | |||
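# --- Editor's note (not part of the diff): what the grouping key above looks like.
import urllib.parse

url = "https://en.wikipedia.org/wiki/Example"
naive_key = ".".join(urllib.parse.urlparse(url).netloc.split(".")[-2:])
assert naive_key == "wikipedia.org"  # tldextract's registered_domain agrees here, but
# the naive fallback breaks for multi-label suffixes, e.g. "news.bbc.co.uk" -> "co.uk"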
def compare(self, source, source_chain): | |||
def compare(self, source: CopyvioSource, source_chain: MarkovChain | None) -> None: | |||
"""Compare a source to the article; call _finish_early if necessary.""" | |||
if source_chain: | |||
delta = MarkovChainIntersection(self._article, source_chain) | |||
conf = self._calculate_confidence(delta) | |||
else: | |||
delta = None | |||
conf = 0.0 | |||
self._logger.debug(f"compare(): {source.url} -> {conf}") | |||
with self._finish_lock: | |||
if source_chain: | |||
assert delta is not None | |||
source.update(conf, source_chain, delta) | |||
source.finish_work() | |||
if not self.finished and conf >= self._min_confidence: | |||
@@ -494,7 +524,7 @@ class CopyvioWorkspace: | |||
else: | |||
self.finished = True | |||
def wait(self): | |||
def wait(self) -> None: | |||
"""Wait for the workers to finish handling the sources.""" | |||
self._logger.debug(f"Waiting on {len(self.sources)} sources") | |||
for source in self.sources: | |||
@@ -505,7 +535,7 @@ class CopyvioWorkspace: | |||
for i in range(self._num_workers): | |||
self._queues.unassigned.put((StopIteration, None)) | |||
def get_result(self, num_queries=0): | |||
def get_result(self, num_queries: int = 0) -> CopyvioCheckResult: | |||
"""Return a CopyvioCheckResult containing the results of this check.""" | |||
self.sources.sort( | |||
key=lambda s: ( | |||
@@ -35,14 +35,14 @@ import mwparserfromhell | |||
from earwigbot import exceptions | |||
from earwigbot.exceptions import APIError | |||
from earwigbot.wiki.copyvios import CopyvioMixIn | |||
from earwigbot.wiki.copyvios import DEFAULT_DEGREE, CopyvioChecker, CopyvioCheckResult | |||
if typing.TYPE_CHECKING: | |||
from earwigbot.wiki.site import Site | |||
from earwigbot.wiki.user import User | |||
class Page(CopyvioMixIn): | |||
class Page: | |||
""" | |||
**EarwigBot: Wiki Toolset: Page** | |||
@@ -110,7 +110,6 @@ class Page(CopyvioMixIn): | |||
__init__() will not do any API queries, but it will use basic namespace logic | |||
to determine our namespace ID and if we are a talkpage. | |||
""" | |||
super().__init__(site) | |||
self._site = site | |||
self._title = title.strip() | |||
self._follow_redirects = self._keep_following = follow_redirects | |||
@@ -873,3 +872,108 @@ class Page(CopyvioMixIn): | |||
return False | |||
return True | |||
def copyvio_check( | |||
self, | |||
min_confidence: float = 0.75, | |||
max_queries: int = 15, | |||
max_time: float = -1, | |||
no_searches: bool = False, | |||
no_links: bool = False, | |||
short_circuit: bool = True, | |||
degree: int = DEFAULT_DEGREE, | |||
) -> CopyvioCheckResult: | |||
""" | |||
Check the page for copyright violations. | |||
Returns a :class:`.CopyvioCheckResult` object with information on the results | |||
of the check. | |||
*min_confidence* is the minimum amount of confidence we must have in the | |||
similarity between a source text and the article in order for us to consider it | |||
a suspected violation. This is a number between 0 and 1. | |||
*max_queries* is self-explanatory; we will never make more than this number of | |||
queries in a given check. | |||
*max_time* can be set to prevent copyvio checks from taking longer than a set | |||
amount of time (generally around a minute), which can be useful if checks are | |||
called through a web server with timeouts. We will stop checking new URLs as | |||
soon as this limit is reached. | |||
Setting *no_searches* to ``True`` will cause only URLs in the wikitext of the | |||
page to be checked; no search engine queries will be made. Setting *no_links* | |||
to ``True`` will cause the opposite to happen: URLs in the wikitext will be | |||
ignored, and only search engine queries will be made. Setting both of these to | |||
``True`` is pointless. | |||
Normally, the checker will short-circuit if it finds a URL that meets | |||
*min_confidence*. This causes it to skip any remaining URLs | |||
and web queries, but setting *short_circuit* to ``False`` will prevent this. | |||
The *degree* controls the n-gram word size used in comparing similarity. It | |||
should usually be a number between 3 and 5. | |||
Raises :exc:`.CopyvioCheckError` or subclasses | |||
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on errors. | |||
""" | |||
self._logger.info(f"Starting copyvio check for [[{self.title}]]") | |||
checker = CopyvioChecker( | |||
self, | |||
min_confidence=min_confidence, | |||
max_time=max_time, | |||
degree=degree, | |||
logger=self._logger, | |||
) | |||
result = checker.run_check( | |||
max_queries=max_queries, | |||
no_searches=no_searches, | |||
no_links=no_links, | |||
short_circuit=short_circuit, | |||
) | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
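# --- Editor's usage sketch (not part of the diff): how copyvio_check() might be called
# from bot code. `site` is assumed to be a configured Site object; the title, confidence,
# and time limits are illustrative.
page = site.get_page("Example article")
result = page.copyvio_check(min_confidence=0.75, max_queries=10, max_time=45)
print(result.get_log_message(page.title))  # same summary the checker logs above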
def copyvio_compare( | |||
self, | |||
urls: list[str] | str, | |||
min_confidence: float = 0.75, | |||
max_time: float = 30, | |||
degree: int = DEFAULT_DEGREE, | |||
) -> CopyvioCheckResult: | |||
""" | |||
Check the page, like :py:meth:`copyvio_check`, against specific URLs. | |||
This is essentially a reduced version of :meth:`copyvio_check` - a copyvio | |||
comparison is made using Markov chains and the result is returned in a | |||
:class:`.CopyvioCheckResult` object - but without using a search engine, since | |||
the suspected "violated" URL is supplied from the start. | |||
One use case is to generate a result when the URL is retrieved from a cache, | |||
like the one used in EarwigBot's Toolforge site. After a search is done, the | |||
resulting URL is stored in a cache for 72 hours so future checks against that | |||
page will not require another set of time-and-money-consuming search engine | |||
queries. However, the comparison itself (which includes the article's and the | |||
source's content) cannot be stored for data retention reasons, so a fresh | |||
comparison is made using this function. | |||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` nor | |||
:exc:`.SearchQueryError` will be raised. | |||
""" | |||
if not isinstance(urls, list): | |||
urls = [urls] | |||
self._logger.info( | |||
f"Starting copyvio compare for [[{self.title}]] against {', '.join(urls)}" | |||
) | |||
checker = CopyvioChecker( | |||
self, | |||
min_confidence=min_confidence, | |||
max_time=max_time, | |||
degree=degree, | |||
logger=self._logger, | |||
) | |||
result = checker.run_compare(urls) | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result |
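# --- Editor's usage sketch (not part of the diff): re-running a comparison against a
# URL pulled from a cache, as the docstring above describes. `site`, the title, and the
# URL are illustrative.
page = site.get_page("Example article")
result = page.copyvio_compare("https://example.com/cached-source", max_time=30)
print(result.get_log_message(page.title))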