@@ -59,10 +59,6 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[tool.pyright]
exclude = [
    # TODO
    "src/earwigbot/wiki/copyvios"
]
pythonVersion = "3.11"
venvPath = "."
venv = "venv"
@@ -18,208 +18,142 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
__all__ = [ | |||||
"DEFAULT_DEGREE", | |||||
"CopyvioChecker", | |||||
"CopyvioCheckResult", | |||||
"globalize", | |||||
"localize", | |||||
] | |||||
import functools | |||||
import logging | |||||
import time | import time | ||||
from urllib.request import build_opener | |||||
from collections.abc import Callable | |||||
from earwigbot import exceptions | |||||
from earwigbot.wiki.copyvios.markov import MarkovChain | |||||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser | |||||
from earwigbot.wiki.copyvios.search import SEARCH_ENGINES | |||||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||||
from earwigbot.wiki.copyvios.markov import DEFAULT_DEGREE, MarkovChain | |||||
from earwigbot.wiki.copyvios.parsers import ArticleParser, ParserArgs | |||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||||
from earwigbot.wiki.copyvios.search import SearchEngine, get_search_engine | |||||
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize | from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize | ||||
from earwigbot.wiki.page import Page | |||||
__all__ = ["CopyvioMixIn", "globalize", "localize"] | |||||
class CopyvioMixIn: | |||||
class CopyvioChecker: | |||||
""" | """ | ||||
**EarwigBot: Wiki Toolset: Copyright Violation MixIn** | |||||
Manages the lifecycle of a copyvio check or comparison. | |||||
This is a mixin that provides two public methods, :py:meth:`copyvio_check` | |||||
and :py:meth:`copyvio_compare`. The former checks the page for copyright | |||||
violations using a search engine API, and the latter compares the page | |||||
against a given URL. Credentials for the search engine API are stored in | |||||
the :py:class:`~earwigbot.wiki.site.Site`'s config. | |||||
Created by :py:class:`~earwigbot.wiki.page.Page` and handles the implementation | |||||
details of running a check. | |||||
""" | """ | ||||
def __init__(self, site): | |||||
self._search_config = site._search_config | |||||
self._exclusions_db = self._search_config.get("exclusions_db") | |||||
self._addheaders = [ | |||||
("User-Agent", site.user_agent), | |||||
def __init__( | |||||
self, | |||||
page: Page, | |||||
*, | |||||
min_confidence: float = 0.75, | |||||
max_time: float = 30, | |||||
degree: int = DEFAULT_DEGREE, | |||||
logger: logging.Logger | None = None, | |||||
) -> None: | |||||
self._page = page | |||||
self._site = page.site | |||||
self._config = page.site._search_config | |||||
self._min_confidence = min_confidence | |||||
self._max_time = max_time | |||||
self._degree = degree | |||||
self._logger = logger or logging.getLogger("earwigbot.wiki") | |||||
self._headers = [ | |||||
("User-Agent", page.site.user_agent), | |||||
("Accept-Encoding", "gzip"), | ("Accept-Encoding", "gzip"), | ||||
] | ] | ||||
def _get_search_engine(self): | |||||
"""Return a function that can be called to do web searches. | |||||
The function takes one argument, a search query, and returns a list of | |||||
URLs, ranked by importance. The underlying logic depends on the | |||||
*engine* argument within our config; for example, if *engine* is | |||||
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. | |||||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||||
required package or module, like oauth2 for "Yahoo! BOSS". | |||||
""" | |||||
engine = self._search_config["engine"] | |||||
if engine not in SEARCH_ENGINES: | |||||
raise exceptions.UnknownSearchEngineError(engine) | |||||
klass = SEARCH_ENGINES[engine] | |||||
credentials = self._search_config["credentials"] | |||||
opener = build_opener() | |||||
opener.addheaders = self._addheaders | |||||
for dep in klass.requirements(): | |||||
try: | |||||
__import__(dep).__name__ | |||||
except (ModuleNotFoundError, AttributeError): | |||||
e = "Missing a required dependency ({}) for the {} engine" | |||||
e = e.format(dep, engine) | |||||
raise exceptions.UnsupportedSearchEngineError(e) | |||||
return klass(credentials, opener) | |||||
def copyvio_check( | |||||
self, | |||||
min_confidence=0.75, | |||||
max_queries=15, | |||||
max_time=-1, | |||||
no_searches=False, | |||||
no_links=False, | |||||
short_circuit=True, | |||||
degree=5, | |||||
): | |||||
"""Check the page for copyright violations. | |||||
Returns a :class:`.CopyvioCheckResult` object with information on the | |||||
results of the check. | |||||
*min_confidence* is the minimum amount of confidence we must have in | |||||
the similarity between a source text and the article in order for us to | |||||
consider it a suspected violation. This is a number between 0 and 1. | |||||
*max_queries* is self-explanatory; we will never make more than this | |||||
number of queries in a given check. | |||||
*max_time* can be set to prevent copyvio checks from taking longer than | |||||
a set amount of time (generally around a minute), which can be useful | |||||
if checks are called through a web server with timeouts. We will stop | |||||
checking new URLs as soon as this limit is reached. | |||||
Setting *no_searches* to ``True`` will cause only URLs in the wikitext | |||||
of the page to be checked; no search engine queries will be made. | |||||
Setting *no_links* to ``True`` will cause the opposite to happen: URLs | |||||
in the wikitext will be ignored; search engine queries will be made | |||||
only. Setting both of these to ``True`` is pointless. | |||||
Normally, the checker will short-circuit if it finds a URL that meets | |||||
*min_confidence*. This behavior normally causes it to skip any | |||||
remaining URLs and web queries, but setting *short_circuit* to | |||||
``False`` will prevent this. | |||||
Raises :exc:`.CopyvioCheckError` or subclasses | |||||
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on | |||||
errors. | |||||
""" | |||||
log = "Starting copyvio check for [[{0}]]" | |||||
self._logger.info(log.format(self.title)) | |||||
searcher = self._get_search_engine() | |||||
parser = ArticleTextParser( | |||||
self.get(), | |||||
args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang}, | |||||
self._parser = ArticleParser( | |||||
self._page.get(), | |||||
lang=self._site.lang, | |||||
nltk_dir=self._config["nltk_dir"], | |||||
) | ) | ||||
article = MarkovChain(parser.strip(), degree=degree) | |||||
parser_args = {} | |||||
self._article = MarkovChain(self._parser.strip(), degree=self._degree) | |||||
if self._exclusions_db: | |||||
self._exclusions_db.sync(self.site.name) | |||||
@functools.cached_property | |||||
def _searcher(self) -> SearchEngine: | |||||
return get_search_engine(self._config, self._headers) | |||||
def exclude(u): | |||||
return self._exclusions_db.check(self.site.name, u) | |||||
@property | |||||
def _exclusions_db(self) -> ExclusionsDB | None: | |||||
return self._config.get("exclusions_db") | |||||
parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(self) | |||||
else: | |||||
exclude = None | |||||
def _get_exclusion_callback(self) -> Callable[[str], bool] | None: | |||||
if not self._exclusions_db: | |||||
return None | |||||
return functools.partial(self._exclusions_db.check, self._site.name) | |||||
def run_check( | |||||
self, | |||||
*, | |||||
max_queries: int = 15, | |||||
no_searches: bool = False, | |||||
no_links: bool = False, | |||||
short_circuit: bool = True, | |||||
) -> CopyvioCheckResult: | |||||
parser_args: ParserArgs = {} | |||||
if self._exclusions_db: | |||||
self._exclusions_db.sync(self._site.name) | |||||
mirror_hints = self._exclusions_db.get_mirror_hints(self._page) | |||||
parser_args["mirror_hints"] = mirror_hints | |||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, | |||||
min_confidence, | |||||
max_time, | |||||
self._logger, | |||||
self._addheaders, | |||||
self._article, | |||||
min_confidence=self._min_confidence, | |||||
max_time=self._max_time, | |||||
logger=self._logger, | |||||
headers=self._headers, | |||||
short_circuit=short_circuit, | short_circuit=short_circuit, | ||||
parser_args=parser_args, | parser_args=parser_args, | ||||
exclude_check=exclude, | |||||
config=self._search_config, | |||||
degree=degree, | |||||
exclusion_callback=self._get_exclusion_callback(), | |||||
config=self._config, | |||||
degree=self._degree, | |||||
) | ) | ||||
if article.size < 20: # Auto-fail very small articles | |||||
result = workspace.get_result() | |||||
self._logger.info(result.get_log_message(self.title)) | |||||
return result | |||||
if self._article.size < 20: # Auto-fail very small articles | |||||
return workspace.get_result() | |||||
if not no_links: | if not no_links: | ||||
workspace.enqueue(parser.get_links()) | |||||
workspace.enqueue(self._parser.get_links()) | |||||
num_queries = 0 | num_queries = 0 | ||||
if not no_searches: | if not no_searches: | ||||
chunks = parser.chunk(max_queries) | |||||
chunks = self._parser.chunk(max_queries) | |||||
for chunk in chunks: | for chunk in chunks: | ||||
if short_circuit and workspace.finished: | if short_circuit and workspace.finished: | ||||
workspace.possible_miss = True | workspace.possible_miss = True | ||||
break | break | ||||
log = "[[{0}]] -> querying {1} for {2!r}" | |||||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||||
workspace.enqueue(searcher.search(chunk)) | |||||
self._logger.debug( | |||||
f"[[{self._page.title}]] -> querying {self._searcher.name} " | |||||
f"for {chunk!r}" | |||||
) | |||||
workspace.enqueue(self._searcher.search(chunk)) | |||||
num_queries += 1 | num_queries += 1 | ||||
time.sleep(1) | |||||
time.sleep(1) # TODO: Check whether this is needed | |||||
workspace.wait() | workspace.wait() | ||||
result = workspace.get_result(num_queries) | |||||
self._logger.info(result.get_log_message(self.title)) | |||||
return result | |||||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||||
This is essentially a reduced version of :meth:`copyvio_check` - a | |||||
copyvio comparison is made using Markov chains and the result is
returned in a :class:`.CopyvioCheckResult` object - but without using a | |||||
search engine, since the suspected "violated" URL is supplied from the | |||||
start. | |||||
Its primary use is to generate a result when the URL is retrieved from | |||||
a cache, like the one used in EarwigBot's Tool Labs site. After a | |||||
search is done, the resulting URL is stored in a cache for 72 hours so | |||||
future checks against that page will not require another set of | |||||
time-and-money-consuming search engine queries. However, the comparison | |||||
itself (which includes the article's and the source's content) cannot | |||||
be stored for data retention reasons, so a fresh comparison is made | |||||
using this function. | |||||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | |||||
nor :exc:`.SearchQueryError` will be raised. | |||||
""" | |||||
if not isinstance(urls, list): | |||||
urls = [urls] | |||||
log = "Starting copyvio compare for [[{0}]] against {1}" | |||||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||||
return workspace.get_result(num_queries) | |||||
def run_compare(self, urls: list[str]) -> CopyvioCheckResult: | |||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, | |||||
min_confidence, | |||||
max_time, | |||||
self._logger, | |||||
self._addheaders, | |||||
max_time, | |||||
self._article, | |||||
min_confidence=self._min_confidence, | |||||
max_time=self._max_time, | |||||
logger=self._logger, | |||||
headers=self._headers, | |||||
url_timeout=self._max_time, | |||||
num_workers=min(len(urls), 8), | num_workers=min(len(urls), 8), | ||||
short_circuit=False, | short_circuit=False, | ||||
config=self._search_config, | |||||
degree=degree, | |||||
config=self._config, | |||||
degree=self._degree, | |||||
) | ) | ||||
workspace.enqueue(urls) | workspace.enqueue(urls) | ||||
workspace.wait() | workspace.wait() | ||||
result = workspace.get_result() | |||||
self._logger.info(result.get_log_message(self.title)) | |||||
return result | |||||
return workspace.get_result() |
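A minimal usage sketch of the new API may help readers of this hunk; it is not part of the diff. The `page` argument stands for any :py:class:`~earwigbot.wiki.page.Page` instance, and it assumes `CopyvioChecker` is importable from ``earwigbot.wiki.copyvios``, as the ``__all__`` above implies.

from earwigbot.wiki.copyvios import CopyvioChecker

def check_for_copyvios(page, max_queries=15):
    # Full check: search-engine queries plus links found in the wikitext.
    checker = CopyvioChecker(page, min_confidence=0.75, max_time=30)
    return checker.run_check(max_queries=max_queries, short_circuit=True)

def compare_against_cached_urls(page, urls):
    # Reduced check against known candidate URLs; no search engine involved.
    checker = CopyvioChecker(page, max_time=30)
    return checker.run_compare(urls)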
@@ -18,15 +18,24 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
from __future__ import annotations | |||||
__all__ = ["ExclusionsDB"] | |||||
import logging | |||||
import re | import re | ||||
import sqlite3 | import sqlite3 | ||||
import threading | import threading | ||||
import time | import time | ||||
import typing | |||||
import urllib.parse | import urllib.parse | ||||
from earwigbot import exceptions | from earwigbot import exceptions | ||||
__all__ = ["ExclusionsDB"] | |||||
if typing.TYPE_CHECKING: | |||||
from earwigbot.wiki.page import Page | |||||
from earwigbot.wiki.site import Site | |||||
from earwigbot.wiki.sitesdb import SitesDB | |||||
DEFAULT_SOURCES = { | DEFAULT_SOURCES = { | ||||
"all": [ # Applies to all, but located on enwiki | "all": [ # Applies to all, but located on enwiki | ||||
@@ -52,26 +61,28 @@ class ExclusionsDB: | |||||
""" | """ | ||||
**EarwigBot: Wiki Toolset: Exclusions Database Manager** | **EarwigBot: Wiki Toolset: Exclusions Database Manager** | ||||
Controls the :file:`exclusions.db` file, which stores URLs excluded from | |||||
copyright violation checks on account of being known mirrors, for example. | |||||
Controls the :file:`exclusions.db` file, which stores URLs excluded from copyright | |||||
violation checks on account of being known mirrors, for example. | |||||
""" | """ | ||||
def __init__(self, sitesdb, dbfile, logger): | |||||
def __init__(self, sitesdb: SitesDB, dbfile: str, logger: logging.Logger) -> None: | |||||
self._sitesdb = sitesdb | self._sitesdb = sitesdb | ||||
self._dbfile = dbfile | self._dbfile = dbfile | ||||
self._logger = logger | self._logger = logger | ||||
self._db_access_lock = threading.Lock() | self._db_access_lock = threading.Lock() | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the ExclusionsDB.""" | """Return the canonical string representation of the ExclusionsDB.""" | ||||
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})" | |||||
return res.format(self._sitesdb, self._dbfile, self._logger) | |||||
return ( | |||||
f"ExclusionsDB(sitesdb={self._sitesdb!r}, dbfile={self._dbfile!r}, " | |||||
f"logger={self._logger!r})" | |||||
) | |||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the ExclusionsDB.""" | """Return a nice string representation of the ExclusionsDB.""" | ||||
return f"<ExclusionsDB at {self._dbfile}>" | return f"<ExclusionsDB at {self._dbfile}>" | ||||
def _create(self): | |||||
def _create(self) -> None: | |||||
"""Initialize the exclusions database with its necessary tables.""" | """Initialize the exclusions database with its necessary tables.""" | ||||
script = """ | script = """ | ||||
CREATE TABLE sources (source_sitename, source_page); | CREATE TABLE sources (source_sitename, source_page); | ||||
@@ -79,7 +90,7 @@ class ExclusionsDB: | |||||
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | ||||
""" | """ | ||||
query = "INSERT INTO sources VALUES (?, ?);" | query = "INSERT INTO sources VALUES (?, ?);" | ||||
sources = [] | |||||
sources: list[tuple[str, str]] = [] | |||||
for sitename, pages in DEFAULT_SOURCES.items(): | for sitename, pages in DEFAULT_SOURCES.items(): | ||||
for page in pages: | for page in pages: | ||||
sources.append((sitename, page)) | sources.append((sitename, page)) | ||||
@@ -88,9 +99,9 @@ class ExclusionsDB: | |||||
conn.executescript(script) | conn.executescript(script) | ||||
conn.executemany(query, sources) | conn.executemany(query, sources) | ||||
def _load_source(self, site, source): | |||||
def _load_source(self, site: Site, source: str) -> set[str]: | |||||
"""Load from a specific source and return a set of URLs.""" | """Load from a specific source and return a set of URLs.""" | ||||
urls = set() | |||||
urls: set[str] = set() | |||||
try: | try: | ||||
data = site.get_page(source, follow_redirects=True).get() | data = site.get_page(source, follow_redirects=True).get() | ||||
except exceptions.PageNotFoundError: | except exceptions.PageNotFoundError: | ||||
@@ -123,7 +134,7 @@ class ExclusionsDB: | |||||
urls.add(url) | urls.add(url) | ||||
return urls | return urls | ||||
def _update(self, sitename): | |||||
def _update(self, sitename: str) -> None: | |||||
"""Update the database from listed sources in the index.""" | """Update the database from listed sources in the index.""" | ||||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?" | query1 = "SELECT source_page FROM sources WHERE source_sitename = ?" | ||||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | ||||
@@ -140,7 +151,7 @@ class ExclusionsDB: | |||||
else: | else: | ||||
site = self._sitesdb.get_site(sitename) | site = self._sitesdb.get_site(sitename) | ||||
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | ||||
urls = set() | |||||
urls: set[str] = set() | |||||
for (source,) in conn.execute(query1, (sitename,)): | for (source,) in conn.execute(query1, (sitename,)): | ||||
urls |= self._load_source(site, source) | urls |= self._load_source(site, source) | ||||
for (url,) in conn.execute(query2, (sitename,)): | for (url,) in conn.execute(query2, (sitename,)): | ||||
@@ -154,7 +165,7 @@ class ExclusionsDB: | |||||
else: | else: | ||||
conn.execute(query7, (sitename, int(time.time()))) | conn.execute(query7, (sitename, int(time.time()))) | ||||
def _get_last_update(self, sitename): | |||||
def _get_last_update(self, sitename: str) -> int: | |||||
"""Return the UNIX timestamp of the last time the db was updated.""" | """Return the UNIX timestamp of the last time the db was updated.""" | ||||
query = "SELECT update_time FROM updates WHERE update_sitename = ?" | query = "SELECT update_time FROM updates WHERE update_sitename = ?" | ||||
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: | ||||
@@ -165,28 +176,34 @@ class ExclusionsDB: | |||||
return 0 | return 0 | ||||
return result[0] if result else 0 | return result[0] if result else 0 | ||||
def sync(self, sitename, force=False): | |||||
"""Update the database if it hasn't been updated recently. | |||||
def sync(self, sitename: str, force: bool = False) -> None: | |||||
""" | |||||
Update the database if it hasn't been updated recently. | |||||
This updates the exclusions database for the site *sitename* and "all". | This updates the exclusions database for the site *sitename* and "all". | ||||
Site-specific lists are considered stale after 48 hours; global lists | |||||
after 12 hours. | |||||
Site-specific lists are considered stale after 48 hours; global lists after | |||||
12 hours. | |||||
""" | """ | ||||
max_staleness = 60 * 60 * (12 if sitename == "all" else 48) | max_staleness = 60 * 60 * (12 if sitename == "all" else 48) | ||||
time_since_update = int(time.time() - self._get_last_update(sitename)) | time_since_update = int(time.time() - self._get_last_update(sitename)) | ||||
if force or time_since_update > max_staleness: | if force or time_since_update > max_staleness: | ||||
log = "Updating stale database: {0} (last updated {1} seconds ago)" | |||||
self._logger.info(log.format(sitename, time_since_update)) | |||||
self._logger.info( | |||||
f"Updating stale database: {sitename} (last updated " | |||||
f"{time_since_update} seconds ago)" | |||||
) | |||||
self._update(sitename) | self._update(sitename) | ||||
else: | else: | ||||
log = "Database for {0} is still fresh (last updated {1} seconds ago)" | |||||
self._logger.debug(log.format(sitename, time_since_update)) | |||||
self._logger.debug( | |||||
f"Database for {sitename} is still fresh (last updated " | |||||
f"{time_since_update} seconds ago)" | |||||
) | |||||
if sitename != "all": | if sitename != "all": | ||||
self.sync("all", force=force) | self.sync("all", force=force) | ||||
def check(self, sitename, url): | |||||
"""Check whether a given URL is in the exclusions database. | |||||
def check(self, sitename: str, url: str) -> bool: | |||||
""" | |||||
Check whether a given URL is in the exclusions database. | |||||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | Return ``True`` if the URL is in the database, or ``False`` otherwise. | ||||
""" | """ | ||||
@@ -216,19 +233,18 @@ class ExclusionsDB: | |||||
else: | else: | ||||
matches = normalized.startswith(excl) | matches = normalized.startswith(excl) | ||||
if matches: | if matches: | ||||
log = "Exclusion detected in {0} for {1}" | |||||
self._logger.debug(log.format(sitename, url)) | |||||
self._logger.debug(f"Exclusion detected in {sitename} for {url}") | |||||
return True | return True | ||||
log = f"No exclusions in {sitename} for {url}" | |||||
self._logger.debug(log) | |||||
self._logger.debug(f"No exclusions in {sitename} for {url}") | |||||
return False | return False | ||||
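A brief sketch of how a caller might combine the two public methods above; the site name and URL are made-up values, and ``excl_db`` stands for whatever ``ExclusionsDB`` instance the search config provides.

excl_db.sync("enwiki")  # refresh if stale (48h for a site list, 12h for "all")
if excl_db.check("enwiki", "https://mirror.example.org/wiki/Foo"):
    # URL matches an exclusion entry (e.g. a known mirror); skip it as a source.
    pass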
def get_mirror_hints(self, page, try_mobile=True): | |||||
"""Return a list of strings that indicate the existence of a mirror. | |||||
def get_mirror_hints(self, page: Page, try_mobile: bool = True) -> list[str]: | |||||
""" | |||||
Return a list of strings that indicate the existence of a mirror. | |||||
The source parser checks for the presence of these strings inside of | |||||
certain HTML tag attributes (``"href"`` and ``"src"``). | |||||
The source parser checks for the presence of these strings inside of certain | |||||
HTML tag attributes (``"href"`` and ``"src"``). | |||||
""" | """ | ||||
site = page.site | site = page.site | ||||
path = urllib.parse.urlparse(page.url).path | path = urllib.parse.urlparse(page.url).path | ||||
@@ -238,10 +254,10 @@ class ExclusionsDB: | |||||
if try_mobile: | if try_mobile: | ||||
fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) | fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) | ||||
if fragments: | if fragments: | ||||
roots.append("{}.m.{}.{}".format(*fragments.groups())) | |||||
roots.append(f"{fragments[1]}.m.{fragments[2]}.{fragments[3]}") | |||||
general = [ | general = [ | ||||
root + site._script_path + "/" + script | |||||
root + site.script_path + "/" + script | |||||
for root in roots | for root in roots | ||||
for script in scripts | for script in scripts | ||||
] | ] | ||||
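As an illustration (the domain is a made-up example, not taken from the diff), the regex above splits a hostname into three labels, and the f-string inserts ``.m.`` after the first one:

import re

fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", "en.wikipedia.org")
assert fragments is not None
# Mobile-style mirror domains get an extra hint root:
assert f"{fragments[1]}.m.{fragments[2]}.{fragments[3]}" == "en.m.wikipedia.org"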
@@ -18,29 +18,44 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
__all__ = [ | |||||
"DEFAULT_DEGREE", | |||||
"EMPTY", | |||||
"EMPTY_INTERSECTION", | |||||
"MarkovChain", | |||||
"MarkovChainIntersection", | |||||
] | |||||
import re | import re | ||||
from collections.abc import Iterable | |||||
from enum import Enum | |||||
__all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", "MarkovChainIntersection"] | |||||
DEFAULT_DEGREE = 5 | |||||
class MarkovChain: | |||||
"""Implements a basic ngram Markov chain of words.""" | |||||
class Sentinel(Enum): | |||||
START = -1 | START = -1 | ||||
END = -2 | END = -2 | ||||
def __init__(self, text, degree=5): | |||||
RawChain = dict[tuple[str | Sentinel, ...], int] | |||||
class MarkovChain: | |||||
"""Implements a basic ngram Markov chain of words.""" | |||||
def __init__(self, text: str, degree: int = DEFAULT_DEGREE) -> None: | |||||
self.text = text | self.text = text | ||||
self.degree = degree # 2 for bigrams, 3 for trigrams, etc. | self.degree = degree # 2 for bigrams, 3 for trigrams, etc. | ||||
self.chain = self._build() | self.chain = self._build() | ||||
self.size = self._get_size() | self.size = self._get_size() | ||||
def _build(self): | |||||
def _build(self) -> RawChain: | |||||
"""Build and return the Markov chain from the input text.""" | """Build and return the Markov chain from the input text.""" | ||||
padding = self.degree - 1 | padding = self.degree - 1 | ||||
words = re.sub(r"[^\w\s-]", "", self.text.lower(), flags=re.UNICODE).split() | |||||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||||
chain = {} | |||||
words = re.sub(r"[^\w\s-]", "", self.text.lower()).split() | |||||
words = ([Sentinel.START] * padding) + words + ([Sentinel.END] * padding) | |||||
chain: RawChain = {} | |||||
for i in range(len(words) - self.degree + 1): | for i in range(len(words) - self.degree + 1): | ||||
phrase = tuple(words[i : i + self.degree]) | phrase = tuple(words[i : i + self.degree]) | ||||
@@ -50,15 +65,15 @@ class MarkovChain: | |||||
chain[phrase] = 1 | chain[phrase] = 1 | ||||
return chain | return chain | ||||
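A tiny worked example of the padding and counting above, assuming the module path used in the earlier imports (``earwigbot.wiki.copyvios.markov``); the text is arbitrary.

from earwigbot.wiki.copyvios.markov import MarkovChain

chain = MarkovChain("The cat sat", degree=2)
# degree=2 pads with one START and one END sentinel, producing the bigrams
# (START, "the"), ("the", "cat"), ("cat", "sat"), ("sat", END), each counted once.
assert chain.size == 4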
def _get_size(self): | |||||
def _get_size(self) -> int: | |||||
"""Return the size of the Markov chain: the total number of nodes.""" | """Return the size of the Markov chain: the total number of nodes.""" | ||||
return sum(self.chain.values()) | return sum(self.chain.values()) | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the MarkovChain.""" | """Return the canonical string representation of the MarkovChain.""" | ||||
return f"MarkovChain(text={self.text!r})" | return f"MarkovChain(text={self.text!r})" | ||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the MarkovChain.""" | """Return a nice string representation of the MarkovChain.""" | ||||
return f"<MarkovChain of size {self.size}>" | return f"<MarkovChain of size {self.size}>" | ||||
@@ -66,61 +81,60 @@ class MarkovChain: | |||||
class MarkovChainIntersection(MarkovChain): | class MarkovChainIntersection(MarkovChain): | ||||
"""Implements the intersection of two chains (i.e., their shared nodes).""" | """Implements the intersection of two chains (i.e., their shared nodes).""" | ||||
def __init__(self, mc1, mc2): | |||||
def __init__(self, mc1: MarkovChain, mc2: MarkovChain) -> None: | |||||
self.mc1, self.mc2 = mc1, mc2 | self.mc1, self.mc2 = mc1, mc2 | ||||
self.chain = self._build() | self.chain = self._build() | ||||
self.size = self._get_size() | self.size = self._get_size() | ||||
def _build(self): | |||||
def _build(self) -> RawChain: | |||||
"""Build and return the Markov chain from the input chains.""" | """Build and return the Markov chain from the input chains.""" | ||||
c1 = self.mc1.chain | c1 = self.mc1.chain | ||||
c2 = self.mc2.chain | c2 = self.mc2.chain | ||||
chain = {} | |||||
chain: RawChain = {} | |||||
for phrase in c1: | for phrase in c1: | ||||
if phrase in c2: | if phrase in c2: | ||||
chain[phrase] = min(c1[phrase], c2[phrase]) | chain[phrase] = min(c1[phrase], c2[phrase]) | ||||
return chain | return chain | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the intersection.""" | """Return the canonical string representation of the intersection.""" | ||||
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" | |||||
return res.format(self.mc1, self.mc2) | |||||
return f"MarkovChainIntersection(mc1={self.mc1!r}, mc2={self.mc2!r})" | |||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the intersection.""" | """Return a nice string representation of the intersection.""" | ||||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||||
return res.format(self.size, self.mc1, self.mc2) | |||||
return ( | |||||
f"<MarkovChainIntersection of size {self.size} ({self.mc1} ^ {self.mc2})>" | |||||
) | |||||
class MarkovChainUnion(MarkovChain): | class MarkovChainUnion(MarkovChain): | ||||
"""Implemented the union of multiple chains.""" | """Implemented the union of multiple chains.""" | ||||
def __init__(self, chains): | |||||
def __init__(self, chains: Iterable[MarkovChain]) -> None: | |||||
self.chains = list(chains) | self.chains = list(chains) | ||||
self.chain = self._build() | self.chain = self._build() | ||||
self.size = self._get_size() | self.size = self._get_size() | ||||
def _build(self): | |||||
def _build(self) -> RawChain: | |||||
"""Build and return the Markov chain from the input chains.""" | """Build and return the Markov chain from the input chains.""" | ||||
union = {} | |||||
union: RawChain = {} | |||||
for chain in self.chains: | for chain in self.chains: | ||||
for phrase, count in chain.chain.iteritems(): | |||||
for phrase, count in chain.chain.items(): | |||||
if phrase in union: | if phrase in union: | ||||
union[phrase] += count | union[phrase] += count | ||||
else: | else: | ||||
union[phrase] = count | union[phrase] = count | ||||
return union | return union | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the union.""" | """Return the canonical string representation of the union.""" | ||||
res = "MarkovChainUnion(chains={!r})" | |||||
return res.format(self.chains) | |||||
return f"MarkovChainUnion(chains={self.chains!r})" | |||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the union.""" | """Return a nice string representation of the union.""" | ||||
res = "<MarkovChainUnion of size {} ({})>" | |||||
return res.format(self.size, "| ".join(str(chain) for chain in self.chains)) | |||||
chains = " | ".join(str(chain) for chain in self.chains) | |||||
return f"<MarkovChainUnion of size {self.size} ({chains})>" | |||||
EMPTY = MarkovChain("") | EMPTY = MarkovChain("") | ||||
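A sketch of how two chains relate through the intersection class above; the texts are invented, and the confidence math itself lives in the worker code, not here.

from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection

article = MarkovChain("alpha beta gamma delta epsilon zeta")
source = MarkovChain("beta gamma delta epsilon zeta eta")
delta = MarkovChainIntersection(article, source)
# With the default degree of 5, the only 5-gram the two texts share is
# ("beta", "gamma", "delta", "epsilon", "zeta"), so:
assert delta.size == 1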
@@ -18,44 +18,34 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
from __future__ import annotations | |||||
__all__ = ["ArticleParser", "get_parser"] | |||||
import io | import io | ||||
import json | import json | ||||
import os.path | import os.path | ||||
import re | import re | ||||
import typing | |||||
import urllib.parse | import urllib.parse | ||||
import urllib.request | import urllib.request | ||||
from abc import ABC, abstractmethod | |||||
from collections.abc import Callable | |||||
from typing import Any, ClassVar, Literal, TypedDict | |||||
import mwparserfromhell | import mwparserfromhell | ||||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | ||||
__all__ = ["ArticleTextParser", "get_parser"] | |||||
class _BaseTextParser: | |||||
"""Base class for a parser that handles text.""" | |||||
TYPE = None | |||||
def __init__(self, text, url=None, args=None): | |||||
self.text = text | |||||
self.url = url | |||||
self._args = args or {} | |||||
def __repr__(self): | |||||
"""Return the canonical string representation of the text parser.""" | |||||
return f"{self.__class__.__name__}(text={self.text!r})" | |||||
if typing.TYPE_CHECKING: | |||||
import bs4 | |||||
def __str__(self): | |||||
"""Return a nice string representation of the text parser.""" | |||||
name = self.__class__.__name__ | |||||
return f"<{name} of text with size {len(self.text)}>" | |||||
from earwigbot.wiki.copyvios.workers import OpenedURL | |||||
class ArticleTextParser(_BaseTextParser): | |||||
class ArticleParser: | |||||
"""A parser that can strip and chunk wikicode article text.""" | """A parser that can strip and chunk wikicode article text.""" | ||||
TYPE = "Article" | |||||
TEMPLATE_MERGE_THRESHOLD = 35 | TEMPLATE_MERGE_THRESHOLD = 35 | ||||
NLTK_DEFAULT = "english" | NLTK_DEFAULT = "english" | ||||
NLTK_LANGS = { | NLTK_LANGS = { | ||||
@@ -78,7 +68,18 @@ class ArticleTextParser(_BaseTextParser): | |||||
"tr": "turkish", | "tr": "turkish", | ||||
} | } | ||||
def _merge_templates(self, code): | |||||
def __init__(self, text: str, lang: str, nltk_dir: str) -> None: | |||||
self.text = text | |||||
self._lang = lang | |||||
self._nltk_dir = nltk_dir | |||||
def __repr__(self) -> str: | |||||
return f"{self.__class__.__name__}(text={self.text!r})" | |||||
def __str__(self) -> str: | |||||
return f"<{self.__class__.__name__} of text with size {len(self.text)}>" | |||||
def _merge_templates(self, code: mwparserfromhell.wikicode.Wikicode) -> None: | |||||
"""Merge template contents in to wikicode when the values are long.""" | """Merge template contents in to wikicode when the values are long.""" | ||||
for template in code.filter_templates(recursive=code.RECURSE_OTHERS): | for template in code.filter_templates(recursive=code.RECURSE_OTHERS): | ||||
chunks = [] | chunks = [] | ||||
@@ -92,23 +93,25 @@ class ArticleTextParser(_BaseTextParser): | |||||
else: | else: | ||||
code.remove(template) | code.remove(template) | ||||
def _get_tokenizer(self): | |||||
def _get_tokenizer(self) -> Any: | |||||
"""Return a NLTK punctuation tokenizer for the article's language.""" | """Return a NLTK punctuation tokenizer for the article's language.""" | ||||
import nltk | import nltk | ||||
def datafile(lang): | |||||
def datafile(lang: str) -> str: | |||||
return "file:" + os.path.join( | return "file:" + os.path.join( | ||||
self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle" | |||||
self._nltk_dir, "tokenizers", "punkt", lang + ".pickle" | |||||
) | ) | ||||
lang = self.NLTK_LANGS.get(self._args.get("lang"), self.NLTK_DEFAULT) | |||||
lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT) | |||||
try: | try: | ||||
nltk.data.load(datafile(self.NLTK_DEFAULT)) | nltk.data.load(datafile(self.NLTK_DEFAULT)) | ||||
except LookupError: | except LookupError: | ||||
nltk.download("punkt", self._args["nltk_dir"]) | |||||
nltk.download("punkt", self._nltk_dir) | |||||
return nltk.data.load(datafile(lang)) | return nltk.data.load(datafile(lang)) | ||||
def _get_sentences(self, min_query, max_query, split_thresh): | |||||
def _get_sentences( | |||||
self, min_query: int, max_query: int, split_thresh: int | |||||
) -> list[str]: | |||||
"""Split the article text into sentences of a certain length.""" | """Split the article text into sentences of a certain length.""" | ||||
def cut_sentence(words): | def cut_sentence(words): | ||||
@@ -138,24 +141,27 @@ class ArticleTextParser(_BaseTextParser): | |||||
sentences.extend(cut_sentence(sentence.split())) | sentences.extend(cut_sentence(sentence.split())) | ||||
return [sen for sen in sentences if len(sen) >= min_query] | return [sen for sen in sentences if len(sen) >= min_query] | ||||
def strip(self): | |||||
"""Clean the page's raw text by removing templates and formatting. | |||||
def strip(self) -> str: | |||||
""" | |||||
Clean the page's raw text by removing templates and formatting. | |||||
Return the page's text with all HTML and wikicode formatting removed, | |||||
including templates, tables, and references. It retains punctuation | |||||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||||
quotes), original capitalization, and so forth. HTML entities are | |||||
replaced by their unicode equivalents. | |||||
Return the page's text with all HTML and wikicode formatting removed, including | |||||
templates, tables, and references. It retains punctuation (spacing, paragraphs, | |||||
periods, commas, (semi)-colons, parentheses, quotes), original capitalization, | |||||
and so forth. HTML entities are replaced by their unicode equivalents. | |||||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | The actual stripping is handled by :py:mod:`mwparserfromhell`. | ||||
""" | """ | ||||
def remove(code, node): | |||||
"""Remove a node from a code object, ignoring ValueError. | |||||
def remove( | |||||
code: mwparserfromhell.wikicode.Wikicode, node: mwparserfromhell.nodes.Node | |||||
) -> None: | |||||
""" | |||||
Remove a node from a code object, ignoring ValueError. | |||||
Sometimes we will remove a node that contains another node we wish | |||||
to remove, and we fail when we try to remove the inner one. Easiest | |||||
solution is to just ignore the exception. | |||||
Sometimes we will remove a node that contains another node we wish to
remove, and we fail when we try to remove the inner one. The easiest
solution is to just ignore the exception.
""" | """ | ||||
try: | try: | ||||
code.remove(node) | code.remove(node) | ||||
@@ -181,26 +187,32 @@ class ArticleTextParser(_BaseTextParser): | |||||
self.clean = re.sub(r"\n\n+", "\n", clean).strip() | self.clean = re.sub(r"\n\n+", "\n", clean).strip() | ||||
return self.clean | return self.clean | ||||
def chunk(self, max_chunks, min_query=8, max_query=128, split_thresh=32): | |||||
"""Convert the clean article text into a list of web-searchable chunks. | |||||
No greater than *max_chunks* will be returned. Each chunk will only be | |||||
a sentence or two long at most (no more than *max_query*). The idea is | |||||
to return a sample of the article text rather than the whole, so we'll | |||||
pick and choose from parts of it, especially if the article is large | |||||
and *max_chunks* is low, so we don't end up just searching for just the | |||||
first paragraph. | |||||
This is implemented using :py:mod:`nltk` (https://nltk.org/). A base | |||||
directory (*nltk_dir*) is required to store nltk's punctuation | |||||
database, and should be passed as an argument to the constructor. It is | |||||
typically located in the bot's working directory. | |||||
def chunk( | |||||
self, | |||||
max_chunks: int, | |||||
min_query: int = 8, | |||||
max_query: int = 128, | |||||
split_thresh: int = 32, | |||||
) -> list[str]: | |||||
""" | |||||
Convert the clean article text into a list of web-searchable chunks. | |||||
No more than *max_chunks* chunks will be returned. Each chunk will be a
sentence or two long at most (no more than *max_query*). The idea is to return
a sample of the article text rather than the whole, so we pick and choose from
parts of it, especially if the article is large and *max_chunks* is low, so we
don't end up searching for just the first paragraph.
This is implemented using :py:mod:`nltk` (https://nltk.org/). A base directory | |||||
(*nltk_dir*) is required to store nltk's punctuation database, and should be | |||||
passed as an argument to the constructor. It is typically located in the bot's | |||||
working directory. | |||||
""" | """ | ||||
sentences = self._get_sentences(min_query, max_query, split_thresh) | sentences = self._get_sentences(min_query, max_query, split_thresh) | ||||
if len(sentences) <= max_chunks: | if len(sentences) <= max_chunks: | ||||
return sentences | return sentences | ||||
chunks = [] | |||||
chunks: list[str] = [] | |||||
while len(chunks) < max_chunks: | while len(chunks) < max_chunks: | ||||
if len(chunks) % 5 == 0: | if len(chunks) % 5 == 0: | ||||
chunk = sentences.pop(0) # Pop from beginning | chunk = sentences.pop(0) # Pop from beginning | ||||
@@ -216,7 +228,8 @@ class ArticleTextParser(_BaseTextParser): | |||||
return chunks | return chunks | ||||
def get_links(self): | def get_links(self): | ||||
"""Return a list of all external links in the article. | |||||
""" | |||||
Return a list of all external links in the article. | |||||
The list is restricted to things that we suspect we can parse: i.e., | The list is restricted to things that we suspect we can parse: i.e., | ||||
those with schemes of ``http`` and ``https``. | those with schemes of ``http`` and ``https``. | ||||
@@ -226,14 +239,42 @@ class ArticleTextParser(_BaseTextParser): | |||||
return [str(link.url) for link in links if link.url.startswith(schemes)] | return [str(link.url) for link in links if link.url.startswith(schemes)] | ||||
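A hedged sketch of how the parser above might be driven end to end; ``page`` is any Page object and the NLTK directory is a placeholder path, not a value from the diff.

from earwigbot.wiki.copyvios.parsers import ArticleParser

def sample_chunks(page, nltk_dir="/path/to/nltk_data", max_chunks=15):
    parser = ArticleParser(page.get(), lang=page.site.lang, nltk_dir=nltk_dir)
    parser.strip()                     # clean the text first, as the checker does
    chunks = parser.chunk(max_chunks)  # short, web-searchable excerpts
    links = parser.get_links()         # http(s) external links in the wikitext
    return chunks, links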
class _HTMLParser(_BaseTextParser): | |||||
class ParserArgs(TypedDict, total=False): | |||||
mirror_hints: list[str] | |||||
open_url: Callable[[str], OpenedURL | None] | |||||
class SourceParser(ABC): | |||||
"""Base class for a parser that handles text.""" | |||||
TYPE: ClassVar[str] | |||||
def __init__(self, text: bytes, url: str, args: ParserArgs | None = None) -> None: | |||||
self.text = text | |||||
self.url = url | |||||
self._args = args or {} | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the text parser.""" | |||||
return f"{self.__class__.__name__}(text={self.text!r})" | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the text parser.""" | |||||
return f"<{self.__class__.__name__} of text with size {len(self.text)}>" | |||||
@abstractmethod | |||||
def parse(self) -> str: ... | |||||
class HTMLParser(SourceParser): | |||||
"""A parser that can extract the text from an HTML document.""" | """A parser that can extract the text from an HTML document.""" | ||||
TYPE = "HTML" | TYPE = "HTML" | ||||
hidden_tags = ["script", "style"] | hidden_tags = ["script", "style"] | ||||
def _fail_if_mirror(self, soup): | |||||
"""Look for obvious signs that the given soup is a wiki mirror. | |||||
def _fail_if_mirror(self, soup: bs4.BeautifulSoup) -> None: | |||||
""" | |||||
Look for obvious signs that the given soup is a wiki mirror. | |||||
If so, raise ParserExclusionError, which is caught in the workers and | If so, raise ParserExclusionError, which is caught in the workers and | ||||
causes this source to be excluded.
@@ -242,13 +283,14 @@ class _HTMLParser(_BaseTextParser): | |||||
return | return | ||||
def func(attr): | def func(attr): | ||||
assert "mirror_hints" in self._args | |||||
return attr and any(hint in attr for hint in self._args["mirror_hints"]) | return attr and any(hint in attr for hint in self._args["mirror_hints"]) | ||||
if soup.find_all(href=func) or soup.find_all(src=func): | if soup.find_all(href=func) or soup.find_all(src=func): | ||||
raise ParserExclusionError() | raise ParserExclusionError() | ||||
@staticmethod | @staticmethod | ||||
def _get_soup(text): | |||||
def _get_soup(text: bytes) -> bs4.BeautifulSoup: | |||||
"""Parse some text using BeautifulSoup.""" | """Parse some text using BeautifulSoup.""" | ||||
import bs4 | import bs4 | ||||
@@ -257,11 +299,11 @@ class _HTMLParser(_BaseTextParser): | |||||
except ValueError: | except ValueError: | ||||
return bs4.BeautifulSoup(text) | return bs4.BeautifulSoup(text) | ||||
def _clean_soup(self, soup): | |||||
def _clean_soup(self, soup: bs4.element.Tag) -> str: | |||||
"""Clean a BeautifulSoup tree of invisible tags.""" | """Clean a BeautifulSoup tree of invisible tags.""" | ||||
import bs4 | import bs4 | ||||
def is_comment(text): | |||||
def is_comment(text: bs4.element.Tag) -> bool: | |||||
return isinstance(text, bs4.element.Comment) | return isinstance(text, bs4.element.Comment) | ||||
for comment in soup.find_all(text=is_comment): | for comment in soup.find_all(text=is_comment): | ||||
@@ -272,7 +314,7 @@ class _HTMLParser(_BaseTextParser): | |||||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | ||||
def _open(self, url, **kwargs): | |||||
def _open(self, url: str, **kwargs: Any) -> bytes | None: | |||||
"""Try to read a URL. Return None if it couldn't be read.""" | """Try to read a URL. Return None if it couldn't be read.""" | ||||
opener = self._args.get("open_url") | opener = self._args.get("open_url") | ||||
if not opener: | if not opener: | ||||
@@ -280,13 +322,13 @@ class _HTMLParser(_BaseTextParser): | |||||
result = opener(url, **kwargs) | result = opener(url, **kwargs) | ||||
return result.content if result else None | return result.content if result else None | ||||
def _load_from_blogspot(self, url): | |||||
def _load_from_blogspot(self, url: urllib.parse.ParseResult) -> str: | |||||
"""Load dynamic content from Blogger Dynamic Views.""" | """Load dynamic content from Blogger Dynamic Views.""" | ||||
match = re.search(r"'postId': '(\d+)'", self.text) | |||||
match = re.search(rb"'postId': '(\d+)'", self.text) | |||||
if not match: | if not match: | ||||
return "" | return "" | ||||
post_id = match.group(1) | post_id = match.group(1) | ||||
url = f"https://{url.netloc}/feeds/posts/default/{post_id}?" | |||||
feed_url = f"https://{url.netloc}/feeds/posts/default/{post_id}?" | |||||
params = { | params = { | ||||
"alt": "json", | "alt": "json", | ||||
"v": "2", | "v": "2", | ||||
@@ -294,7 +336,7 @@ class _HTMLParser(_BaseTextParser): | |||||
"rewriteforssl": "true", | "rewriteforssl": "true", | ||||
} | } | ||||
raw = self._open( | raw = self._open( | ||||
url + urllib.parse.urlencode(params), | |||||
feed_url + urllib.parse.urlencode(params), | |||||
allow_content_types=["application/json"], | allow_content_types=["application/json"], | ||||
) | ) | ||||
if raw is None: | if raw is None: | ||||
@@ -308,19 +350,24 @@ class _HTMLParser(_BaseTextParser): | |||||
except KeyError: | except KeyError: | ||||
return "" | return "" | ||||
soup = self._get_soup(text) | soup = self._get_soup(text) | ||||
if not soup.body: | |||||
return "" | |||||
return self._clean_soup(soup.body) | return self._clean_soup(soup.body) | ||||
def parse(self): | |||||
"""Return the actual text contained within an HTML document. | |||||
def parse(self) -> str: | |||||
""" | |||||
Return the actual text contained within an HTML document. | |||||
Implemented using :py:mod:`BeautifulSoup <bs4>` | Implemented using :py:mod:`BeautifulSoup <bs4>` | ||||
(https://www.crummy.com/software/BeautifulSoup/). | |||||
(https://pypi.org/project/beautifulsoup4/). | |||||
""" | """ | ||||
import bs4 | |||||
url = urllib.parse.urlparse(self.url) if self.url else None | url = urllib.parse.urlparse(self.url) if self.url else None | ||||
soup = self._get_soup(self.text) | soup = self._get_soup(self.text) | ||||
if not soup.body: | if not soup.body: | ||||
# No <body> tag present in HTML -> | |||||
# no scrapable content (possibly JS or <iframe> magic): | |||||
# No <body> tag present in HTML -> no scrapable content
# (possibly JS or <iframe> magic):
return "" | return "" | ||||
self._fail_if_mirror(soup) | self._fail_if_mirror(soup) | ||||
@@ -328,7 +375,7 @@ class _HTMLParser(_BaseTextParser): | |||||
if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"): | if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"): | ||||
playback = body.find(id="playback") | playback = body.find(id="playback") | ||||
if playback and "src" in playback.attrs: | |||||
if isinstance(playback, bs4.element.Tag) and "src" in playback.attrs: | |||||
raise ParserRedirectError(playback.attrs["src"]) | raise ParserRedirectError(playback.attrs["src"]) | ||||
content = self._clean_soup(body) | content = self._clean_soup(body) | ||||
@@ -339,7 +386,7 @@ class _HTMLParser(_BaseTextParser): | |||||
return content | return content | ||||
class _PDFParser(_BaseTextParser): | |||||
class PDFParser(SourceParser): | |||||
"""A parser that can extract text from a PDF file.""" | """A parser that can extract text from a PDF file.""" | ||||
TYPE = "PDF" | TYPE = "PDF" | ||||
@@ -348,7 +395,7 @@ class _PDFParser(_BaseTextParser): | |||||
("\u2022", " "), | ("\u2022", " "), | ||||
] | ] | ||||
def parse(self): | |||||
def parse(self) -> str: | |||||
"""Return extracted text from the PDF.""" | """Return extracted text from the PDF.""" | ||||
from pdfminer import converter, pdfinterp, pdfpage | from pdfminer import converter, pdfinterp, pdfpage | ||||
@@ -358,7 +405,7 @@ class _PDFParser(_BaseTextParser): | |||||
interp = pdfinterp.PDFPageInterpreter(manager, conv) | interp = pdfinterp.PDFPageInterpreter(manager, conv) | ||||
try: | try: | ||||
pages = pdfpage.PDFPage.get_pages(io.StringIO(self.text)) | |||||
pages = pdfpage.PDFPage.get_pages(io.BytesIO(self.text)) | |||||
for page in pages: | for page in pages: | ||||
interp.process_page(page) | interp.process_page(page) | ||||
except Exception: # pylint: disable=broad-except | except Exception: # pylint: disable=broad-except | ||||
@@ -372,12 +419,12 @@ class _PDFParser(_BaseTextParser): | |||||
return re.sub(r"\n\n+", "\n", value).strip() | return re.sub(r"\n\n+", "\n", value).strip() | ||||
class _PlainTextParser(_BaseTextParser): | |||||
class PlainTextParser(SourceParser): | |||||
"""A parser that can unicode-ify and strip text from a plain text page.""" | """A parser that can unicode-ify and strip text from a plain text page.""" | ||||
TYPE = "Text" | TYPE = "Text" | ||||
def parse(self): | |||||
def parse(self) -> str: | |||||
"""Unicode-ify and strip whitespace from the plain text document.""" | """Unicode-ify and strip whitespace from the plain text document.""" | ||||
from bs4.dammit import UnicodeDammit | from bs4.dammit import UnicodeDammit | ||||
@@ -385,15 +432,25 @@ class _PlainTextParser(_BaseTextParser): | |||||
return converted.strip() if converted else "" | return converted.strip() if converted else "" | ||||
_CONTENT_TYPES = { | |||||
"text/html": _HTMLParser, | |||||
"application/xhtml+xml": _HTMLParser, | |||||
"application/pdf": _PDFParser, | |||||
"application/x-pdf": _PDFParser, | |||||
"text/plain": _PlainTextParser, | |||||
_CONTENT_TYPES: dict[str, type[SourceParser]] = { | |||||
"text/html": HTMLParser, | |||||
"application/xhtml+xml": HTMLParser, | |||||
"application/pdf": PDFParser, | |||||
"application/x-pdf": PDFParser, | |||||
"text/plain": PlainTextParser, | |||||
} | } | ||||
def get_parser(content_type): | |||||
@typing.overload | |||||
def get_parser(content_type: str) -> type[SourceParser] | None: ... | |||||
@typing.overload | |||||
def get_parser( | |||||
content_type: Literal["text/plain"] = "text/plain", | |||||
) -> type[SourceParser]: ... | |||||
def get_parser(content_type: str = "text/plain") -> type[SourceParser] | None: | |||||
"""Return the parser most able to handle a given content type, or None.""" | """Return the parser most able to handle a given content type, or None.""" | ||||
return _CONTENT_TYPES.get(content_type) | return _CONTENT_TYPES.get(content_type) |
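A sketch of the dispatch this function enables; the helper name and the plain-text fallback are assumptions for illustration, not how workers.py necessarily wires it up.

from earwigbot.wiki.copyvios.parsers import PlainTextParser, get_parser

def extract_text(content_type: str, body: bytes, url: str) -> str:
    # Pick a parser class for the media type; fall back to plain text.
    parser_cls = get_parser(content_type) or PlainTextParser
    return parser_cls(body, url).parse()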
@@ -18,13 +18,26 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
from __future__ import annotations | |||||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||||
import time | |||||
import typing | |||||
import urllib.parse | import urllib.parse | ||||
from threading import Event | from threading import Event | ||||
from time import time | |||||
from typing import Any | |||||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | |||||
from earwigbot.wiki.copyvios.markov import ( | |||||
EMPTY, | |||||
EMPTY_INTERSECTION, | |||||
MarkovChain, | |||||
MarkovChainIntersection, | |||||
) | |||||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||||
if typing.TYPE_CHECKING: | |||||
from earwigbot.wiki.copyvios.parsers import ParserArgs | |||||
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace | |||||
class CopyvioSource: | class CopyvioSource: | ||||
@@ -45,13 +58,13 @@ class CopyvioSource: | |||||
def __init__( | def __init__( | ||||
self, | self, | ||||
workspace, | |||||
url, | |||||
headers=None, | |||||
timeout=5, | |||||
parser_args=None, | |||||
search_config=None, | |||||
): | |||||
workspace: CopyvioWorkspace, | |||||
url: str, | |||||
headers: list[tuple[str, str]] | None = None, | |||||
timeout: float = 5, | |||||
parser_args: ParserArgs | None = None, | |||||
search_config: dict[str, Any] | None = None, | |||||
) -> None: | |||||
self.workspace = workspace | self.workspace = workspace | ||||
self.url = url | self.url = url | ||||
self.headers = headers | self.headers = headers | ||||
@@ -68,54 +81,57 @@ class CopyvioSource: | |||||
self._event2 = Event() | self._event2 = Event() | ||||
self._event2.set() | self._event2.set() | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the source.""" | """Return the canonical string representation of the source.""" | ||||
res = ( | |||||
"CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, " | |||||
"excluded={3!r})" | |||||
return ( | |||||
f"CopyvioSource(url={self.url!r}, confidence={self.confidence!r}, " | |||||
f"skipped={self.skipped!r}, excluded={self.excluded!r})" | |||||
) | ) | ||||
return res.format(self.url, self.confidence, self.skipped, self.excluded) | |||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the source.""" | """Return a nice string representation of the source.""" | ||||
if self.excluded: | if self.excluded: | ||||
return f"<CopyvioSource ({self.url}, excluded)>" | return f"<CopyvioSource ({self.url}, excluded)>" | ||||
if self.skipped: | if self.skipped: | ||||
return f"<CopyvioSource ({self.url}, skipped)>" | return f"<CopyvioSource ({self.url}, skipped)>" | ||||
res = "<CopyvioSource ({0} with {1} conf)>" | |||||
return res.format(self.url, self.confidence) | |||||
return f"<CopyvioSource ({self.url} with {self.confidence} conf)>" | |||||
@property | @property | ||||
def domain(self): | |||||
def domain(self) -> str | None: | |||||
"""The source URL's domain name, or None.""" | """The source URL's domain name, or None.""" | ||||
return urllib.parse.urlparse(self.url).netloc or None | return urllib.parse.urlparse(self.url).netloc or None | ||||
def start_work(self): | |||||
def start_work(self) -> None: | |||||
"""Mark this source as being worked on right now.""" | """Mark this source as being worked on right now.""" | ||||
self._event2.clear() | self._event2.clear() | ||||
self._event1.set() | self._event1.set() | ||||
def update(self, confidence, source_chain, delta_chain): | |||||
def update( | |||||
self, | |||||
confidence: float, | |||||
source_chain: MarkovChain, | |||||
delta_chain: MarkovChainIntersection, | |||||
) -> None: | |||||
"""Fill out the confidence and chain information inside this source.""" | """Fill out the confidence and chain information inside this source.""" | ||||
self.confidence = confidence | self.confidence = confidence | ||||
self.chains = (source_chain, delta_chain) | self.chains = (source_chain, delta_chain) | ||||
def finish_work(self): | |||||
def finish_work(self) -> None: | |||||
"""Mark this source as finished.""" | """Mark this source as finished.""" | ||||
self._event2.set() | self._event2.set() | ||||
def skip(self): | |||||
def skip(self) -> None: | |||||
"""Deactivate this source without filling in the relevant data.""" | """Deactivate this source without filling in the relevant data.""" | ||||
if self._event1.is_set(): | if self._event1.is_set(): | ||||
return | return | ||||
self.skipped = True | self.skipped = True | ||||
self._event1.set() | self._event1.set() | ||||
def join(self, until): | |||||
def join(self, until: float | None = None) -> None: | |||||
"""Block until this violation result is filled out.""" | """Block until this violation result is filled out.""" | ||||
for event in [self._event1, self._event2]: | for event in [self._event1, self._event2]: | ||||
if until: | |||||
timeout = until - time() | |||||
if until is not None: | |||||
timeout = until - time.time() | |||||
if timeout <= 0: | if timeout <= 0: | ||||
return | return | ||||
event.wait(timeout) | event.wait(timeout) | ||||
@@ -144,16 +160,15 @@ class CopyvioCheckResult: | |||||
def __init__( | def __init__( | ||||
self, | self, | ||||
violation, | |||||
sources, | |||||
queries, | |||||
check_time, | |||||
article_chain, | |||||
possible_miss, | |||||
included_sources=None, | |||||
unified_confidence=None, | |||||
violation: bool, | |||||
sources: list[CopyvioSource], | |||||
queries: int, | |||||
check_time: float, | |||||
article_chain: MarkovChain, | |||||
possible_miss: bool, | |||||
included_sources: list[CopyvioSource] | None = None, | |||||
unified_confidence: float | None = None, | |||||
): | ): | ||||
assert isinstance(sources, list) | |||||
self.violation = violation | self.violation = violation | ||||
self.sources = sources | self.sources = sources | ||||
self.queries = queries | self.queries = queries | ||||
@@ -163,48 +178,47 @@ class CopyvioCheckResult: | |||||
self.included_sources = included_sources if included_sources else [] | self.included_sources = included_sources if included_sources else [] | ||||
self.unified_confidence = unified_confidence | self.unified_confidence = unified_confidence | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the result.""" | """Return the canonical string representation of the result.""" | ||||
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})" | |||||
return res.format(self.violation, self.sources, self.queries, self.time) | |||||
return ( | |||||
f"CopyvioCheckResult(violation={self.violation!r}, " | |||||
f"sources={self.sources!r}, queries={self.queries!r}, time={self.time!r})" | |||||
) | |||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the result.""" | """Return a nice string representation of the result.""" | ||||
res = "<CopyvioCheckResult ({0} with best {1})>" | |||||
return res.format(self.violation, self.best) | |||||
return f"<CopyvioCheckResult ({self.violation} with best {self.best})>" | |||||
@property | @property | ||||
def best(self): | |||||
def best(self) -> CopyvioSource | None: | |||||
"""The best known source, or None if no sources exist.""" | """The best known source, or None if no sources exist.""" | ||||
return self.sources[0] if self.sources else None | return self.sources[0] if self.sources else None | ||||
@property | @property | ||||
def confidence(self): | |||||
def confidence(self) -> float: | |||||
"""The confidence of the best source, or 0 if no sources exist.""" | """The confidence of the best source, or 0 if no sources exist.""" | ||||
if self.unified_confidence is not None: | if self.unified_confidence is not None: | ||||
return self.unified_confidence | return self.unified_confidence | ||||
if self.best: | |||||
if self.best is not None: | |||||
return self.best.confidence | return self.best.confidence | ||||
return 0.0 | return 0.0 | ||||
@property | @property | ||||
def url(self): | |||||
def url(self) -> str | None: | |||||
"""The URL of the best source, or None if no sources exist.""" | """The URL of the best source, or None if no sources exist.""" | ||||
return self.best.url if self.best else None | return self.best.url if self.best else None | ||||
def get_log_message(self, title): | |||||
def get_log_message(self, title: str) -> str: | |||||
"""Build a relevant log message for this copyvio check result.""" | """Build a relevant log message for this copyvio check result.""" | ||||
if not self.sources: | if not self.sources: | ||||
log = "No violation for [[{0}]] (no sources; {1} queries; {2} seconds)" | |||||
return log.format(title, self.queries, self.time) | |||||
log = "{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)" | |||||
return ( | |||||
f"No violation for [[{title}]] (no sources; {self.queries} queries; " | |||||
f"{self.time} seconds)" | |||||
) | |||||
is_vio = "Violation detected" if self.violation else "No violation" | is_vio = "Violation detected" if self.violation else "No violation" | ||||
return log.format( | |||||
is_vio, | |||||
title, | |||||
self.url, | |||||
self.confidence, | |||||
len(self.sources), | |||||
self.queries, | |||||
self.time, | |||||
return ( | |||||
f"{is_vio} for [[{title}]] (best: {self.url} ({self.confidence} " | |||||
f"confidence); {len(self.sources)} sources; {self.queries} queries; " | |||||
f"{self.time} seconds)" | |||||
) | ) |
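As a usage note for reviewers (not part of the diff): the properties above are typically consumed along these lines. `summarize` is a hypothetical helper; only `CopyvioCheckResult` and its attributes come from this module.

from earwigbot.wiki.copyvios.result import CopyvioCheckResult

def summarize(result: CopyvioCheckResult) -> str:
    # .confidence falls back to the best source's confidence when no unified
    # confidence is set, and to 0.0 when there are no sources at all.
    if result.best is None:
        return f"no sources ({result.queries} queries, {result.time} seconds)"
    return f"best match {result.url} at {result.confidence:.2f} confidence"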
@@ -18,91 +18,101 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
import re | |||||
from gzip import GzipFile | |||||
from io import StringIO | |||||
from json import loads | |||||
from urllib.error import URLError | |||||
from urllib.parse import urlencode | |||||
from earwigbot.exceptions import SearchQueryError | |||||
__all__ = [ | __all__ = [ | ||||
"BingSearchEngine", | "BingSearchEngine", | ||||
"GoogleSearchEngine", | "GoogleSearchEngine", | ||||
"SearchEngine", | |||||
"YandexSearchEngine", | "YandexSearchEngine", | ||||
"SEARCH_ENGINES", | |||||
"get_search_engine", | |||||
] | ] | ||||
import base64 | |||||
import gzip | |||||
import io | |||||
import json | |||||
import re | |||||
import urllib.parse | |||||
import urllib.request | |||||
from abc import ABC, abstractmethod | |||||
from typing import Any | |||||
from urllib.error import URLError | |||||
class _BaseSearchEngine: | |||||
from earwigbot import exceptions | |||||
class SearchEngine(ABC): | |||||
"""Base class for a simple search engine interface.""" | """Base class for a simple search engine interface.""" | ||||
name = "Base" | name = "Base" | ||||
def __init__(self, cred, opener): | |||||
def __init__( | |||||
self, cred: dict[str, str], opener: urllib.request.OpenerDirector | |||||
) -> None: | |||||
"""Store credentials (*cred*) and *opener* for searching later on.""" | """Store credentials (*cred*) and *opener* for searching later on.""" | ||||
self.cred = cred | self.cred = cred | ||||
self.opener = opener | self.opener = opener | ||||
self.count = 5 | self.count = 5 | ||||
def __repr__(self): | |||||
def __repr__(self) -> str: | |||||
"""Return the canonical string representation of the search engine.""" | """Return the canonical string representation of the search engine.""" | ||||
return f"{self.__class__.__name__}()" | return f"{self.__class__.__name__}()" | ||||
def __str__(self): | |||||
def __str__(self) -> str: | |||||
"""Return a nice string representation of the search engine.""" | """Return a nice string representation of the search engine.""" | ||||
return f"<{self.__class__.__name__}>" | return f"<{self.__class__.__name__}>" | ||||
def _open(self, *args): | |||||
def _open(self, url: str) -> bytes: | |||||
"""Open a URL (like urlopen) and try to return its contents.""" | """Open a URL (like urlopen) and try to return its contents.""" | ||||
try: | try: | ||||
response = self.opener.open(*args) | |||||
response = self.opener.open(url) | |||||
result = response.read() | result = response.read() | ||||
except (OSError, URLError) as exc: | except (OSError, URLError) as exc: | ||||
err = SearchQueryError(f"{self.name} Error: {exc}") | |||||
err.cause = exc | |||||
raise err | |||||
raise exceptions.SearchQueryError(f"{self.name} Error: {exc}") | |||||
if response.headers.get("Content-Encoding") == "gzip": | if response.headers.get("Content-Encoding") == "gzip": | ||||
stream = StringIO(result) | |||||
gzipper = GzipFile(fileobj=stream) | |||||
stream = io.BytesIO(result) | |||||
gzipper = gzip.GzipFile(fileobj=stream) | |||||
result = gzipper.read() | result = gzipper.read() | ||||
code = response.getcode() | code = response.getcode() | ||||
if code != 200: | if code != 200: | ||||
err = "{0} Error: got response code '{1}':\n{2}'" | |||||
raise SearchQueryError(err.format(self.name, code, result)) | |||||
raise exceptions.SearchQueryError( | |||||
f"{self.name} Error: got response code '{code}':\n{result}'" | |||||
) | |||||
return result | return result | ||||
@staticmethod | @staticmethod | ||||
def requirements(): | |||||
def requirements() -> list[str]: | |||||
"""Return a list of packages required by this search engine.""" | """Return a list of packages required by this search engine.""" | ||||
return [] | return [] | ||||
def search(self, query): | |||||
"""Use this engine to search for *query*. | |||||
@abstractmethod | |||||
def search(self, query: str) -> list[str]: | |||||
""" | |||||
Use this engine to search for *query*. | |||||
Not implemented in this base class; overridden in subclasses. | Not implemented in this base class; overridden in subclasses. | ||||
""" | """ | ||||
raise NotImplementedError() | |||||
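For reviewers, a minimal sketch of what a subclass of the new ABC has to provide. `EchoSearchEngine` and its canned result are purely illustrative and not part of this change:

class EchoSearchEngine(SearchEngine):
    """Toy engine that returns a canned result list (illustration only)."""

    name = "Echo"

    def search(self, query: str) -> list[str]:
        # A real engine would build an API URL from self.cred, fetch it with
        # self._open(), and parse the response; here we just echo the query.
        return ["https://example.com/?q=" + urllib.parse.quote(query)]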
class BingSearchEngine(_BaseSearchEngine): | |||||
class BingSearchEngine(SearchEngine): | |||||
"""A search engine interface with Bing Search (via Azure Marketplace).""" | """A search engine interface with Bing Search (via Azure Marketplace).""" | ||||
name = "Bing" | name = "Bing" | ||||
def __init__(self, cred, opener): | |||||
def __init__( | |||||
self, cred: dict[str, str], opener: urllib.request.OpenerDirector | |||||
) -> None: | |||||
super().__init__(cred, opener) | super().__init__(cred, opener) | ||||
key = self.cred["key"] | key = self.cred["key"] | ||||
auth = (key + ":" + key).encode("base64").replace("\n", "") | |||||
self.opener.addheaders.append(("Authorization", "Basic " + auth)) | |||||
auth = base64.b64encode(f"{key}:{key}".encode()).decode() | |||||
self.opener.addheaders.append(("Authorization", f"Basic {auth}")) | |||||
def search(self, query: str) -> list[str]: | def search(self, query: str) -> list[str]: | ||||
"""Do a Bing web search for *query*. | |||||
""" | |||||
Do a Bing web search for *query*. | |||||
Returns a list of URLs ranked by relevance (as determined by Bing). | Returns a list of URLs ranked by relevance (as determined by Bing). | ||||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | ||||
@@ -112,20 +122,19 @@ class BingSearchEngine(_BaseSearchEngine): | |||||
params = { | params = { | ||||
"$format": "json", | "$format": "json", | ||||
"$top": str(self.count), | "$top": str(self.count), | ||||
"Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'", | |||||
"Query": "'\"" + query.replace('"', "") + "\"'", | |||||
"Market": "'en-US'", | "Market": "'en-US'", | ||||
"Adult": "'Off'", | "Adult": "'Off'", | ||||
"Options": "'DisableLocationDetection'", | "Options": "'DisableLocationDetection'", | ||||
"WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'", | "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'", | ||||
} | } | ||||
result = self._open(url + urlencode(params)) | |||||
result = self._open(url + urllib.parse.urlencode(params)) | |||||
try: | try: | ||||
res = loads(result) | |||||
res = json.loads(result) | |||||
except ValueError: | except ValueError: | ||||
err = "Bing Error: JSON could not be decoded" | |||||
raise SearchQueryError(err) | |||||
raise exceptions.SearchQueryError("Bing Error: JSON could not be decoded") | |||||
try: | try: | ||||
results = res["d"]["results"] | results = res["d"]["results"] | ||||
@@ -134,13 +143,14 @@ class BingSearchEngine(_BaseSearchEngine): | |||||
return [result["Url"] for result in results] | return [result["Url"] for result in results] | ||||
class GoogleSearchEngine(_BaseSearchEngine): | |||||
class GoogleSearchEngine(SearchEngine): | |||||
"""A search engine interface with Google Search.""" | """A search engine interface with Google Search.""" | ||||
name = "Google" | name = "Google" | ||||
def search(self, query: str) -> list[str]: | def search(self, query: str) -> list[str]: | ||||
"""Do a Google web search for *query*. | |||||
""" | |||||
Do a Google web search for *query*. | |||||
Returns a list of URLs ranked by relevance (as determined by Google). | Returns a list of URLs ranked by relevance (as determined by Google). | ||||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | ||||
@@ -157,13 +167,13 @@ class GoogleSearchEngine(_BaseSearchEngine): | |||||
"fields": "items(link)", | "fields": "items(link)", | ||||
} | } | ||||
result = self._open(url + urlencode(params)) | |||||
result = self._open(url + urllib.parse.urlencode(params)) | |||||
try: | try: | ||||
res = loads(result) | |||||
res = json.loads(result) | |||||
except ValueError: | except ValueError: | ||||
err = "Google Error: JSON could not be decoded" | err = "Google Error: JSON could not be decoded" | ||||
raise SearchQueryError(err) | |||||
raise exceptions.SearchQueryError(err) | |||||
try: | try: | ||||
return [item["link"] for item in res["items"]] | return [item["link"] for item in res["items"]] | ||||
@@ -171,7 +181,7 @@ class GoogleSearchEngine(_BaseSearchEngine): | |||||
return [] | return [] | ||||
class YandexSearchEngine(_BaseSearchEngine): | |||||
class YandexSearchEngine(SearchEngine): | |||||
"""A search engine interface with Yandex Search.""" | """A search engine interface with Yandex Search.""" | ||||
name = "Yandex" | name = "Yandex" | ||||
@@ -181,7 +191,8 @@ class YandexSearchEngine(_BaseSearchEngine): | |||||
return ["lxml.etree"] | return ["lxml.etree"] | ||||
def search(self, query: str) -> list[str]: | def search(self, query: str) -> list[str]: | ||||
"""Do a Yandex web search for *query*. | |||||
""" | |||||
Do a Yandex web search for *query*. | |||||
Returns a list of URLs ranked by relevance (as determined by Yandex). | Returns a list of URLs ranked by relevance (as determined by Yandex). | ||||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | ||||
@@ -201,17 +212,51 @@ class YandexSearchEngine(_BaseSearchEngine): | |||||
"groupby": f"mode=flat.groups-on-page={self.count}", | "groupby": f"mode=flat.groups-on-page={self.count}", | ||||
} | } | ||||
result = self._open(url + urlencode(params)) | |||||
result = self._open(url + urllib.parse.urlencode(params)) | |||||
try: | try: | ||||
data = lxml.etree.fromstring(result) # type: ignore | |||||
data = lxml.etree.fromstring(result) | |||||
return [elem.text for elem in data.xpath(".//url")] | return [elem.text for elem in data.xpath(".//url")] | ||||
except lxml.etree.Error as exc: | except lxml.etree.Error as exc: | ||||
raise SearchQueryError("Yandex XML parse error: " + str(exc)) | |||||
raise exceptions.SearchQueryError(f"Yandex XML parse error: {exc}") | |||||
SEARCH_ENGINES = { | |||||
SEARCH_ENGINES: dict[str, type[SearchEngine]] = { | |||||
"Bing": BingSearchEngine, | "Bing": BingSearchEngine, | ||||
"Google": GoogleSearchEngine, | "Google": GoogleSearchEngine, | ||||
"Yandex": YandexSearchEngine, | "Yandex": YandexSearchEngine, | ||||
} | } | ||||
def get_search_engine( | |||||
search_config: dict[str, Any], headers: list[tuple[str, str]] | |||||
) -> SearchEngine: | |||||
"""Return a function that can be called to do web searches. | |||||
The function takes one argument, a search query, and returns a list of URLs, ranked | |||||
by importance. The underlying logic depends on the *engine* argument within our | |||||
config; for example, if *engine* is "Yahoo! BOSS", we'll use YahooBOSSSearchEngine | |||||
for querying. | |||||
Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to | |||||
us, and UnsupportedSearchEngineError if we are missing a required package or | |||||
module, like oauth2 for "Yahoo! BOSS". | |||||
""" | |||||
engine = search_config["engine"] | |||||
if engine not in SEARCH_ENGINES: | |||||
raise exceptions.UnknownSearchEngineError(engine) | |||||
klass = SEARCH_ENGINES[engine] | |||||
credentials = search_config["credentials"] | |||||
opener = urllib.request.build_opener() | |||||
opener.addheaders = headers | |||||
for dep in klass.requirements(): | |||||
try: | |||||
__import__(dep).__name__ | |||||
except (ModuleNotFoundError, AttributeError): | |||||
e = "Missing a required dependency ({}) for the {} engine" | |||||
e = e.format(dep, engine) | |||||
raise exceptions.UnsupportedSearchEngineError(e) | |||||
return klass(credentials, opener) |
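A hedged sketch of calling the new factory. The credential keys shown are placeholders; the real keys depend on the engine chosen and are normally read from the bot's search config:

search_config = {
    "engine": "Yandex",  # must be a key of SEARCH_ENGINES
    "credentials": {"user": "example-user", "key": "example-api-key"},  # placeholder keys
}
headers = [("User-Agent", "EarwigBot/example"), ("Accept-Encoding", "gzip")]

engine = get_search_engine(search_config, headers)
urls = engine.search("some distinctive sentence from the article")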
@@ -18,59 +18,61 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
from __future__ import annotations | |||||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||||
import base64 | import base64 | ||||
import collections | import collections | ||||
import dataclasses | |||||
import functools | import functools | ||||
import gzip | |||||
import io | |||||
import logging | |||||
import math | |||||
import queue | |||||
import struct | |||||
import threading | |||||
import time | import time | ||||
import urllib.parse | import urllib.parse | ||||
from collections import deque | |||||
from gzip import GzipFile | |||||
import urllib.request | |||||
from collections.abc import Callable, Container | |||||
from dataclasses import dataclass | |||||
from http.client import HTTPException | from http.client import HTTPException | ||||
from io import StringIO | |||||
from logging import getLogger | |||||
from math import log | |||||
from queue import Empty, Queue | |||||
from struct import error as struct_error | |||||
from threading import Lock, Thread | |||||
from typing import Any | |||||
from urllib.error import URLError | from urllib.error import URLError | ||||
from urllib.request import Request, build_opener | |||||
from earwigbot import importer | |||||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | ||||
from earwigbot.wiki.copyvios.markov import ( | from earwigbot.wiki.copyvios.markov import ( | ||||
DEFAULT_DEGREE, | |||||
MarkovChain, | MarkovChain, | ||||
MarkovChainIntersection, | MarkovChainIntersection, | ||||
MarkovChainUnion, | MarkovChainUnion, | ||||
) | ) | ||||
from earwigbot.wiki.copyvios.parsers import get_parser | |||||
from earwigbot.wiki.copyvios.parsers import ParserArgs, SourceParser, get_parser | |||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | ||||
tldextract = importer.new("tldextract") | |||||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||||
INCLUDE_THRESHOLD = 0.15 | INCLUDE_THRESHOLD = 0.15 | ||||
_MAX_REDIRECTS = 3 | _MAX_REDIRECTS = 3 | ||||
_MAX_RAW_SIZE = 20 * 1024**2 | _MAX_RAW_SIZE = 20 * 1024**2 | ||||
_is_globalized = False | _is_globalized = False | ||||
_global_queues = None | |||||
_global_workers = [] | |||||
_global_queues: _CopyvioQueues | None = None | |||||
_global_workers: list[_CopyvioWorker] = [] | |||||
_OpenedURL = collections.namedtuple("_OpenedURL", ["content", "parser_class"]) | |||||
def globalize(num_workers: int = 8) -> None: | |||||
""" | |||||
Cause all copyvio checks to be done by one global set of workers. | |||||
def globalize(num_workers=8): | |||||
"""Cause all copyvio checks to be done by one global set of workers. | |||||
This is useful when checks are being done through a web interface where | |||||
large numbers of simulatenous requests could be problematic. The global | |||||
workers are spawned when the function is called, run continuously, and | |||||
intelligently handle multiple checks. | |||||
This is useful when checks are being done through a web interface where large | |||||
numbers of simultaneous requests could be problematic. The global workers are | |||||
spawned when the function is called, run continuously, and intelligently handle | |||||
multiple checks. | |||||
This function is not thread-safe and should only be called when no checks | |||||
are being done. It has no effect if it has already been called. | |||||
This function is not thread-safe and should only be called when no checks are being | |||||
done. It has no effect if it has already been called. | |||||
""" | """ | ||||
global _is_globalized, _global_queues | global _is_globalized, _global_queues | ||||
if _is_globalized: | if _is_globalized: | ||||
@@ -84,19 +86,20 @@ def globalize(num_workers=8): | |||||
_is_globalized = True | _is_globalized = True | ||||
def localize(): | |||||
def localize() -> None: | |||||
"""Return to using page-specific workers for copyvio checks. | """Return to using page-specific workers for copyvio checks. | ||||
This disables changes made by :func:`globalize`, including stoping the | |||||
global worker threads. | |||||
This disables changes made by :func:`globalize`, including stopping the global | |||||
worker threads. | |||||
This function is not thread-safe and should only be called when no checks | |||||
are being done. | |||||
This function is not thread-safe and should only be called when no checks are | |||||
being done. | |||||
""" | """ | ||||
global _is_globalized, _global_queues, _global_workers | global _is_globalized, _global_queues, _global_workers | ||||
if not _is_globalized: | if not _is_globalized: | ||||
return | return | ||||
assert _global_queues is not None | |||||
for i in range(len(_global_workers)): | for i in range(len(_global_workers)): | ||||
_global_queues.unassigned.put((StopIteration, None)) | _global_queues.unassigned.put((StopIteration, None)) | ||||
_global_queues = None | _global_queues = None | ||||
@@ -104,30 +107,50 @@ def localize(): | |||||
_is_globalized = False | _is_globalized = False | ||||
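A short sketch of the intended lifecycle for a long-running web frontend, based on the docstrings above; `serve_requests` is a hypothetical stand-in for the application's request loop:

from earwigbot.wiki.copyvios.workers import globalize, localize

globalize(num_workers=8)  # spawn one shared pool of copyvio worker threads
try:
    serve_requests()  # hypothetical: each check now reuses the global workers
finally:
    localize()  # stop the global workers once no checks are in flight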
@dataclass(frozen=True) | |||||
class OpenedURL: | |||||
content: bytes | |||||
parser_class: type[SourceParser] | |||||
SourceQueue = collections.deque[CopyvioSource] | |||||
UnassignedQueue = queue.Queue[ | |||||
tuple[str, SourceQueue] | tuple[type[StopIteration], None] | |||||
] | |||||
@dataclass(frozen=True) | |||||
class _CopyvioQueues: | class _CopyvioQueues: | ||||
"""Stores data necessary to maintain the various queues during a check.""" | """Stores data necessary to maintain the various queues during a check.""" | ||||
def __init__(self): | |||||
self.lock = Lock() | |||||
self.sites = {} | |||||
self.unassigned = Queue() | |||||
lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) | |||||
sites: dict[str, SourceQueue] = dataclasses.field(default_factory=dict) | |||||
unassigned: UnassignedQueue = dataclasses.field(default_factory=queue.Queue) | |||||
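One note on the dataclass conversion above: because the fields use `default_factory`, every `_CopyvioQueues()` instance still gets its own lock, site map, and unassigned queue, matching the old `__init__` behavior. A tiny illustration:

a, b = _CopyvioQueues(), _CopyvioQueues()
assert a.lock is not b.lock
assert a.sites is not b.sites and a.unassigned is not b.unassigned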
class _CopyvioWorker: | class _CopyvioWorker: | ||||
"""A multithreaded URL opener/parser instance.""" | """A multithreaded URL opener/parser instance.""" | ||||
def __init__(self, name, queues, until=None): | |||||
def __init__( | |||||
self, name: str, queues: _CopyvioQueues, until: float | None = None | |||||
) -> None: | |||||
self._name = name | self._name = name | ||||
self._queues = queues | self._queues = queues | ||||
self._until = until | self._until = until | ||||
self._site = None | |||||
self._queue = None | |||||
self._search_config = None | |||||
self._opener = build_opener() | |||||
self._logger = getLogger("earwigbot.wiki.cvworker." + name) | |||||
self._site: str | None = None | |||||
self._queue: SourceQueue | None = None | |||||
self._search_config: dict[str, Any] | None = None | |||||
self._opener = urllib.request.build_opener() | |||||
self._logger = logging.getLogger("earwigbot.wiki.cvworker." + name) | |||||
def _try_map_proxy_url(self, url, parsed, extra_headers, is_error=False): | |||||
def _try_map_proxy_url( | |||||
self, | |||||
url: str, | |||||
parsed: urllib.parse.ParseResult, | |||||
extra_headers: dict[str, str], | |||||
is_error: bool = False, | |||||
) -> tuple[str, bool]: | |||||
if not self._search_config or "proxies" not in self._search_config: | if not self._search_config or "proxies" not in self._search_config: | ||||
return url, False | return url, False | ||||
for proxy_info in self._search_config["proxies"]: | for proxy_info in self._search_config["proxies"]: | ||||
@@ -152,17 +175,20 @@ class _CopyvioWorker: | |||||
return url, True | return url, True | ||||
return url, False | return url, False | ||||
def _open_url_raw(self, url, timeout=5, allow_content_types=None): | |||||
def _open_url_raw( | |||||
self, | |||||
url: str, | |||||
timeout: float = 5, | |||||
allow_content_types: Container[str] | None = None, | |||||
) -> OpenedURL | None: | |||||
"""Open a URL, without parsing it. | """Open a URL, without parsing it. | ||||
None will be returned for URLs that cannot be read for whatever reason. | None will be returned for URLs that cannot be read for whatever reason. | ||||
""" | """ | ||||
parsed = urllib.parse.urlparse(url) | parsed = urllib.parse.urlparse(url) | ||||
if not isinstance(url, str): | |||||
url = url.encode("utf8") | |||||
extra_headers = {} | |||||
extra_headers: dict[str, str] = {} | |||||
url, _ = self._try_map_proxy_url(url, parsed, extra_headers) | url, _ = self._try_map_proxy_url(url, parsed, extra_headers) | ||||
request = Request(url, headers=extra_headers) | |||||
request = urllib.request.Request(url, headers=extra_headers) | |||||
try: | try: | ||||
response = self._opener.open(request, timeout=timeout) | response = self._opener.open(request, timeout=timeout) | ||||
except (OSError, URLError, HTTPException, ValueError): | except (OSError, URLError, HTTPException, ValueError): | ||||
@@ -170,14 +196,14 @@ class _CopyvioWorker: | |||||
url, parsed, extra_headers, is_error=True | url, parsed, extra_headers, is_error=True | ||||
) | ) | ||||
if not remapped: | if not remapped: | ||||
self._logger.exception("Failed to fetch URL: %s", url) | |||||
self._logger.exception(f"Failed to fetch URL: {url}") | |||||
return None | return None | ||||
self._logger.info("Failed to fetch URL, trying proxy remap: %s", url) | |||||
request = Request(url, headers=extra_headers) | |||||
self._logger.info(f"Failed to fetch URL, trying proxy remap: {url}") | |||||
request = urllib.request.Request(url, headers=extra_headers) | |||||
try: | try: | ||||
response = self._opener.open(request, timeout=timeout) | response = self._opener.open(request, timeout=timeout) | ||||
except (OSError, URLError, HTTPException, ValueError): | except (OSError, URLError, HTTPException, ValueError): | ||||
self._logger.exception("Failed to fetch URL after proxy remap: %s", url) | |||||
self._logger.exception(f"Failed to fetch URL after proxy remap: {url}") | |||||
return None | return None | ||||
try: | try: | ||||
@@ -193,7 +219,7 @@ class _CopyvioWorker: | |||||
): | ): | ||||
return None | return None | ||||
if not parser_class: | if not parser_class: | ||||
parser_class = get_parser("text/plain") | |||||
parser_class = get_parser() | |||||
if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024**2: | if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024**2: | ||||
return None | return None | ||||
@@ -207,28 +233,27 @@ class _CopyvioWorker: | |||||
return None | return None | ||||
if response.headers.get("Content-Encoding") == "gzip": | if response.headers.get("Content-Encoding") == "gzip": | ||||
stream = StringIO(content) | |||||
gzipper = GzipFile(fileobj=stream) | |||||
stream = io.BytesIO(content) | |||||
gzipper = gzip.GzipFile(fileobj=stream) | |||||
try: | try: | ||||
content = gzipper.read() | content = gzipper.read() | ||||
except (OSError, struct_error): | |||||
except (OSError, struct.error): | |||||
return None | return None | ||||
if len(content) > _MAX_RAW_SIZE: | if len(content) > _MAX_RAW_SIZE: | ||||
return None | return None | ||||
return _OpenedURL(content, parser_class) | |||||
return OpenedURL(content, parser_class) | |||||
def _open_url(self, source, redirects=0): | |||||
def _open_url(self, source: CopyvioSource, redirects: int = 0) -> str | None: | |||||
"""Open a URL and return its parsed content, or None. | """Open a URL and return its parsed content, or None. | ||||
First, we will decompress the content if the headers contain "gzip" as | |||||
its content encoding. Then, we will return the content stripped using | |||||
an HTML parser if the headers indicate it is HTML, or return the | |||||
content directly if it is plain text. If we don't understand the | |||||
content type, we'll return None. | |||||
First, we will decompress the content if the headers contain "gzip" as its | |||||
content encoding. Then, we will return the content stripped using an HTML | |||||
parser if the headers indicate it is HTML, or return the content directly if it | |||||
is plain text. If we don't understand the content type, we'll return None. | |||||
If a URLError was raised while opening the URL or an IOError was raised | |||||
while decompressing, None will be returned. | |||||
If a URLError was raised while opening the URL or an IOError was raised while | |||||
decompressing, None will be returned. | |||||
""" | """ | ||||
self._search_config = source.search_config | self._search_config = source.search_config | ||||
if source.headers: | if source.headers: | ||||
@@ -238,9 +263,9 @@ class _CopyvioWorker: | |||||
if result is None: | if result is None: | ||||
return None | return None | ||||
args = source.parser_args.copy() if source.parser_args else {} | |||||
args: ParserArgs = source.parser_args.copy() if source.parser_args else {} | |||||
args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) | args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) | ||||
parser = result.parser_class(result.content, url=source.url, args=args) | |||||
parser = result.parser_class(result.content, source.url, args=args) | |||||
try: | try: | ||||
return parser.parse() | return parser.parse() | ||||
except ParserRedirectError as exc: | except ParserRedirectError as exc: | ||||
@@ -249,30 +274,31 @@ class _CopyvioWorker: | |||||
source.url = exc.url.decode("utf8") | source.url = exc.url.decode("utf8") | ||||
return self._open_url(source, redirects=redirects + 1) | return self._open_url(source, redirects=redirects + 1) | ||||
def _acquire_new_site(self): | |||||
def _acquire_new_site(self) -> None: | |||||
"""Block for a new unassigned site queue.""" | """Block for a new unassigned site queue.""" | ||||
if self._until: | if self._until: | ||||
timeout = self._until - time.time() | timeout = self._until - time.time() | ||||
if timeout <= 0: | if timeout <= 0: | ||||
raise Empty | |||||
raise queue.Empty() | |||||
else: | else: | ||||
timeout = None | timeout = None | ||||
self._logger.debug("Waiting for new site queue") | self._logger.debug("Waiting for new site queue") | ||||
site, queue = self._queues.unassigned.get(timeout=timeout) | |||||
if site is StopIteration: | |||||
site, q = self._queues.unassigned.get(timeout=timeout) | |||||
if isinstance(site, type) and issubclass(site, StopIteration): | |||||
raise StopIteration | raise StopIteration | ||||
self._logger.debug(f"Acquired new site queue: {site}") | self._logger.debug(f"Acquired new site queue: {site}") | ||||
self._site = site | self._site = site | ||||
self._queue = queue | |||||
self._queue = q | |||||
def _dequeue(self): | |||||
def _dequeue(self) -> CopyvioSource: | |||||
"""Remove a source from one of the queues.""" | """Remove a source from one of the queues.""" | ||||
if not self._site: | if not self._site: | ||||
self._acquire_new_site() | self._acquire_new_site() | ||||
assert self._site is not None | |||||
assert self._queue is not None | |||||
logmsg = "Fetching source URL from queue {0}" | |||||
self._logger.debug(logmsg.format(self._site)) | |||||
self._logger.debug(f"Fetching source URL from queue {self._site}") | |||||
self._queues.lock.acquire() | self._queues.lock.acquire() | ||||
try: | try: | ||||
source = self._queue.popleft() | source = self._queue.popleft() | ||||
@@ -294,11 +320,11 @@ class _CopyvioWorker: | |||||
self._queues.lock.release() | self._queues.lock.release() | ||||
return source | return source | ||||
def _handle_once(self): | |||||
"""Handle a single source from one of the queues.""" | |||||
def _handle_once(self) -> bool: | |||||
"""Handle a single source from one of the queues. Return if we should exit.""" | |||||
try: | try: | ||||
source = self._dequeue() | source = self._dequeue() | ||||
except Empty: | |||||
except queue.Empty: | |||||
self._logger.debug("Exiting: queue timed out") | self._logger.debug("Exiting: queue timed out") | ||||
return False | return False | ||||
except StopIteration: | except StopIteration: | ||||
@@ -320,12 +346,11 @@ class _CopyvioWorker: | |||||
source.workspace.compare(source, chain) | source.workspace.compare(source, chain) | ||||
return True | return True | ||||
def _run(self): | |||||
def _run(self) -> None: | |||||
"""Main entry point for the worker thread. | """Main entry point for the worker thread. | ||||
We will keep fetching URLs from the queues and handling them until | |||||
either we run out of time, or we get an exit signal that the queue is | |||||
now empty. | |||||
We will keep fetching URLs from the queues and handling them until either we | |||||
run out of time, or we get an exit signal that the queue is now empty. | |||||
""" | """ | ||||
while True: | while True: | ||||
try: | try: | ||||
@@ -335,9 +360,9 @@ class _CopyvioWorker: | |||||
self._logger.exception("Uncaught exception in worker") | self._logger.exception("Uncaught exception in worker") | ||||
time.sleep(5) # Delay if we get stuck in a busy loop | time.sleep(5) # Delay if we get stuck in a busy loop | ||||
def start(self): | |||||
def start(self) -> None: | |||||
"""Start the copyvio worker in a new thread.""" | """Start the copyvio worker in a new thread.""" | ||||
thread = Thread(target=self._run, name="cvworker-" + self._name) | |||||
thread = threading.Thread(target=self._run, name="cvworker-" + self._name) | |||||
thread.daemon = True | thread.daemon = True | ||||
thread.start() | thread.start() | ||||
@@ -347,20 +372,20 @@ class CopyvioWorkspace: | |||||
def __init__( | def __init__( | ||||
self, | self, | ||||
article, | |||||
min_confidence, | |||||
max_time, | |||||
logger, | |||||
headers, | |||||
url_timeout=5, | |||||
num_workers=8, | |||||
short_circuit=True, | |||||
parser_args=None, | |||||
exclude_check=None, | |||||
config=None, | |||||
degree=5, | |||||
): | |||||
self.sources = [] | |||||
article: MarkovChain, | |||||
min_confidence: float, | |||||
max_time: float, | |||||
logger: logging.Logger, | |||||
headers: list[tuple[str, str]], | |||||
url_timeout: float = 5, | |||||
num_workers: int = 8, | |||||
short_circuit: bool = True, | |||||
parser_args: ParserArgs | None = None, | |||||
exclusion_callback: Callable[[str], bool] | None = None, | |||||
config: dict[str, Any] | None = None, | |||||
degree: int = DEFAULT_DEGREE, | |||||
) -> None: | |||||
self.sources: list[CopyvioSource] = [] | |||||
self.finished = False | self.finished = False | ||||
self.possible_miss = False | self.possible_miss = False | ||||
@@ -369,8 +394,8 @@ class CopyvioWorkspace: | |||||
self._min_confidence = min_confidence | self._min_confidence = min_confidence | ||||
self._start_time = time.time() | self._start_time = time.time() | ||||
self._until = (self._start_time + max_time) if max_time > 0 else None | self._until = (self._start_time + max_time) if max_time > 0 else None | ||||
self._handled_urls = set() | |||||
self._finish_lock = Lock() | |||||
self._handled_urls: set[str] = set() | |||||
self._finish_lock = threading.Lock() | |||||
self._short_circuit = short_circuit | self._short_circuit = short_circuit | ||||
self._source_args = { | self._source_args = { | ||||
"workspace": self, | "workspace": self, | ||||
@@ -379,10 +404,11 @@ class CopyvioWorkspace: | |||||
"parser_args": parser_args, | "parser_args": parser_args, | ||||
"search_config": config, | "search_config": config, | ||||
} | } | ||||
self._exclude_check = exclude_check | |||||
self._exclusion_callback = exclusion_callback | |||||
self._degree = degree | self._degree = degree | ||||
if _is_globalized: | if _is_globalized: | ||||
assert _global_queues is not None | |||||
self._queues = _global_queues | self._queues = _global_queues | ||||
else: | else: | ||||
self._queues = _CopyvioQueues() | self._queues = _CopyvioQueues() | ||||
@@ -391,28 +417,27 @@ class CopyvioWorkspace: | |||||
name = f"local-{id(self) % 10000:04}.{i}" | name = f"local-{id(self) % 10000:04}.{i}" | ||||
_CopyvioWorker(name, self._queues, self._until).start() | _CopyvioWorker(name, self._queues, self._until).start() | ||||
def _calculate_confidence(self, delta): | |||||
def _calculate_confidence(self, delta: MarkovChainIntersection) -> float: | |||||
"""Return the confidence of a violation as a float between 0 and 1.""" | """Return the confidence of a violation as a float between 0 and 1.""" | ||||
def conf_with_article_and_delta(article, delta): | |||||
def conf_with_article_and_delta(article: float, delta: float) -> float: | |||||
"""Calculate confidence using the article and delta chain sizes.""" | """Calculate confidence using the article and delta chain sizes.""" | ||||
# This piecewise function exhibits exponential growth until it | |||||
# reaches the default "suspect" confidence threshold, at which | |||||
# point it transitions to polynomial growth with a limit of 1 as | |||||
# (delta / article) approaches 1. | |||||
# This piecewise function exhibits exponential growth until it reaches the | |||||
# default "suspect" confidence threshold, at which point it transitions to | |||||
# polynomial growth with a limit of 1 as (delta / article) approaches 1. | |||||
# A graph can be viewed here: https://goo.gl/mKPhvr | # A graph can be viewed here: https://goo.gl/mKPhvr | ||||
ratio = delta / article | ratio = delta / article | ||||
if ratio <= 0.52763: | if ratio <= 0.52763: | ||||
return -log(1 - ratio) | |||||
return -math.log(1 - ratio) | |||||
else: | else: | ||||
return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009 | return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009 | ||||
def conf_with_delta(delta): | |||||
def conf_with_delta(delta: float) -> float: | |||||
"""Calculate confidence using just the delta chain size.""" | """Calculate confidence using just the delta chain size.""" | ||||
# This piecewise function was derived from experimental data using | # This piecewise function was derived from experimental data using | ||||
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9), | |||||
# and (1000, 0.95), with a limit of 1 as delta approaches infinity. | |||||
# A graph can be viewed here: https://goo.gl/lVl7or | |||||
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9), and | |||||
# (1000, 0.95), with a limit of 1 as delta approaches infinity. A graph can | |||||
# be viewed here: https://goo.gl/lVl7or | |||||
if delta <= 100: | if delta <= 100: | ||||
return delta / (delta + 100) | return delta / (delta + 100) | ||||
elif delta <= 250: | elif delta <= 250: | ||||
@@ -430,7 +455,7 @@ class CopyvioWorkspace: | |||||
) | ) | ||||
) | ) | ||||
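A small numeric sanity check of the piecewise confidence curve above (illustrative only, not part of the change): the two branches agree at the crossover ratio, and the polynomial branch reaches 1 when the delta covers the whole article.

import math

ratio = 0.52763  # crossover point used in conf_with_article_and_delta()
log_branch = -math.log(1 - ratio)                                # ~0.75
poly_branch = (-0.8939 * ratio**2) + (1.8948 * ratio) - 0.0009   # ~0.75
assert abs(log_branch - poly_branch) < 1e-3

full_overlap = (-0.8939 * 1**2) + (1.8948 * 1) - 0.0009
assert abs(full_overlap - 1.0) < 1e-3  # limit of 1 as the ratio approaches 1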
def _finish_early(self): | |||||
def _finish_early(self) -> None: | |||||
"""Finish handling links prematurely (if we've hit min_confidence).""" | """Finish handling links prematurely (if we've hit min_confidence).""" | ||||
self._logger.debug("Confidence threshold met; skipping remaining sources") | self._logger.debug("Confidence threshold met; skipping remaining sources") | ||||
with self._queues.lock: | with self._queues.lock: | ||||
@@ -438,7 +463,7 @@ class CopyvioWorkspace: | |||||
source.skip() | source.skip() | ||||
self.finished = True | self.finished = True | ||||
def enqueue(self, urls): | |||||
def enqueue(self, urls: list[str]) -> None: | |||||
"""Put a list of URLs into the various worker queues.""" | """Put a list of URLs into the various worker queues.""" | ||||
for url in urls: | for url in urls: | ||||
with self._queues.lock: | with self._queues.lock: | ||||
@@ -449,7 +474,7 @@ class CopyvioWorkspace: | |||||
source = CopyvioSource(url=url, **self._source_args) | source = CopyvioSource(url=url, **self._source_args) | ||||
self.sources.append(source) | self.sources.append(source) | ||||
if self._exclude_check and self._exclude_check(url): | |||||
if self._exclusion_callback and self._exclusion_callback(url): | |||||
self._logger.debug(f"enqueue(): exclude {url}") | self._logger.debug(f"enqueue(): exclude {url}") | ||||
source.excluded = True | source.excluded = True | ||||
source.skip() | source.skip() | ||||
@@ -460,32 +485,37 @@ class CopyvioWorkspace: | |||||
continue | continue | ||||
try: | try: | ||||
import tldextract | |||||
key = tldextract.extract(url).registered_domain | key = tldextract.extract(url).registered_domain | ||||
except ImportError: # Fall back on very naive method | |||||
except ModuleNotFoundError: # Fall back on very naive method | |||||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||||
key = ".".join(urlparse(url).netloc.split(".")[-2:]) | key = ".".join(urlparse(url).netloc.split(".")[-2:]) | ||||
logmsg = "enqueue(): {0} {1} -> {2}" | |||||
logmsg = f"enqueue(): %s {key} -> {url}" | |||||
if key in self._queues.sites: | if key in self._queues.sites: | ||||
self._logger.debug(logmsg.format("append", key, url)) | |||||
self._logger.debug(logmsg % "append") | |||||
self._queues.sites[key].append(source) | self._queues.sites[key].append(source) | ||||
else: | else: | ||||
self._logger.debug(logmsg.format("new", key, url)) | |||||
self._queues.sites[key] = queue = deque() | |||||
queue.append(source) | |||||
self._queues.unassigned.put((key, queue)) | |||||
self._logger.debug(logmsg % "new") | |||||
q: SourceQueue = collections.deque() | |||||
q.append(source) | |||||
self._queues.sites[key] = q | |||||
self._queues.unassigned.put((key, q)) | |||||
def compare(self, source, source_chain): | |||||
def compare(self, source: CopyvioSource, source_chain: MarkovChain | None) -> None: | |||||
"""Compare a source to the article; call _finish_early if necessary.""" | """Compare a source to the article; call _finish_early if necessary.""" | ||||
if source_chain: | if source_chain: | ||||
delta = MarkovChainIntersection(self._article, source_chain) | delta = MarkovChainIntersection(self._article, source_chain) | ||||
conf = self._calculate_confidence(delta) | conf = self._calculate_confidence(delta) | ||||
else: | else: | ||||
delta = None | |||||
conf = 0.0 | conf = 0.0 | ||||
self._logger.debug(f"compare(): {source.url} -> {conf}") | self._logger.debug(f"compare(): {source.url} -> {conf}") | ||||
with self._finish_lock: | with self._finish_lock: | ||||
if source_chain: | if source_chain: | ||||
assert delta is not None | |||||
source.update(conf, source_chain, delta) | source.update(conf, source_chain, delta) | ||||
source.finish_work() | source.finish_work() | ||||
if not self.finished and conf >= self._min_confidence: | if not self.finished and conf >= self._min_confidence: | ||||
@@ -494,7 +524,7 @@ class CopyvioWorkspace: | |||||
else: | else: | ||||
self.finished = True | self.finished = True | ||||
def wait(self): | |||||
def wait(self) -> None: | |||||
"""Wait for the workers to finish handling the sources.""" | """Wait for the workers to finish handling the sources.""" | ||||
self._logger.debug(f"Waiting on {len(self.sources)} sources") | self._logger.debug(f"Waiting on {len(self.sources)} sources") | ||||
for source in self.sources: | for source in self.sources: | ||||
@@ -505,7 +535,7 @@ class CopyvioWorkspace: | |||||
for i in range(self._num_workers): | for i in range(self._num_workers): | ||||
self._queues.unassigned.put((StopIteration, None)) | self._queues.unassigned.put((StopIteration, None)) | ||||
def get_result(self, num_queries=0): | |||||
def get_result(self, num_queries: int = 0) -> CopyvioCheckResult: | |||||
"""Return a CopyvioCheckResult containing the results of this check.""" | """Return a CopyvioCheckResult containing the results of this check.""" | ||||
self.sources.sort( | self.sources.sort( | ||||
key=lambda s: ( | key=lambda s: ( | ||||
@@ -35,14 +35,14 @@ import mwparserfromhell | |||||
from earwigbot import exceptions | from earwigbot import exceptions | ||||
from earwigbot.exceptions import APIError | from earwigbot.exceptions import APIError | ||||
from earwigbot.wiki.copyvios import CopyvioMixIn | |||||
from earwigbot.wiki.copyvios import DEFAULT_DEGREE, CopyvioChecker, CopyvioCheckResult | |||||
if typing.TYPE_CHECKING: | if typing.TYPE_CHECKING: | ||||
from earwigbot.wiki.site import Site | from earwigbot.wiki.site import Site | ||||
from earwigbot.wiki.user import User | from earwigbot.wiki.user import User | ||||
class Page(CopyvioMixIn): | |||||
class Page: | |||||
""" | """ | ||||
**EarwigBot: Wiki Toolset: Page** | **EarwigBot: Wiki Toolset: Page** | ||||
@@ -110,7 +110,6 @@ class Page(CopyvioMixIn): | |||||
__init__() will not do any API queries, but it will use basic namespace logic | __init__() will not do any API queries, but it will use basic namespace logic | ||||
to determine our namespace ID and if we are a talkpage. | to determine our namespace ID and if we are a talkpage. | ||||
""" | """ | ||||
super().__init__(site) | |||||
self._site = site | self._site = site | ||||
self._title = title.strip() | self._title = title.strip() | ||||
self._follow_redirects = self._keep_following = follow_redirects | self._follow_redirects = self._keep_following = follow_redirects | ||||
@@ -873,3 +872,108 @@ class Page(CopyvioMixIn): | |||||
return False | return False | ||||
return True | return True | ||||
def copyvio_check( | |||||
self, | |||||
min_confidence: float = 0.75, | |||||
max_queries: int = 15, | |||||
max_time: float = -1, | |||||
no_searches: bool = False, | |||||
no_links: bool = False, | |||||
short_circuit: bool = True, | |||||
degree: int = DEFAULT_DEGREE, | |||||
) -> CopyvioCheckResult: | |||||
""" | |||||
Check the page for copyright violations. | |||||
Returns a :class:`.CopyvioCheckResult` object with information on the results | |||||
of the check. | |||||
*min_confidence* is the minimum amount of confidence we must have in the | |||||
similarity between a source text and the article in order for us to consider it | |||||
a suspected violation. This is a number between 0 and 1. | |||||
*max_queries* is self-explanatory; we will never make more than this number of | |||||
queries in a given check. | |||||
*max_time* can be set to prevent copyvio checks from taking longer than a set | |||||
amount of time (generally around a minute), which can be useful if checks are | |||||
called through a web server with timeouts. We will stop checking new URLs as | |||||
soon as this limit is reached. | |||||
Setting *no_searches* to ``True`` will cause only URLs in the wikitext of the | |||||
page to be checked; no search engine queries will be made. Setting *no_links* | |||||
to ``True`` will cause the opposite to happen: URLs in the wikitext will be | |||||
ignored; search engine queries will be made only. Setting both of these to | |||||
``True`` is pointless. | |||||
Normally, the checker will short-circuit if it finds a URL that meets | |||||
*min_confidence*, skipping any remaining URLs and web queries; setting | |||||
*short_circuit* to ``False`` will prevent this. | |||||
The *degree* controls the n-gram word size used in comparing similarity. It | |||||
should usually be a number between 3 and 5. | |||||
Raises :exc:`.CopyvioCheckError` or subclasses | |||||
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on errors. | |||||
""" | |||||
self._logger.info(f"Starting copyvio check for [[{self.title}]]") | |||||
checker = CopyvioChecker( | |||||
self, | |||||
min_confidence=min_confidence, | |||||
max_time=max_time, | |||||
degree=degree, | |||||
logger=self._logger, | |||||
) | |||||
result = checker.run_check( | |||||
max_queries=max_queries, | |||||
no_searches=no_searches, | |||||
no_links=no_links, | |||||
short_circuit=short_circuit, | |||||
) | |||||
self._logger.info(result.get_log_message(self.title)) | |||||
return result | |||||
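A hedged usage sketch of the new method, assuming `bot` is a configured Bot instance and following the toolset's usual `bot.wiki.get_site()` / `get_page()` pattern; the title is a placeholder:

site = bot.wiki.get_site()
page = site.get_page("Example article")

result = page.copyvio_check(min_confidence=0.75, max_queries=10, max_time=45)
if result.violation:
    print(f"Possible violation of {result.url} at {result.confidence:.2f} confidence")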
def copyvio_compare( | |||||
self, | |||||
urls: list[str] | str, | |||||
min_confidence: float = 0.75, | |||||
max_time: float = 30, | |||||
degree: int = DEFAULT_DEGREE, | |||||
) -> CopyvioCheckResult: | |||||
""" | |||||
Check the page, like :py:meth:`copyvio_check`, against specific URLs. | |||||
This is essentially a reduced version of :meth:`copyvio_check` - a copyvio | |||||
comparison is made using Markov chains and the result is returned in a | |||||
:class:`.CopyvioCheckResult` object - but without using a search engine, since | |||||
the suspected "violated" URL is supplied from the start. | |||||
One use case is to generate a result when the URL is retrieved from a cache, | |||||
like the one used in EarwigBot's Toolforge site. After a search is done, the | |||||
resulting URL is stored in a cache for 72 hours so future checks against that | |||||
page will not require another set of time-and-money-consuming search engine | |||||
queries. However, the comparison itself (which includes the article's and the | |||||
source's content) cannot be stored for data retention reasons, so a fresh | |||||
comparison is made using this function. | |||||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` nor | |||||
:exc:`.SearchQueryError` will be raised. | |||||
""" | |||||
if not isinstance(urls, list): | |||||
urls = [urls] | |||||
self._logger.info( | |||||
f"Starting copyvio compare for [[{self.title}]] against {', '.join(urls)}" | |||||
) | |||||
checker = CopyvioChecker( | |||||
self, | |||||
min_confidence=min_confidence, | |||||
max_time=max_time, | |||||
degree=degree, | |||||
logger=self._logger, | |||||
) | |||||
result = checker.run_compare(urls) | |||||
self._logger.info(result.get_log_message(self.title)) | |||||
return result |
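And the matching sketch for the compare path, e.g. re-checking a URL pulled from a cache; the URL is a placeholder and `page` is the same object as in the previous sketch:

cached_url = "https://example.com/previously-found-source"
result = page.copyvio_compare(cached_url, min_confidence=0.75, max_time=30)
print(result.get_log_message(page.title))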