
Python 3 conversion + typing for all of copyvios

tags/v0.4
Ben Kurtovic, 2 months ago
commit 2936ffbf5f
9 changed files with 765 additions and 555 deletions
  1. pyproject.toml (+0 -4)
  2. src/earwigbot/wiki/copyvios/__init__.py (+100 -166)
  3. src/earwigbot/wiki/copyvios/exclusions.py (+51 -35)
  4. src/earwigbot/wiki/copyvios/markov.py (+45 -31)
  5. src/earwigbot/wiki/copyvios/parsers.py (+143 -86)
  6. src/earwigbot/wiki/copyvios/result.py (+71 -57)
  7. src/earwigbot/wiki/copyvios/search.py (+93 -48)
  8. src/earwigbot/wiki/copyvios/workers.py (+155 -125)
  9. src/earwigbot/wiki/page.py (+107 -3)

pyproject.toml (+0 -4)

@@ -59,10 +59,6 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 
 [tool.pyright]
-exclude = [
-    # TODO
-    "src/earwigbot/wiki/copyvios"
-]
 pythonVersion = "3.11"
 venvPath = "."
 venv = "venv"


src/earwigbot/wiki/copyvios/__init__.py (+100 -166)

@@ -18,208 +18,142 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


__all__ = [
"DEFAULT_DEGREE",
"CopyvioChecker",
"CopyvioCheckResult",
"globalize",
"localize",
]

import functools
import logging
import time import time
from urllib.request import build_opener
from collections.abc import Callable


from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.search import SEARCH_ENGINES
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
from earwigbot.wiki.copyvios.markov import DEFAULT_DEGREE, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleParser, ParserArgs
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import SearchEngine, get_search_engine
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize from earwigbot.wiki.copyvios.workers import CopyvioWorkspace, globalize, localize
from earwigbot.wiki.page import Page


__all__ = ["CopyvioMixIn", "globalize", "localize"]



class CopyvioMixIn:
class CopyvioChecker:
""" """
**EarwigBot: Wiki Toolset: Copyright Violation MixIn**
Manages the lifecycle of a copyvio check or comparison.


This is a mixin that provides two public methods, :py:meth:`copyvio_check`
and :py:meth:`copyvio_compare`. The former checks the page for copyright
violations using a search engine API, and the latter compares the page
against a given URL. Credentials for the search engine API are stored in
the :py:class:`~earwigbot.wiki.site.Site`'s config.
Created by :py:class:`~earwigbot.wiki.page.Page` and handles the implementation
details of running a check.
""" """


def __init__(self, site):
self._search_config = site._search_config
self._exclusions_db = self._search_config.get("exclusions_db")
self._addheaders = [
("User-Agent", site.user_agent),
def __init__(
self,
page: Page,
*,
min_confidence: float = 0.75,
max_time: float = 30,
degree: int = DEFAULT_DEGREE,
logger: logging.Logger | None = None,
) -> None:
self._page = page
self._site = page.site
self._config = page.site._search_config
self._min_confidence = min_confidence
self._max_time = max_time
self._degree = degree
self._logger = logger or logging.getLogger("earwigbot.wiki")

self._headers = [
("User-Agent", page.site.user_agent),
("Accept-Encoding", "gzip"), ("Accept-Encoding", "gzip"),
] ]


def _get_search_engine(self):
"""Return a function that can be called to do web searches.

The function takes one argument, a search query, and returns a list of
URLs, ranked by importance. The underlying logic depends on the
*engine* argument within our config; for example, if *engine* is
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

Raises UnknownSearchEngineError if the 'engine' listed in our config is
unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS".
"""
engine = self._search_config["engine"]
if engine not in SEARCH_ENGINES:
raise exceptions.UnknownSearchEngineError(engine)

klass = SEARCH_ENGINES[engine]
credentials = self._search_config["credentials"]
opener = build_opener()
opener.addheaders = self._addheaders

for dep in klass.requirements():
try:
__import__(dep).__name__
except (ModuleNotFoundError, AttributeError):
e = "Missing a required dependency ({}) for the {} engine"
e = e.format(dep, engine)
raise exceptions.UnsupportedSearchEngineError(e)

return klass(credentials, opener)

def copyvio_check(
self,
min_confidence=0.75,
max_queries=15,
max_time=-1,
no_searches=False,
no_links=False,
short_circuit=True,
degree=5,
):
"""Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the
results of the check.

*min_confidence* is the minimum amount of confidence we must have in
the similarity between a source text and the article in order for us to
consider it a suspected violation. This is a number between 0 and 1.

*max_queries* is self-explanatory; we will never make more than this
number of queries in a given check.

*max_time* can be set to prevent copyvio checks from taking longer than
a set amount of time (generally around a minute), which can be useful
if checks are called through a web server with timeouts. We will stop
checking new URLs as soon as this limit is reached.

Setting *no_searches* to ``True`` will cause only URLs in the wikitext
of the page to be checked; no search engine queries will be made.
Setting *no_links* to ``True`` will cause the opposite to happen: URLs
in the wikitext will be ignored; search engine queries will be made
only. Setting both of these to ``True`` is pointless.

Normally, the checker will short-circuit if it finds a URL that meets
*min_confidence*. This behavior normally causes it to skip any
remaining URLs and web queries, but setting *short_circuit* to
``False`` will prevent this.

Raises :exc:`.CopyvioCheckError` or subclasses
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
errors.
"""
log = "Starting copyvio check for [[{0}]]"
self._logger.info(log.format(self.title))
searcher = self._get_search_engine()
parser = ArticleTextParser(
self.get(),
args={"nltk_dir": self._search_config["nltk_dir"], "lang": self._site.lang},
self._parser = ArticleParser(
self._page.get(),
lang=self._site.lang,
nltk_dir=self._config["nltk_dir"],
) )
article = MarkovChain(parser.strip(), degree=degree)
parser_args = {}
self._article = MarkovChain(self._parser.strip(), degree=self._degree)


if self._exclusions_db:
self._exclusions_db.sync(self.site.name)
@functools.cached_property
def _searcher(self) -> SearchEngine:
return get_search_engine(self._config, self._headers)


def exclude(u):
return self._exclusions_db.check(self.site.name, u)
@property
def _exclusions_db(self) -> ExclusionsDB | None:
return self._config.get("exclusions_db")


parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(self)
else:
exclude = None
def _get_exclusion_callback(self) -> Callable[[str], bool] | None:
if not self._exclusions_db:
return None
return functools.partial(self._exclusions_db.check, self._site.name)

def run_check(
self,
*,
max_queries: int = 15,
no_searches: bool = False,
no_links: bool = False,
short_circuit: bool = True,
) -> CopyvioCheckResult:
parser_args: ParserArgs = {}
if self._exclusions_db:
self._exclusions_db.sync(self._site.name)
mirror_hints = self._exclusions_db.get_mirror_hints(self._page)
parser_args["mirror_hints"] = mirror_hints


workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article,
min_confidence,
max_time,
self._logger,
self._addheaders,
self._article,
min_confidence=self._min_confidence,
max_time=self._max_time,
logger=self._logger,
headers=self._headers,
short_circuit=short_circuit, short_circuit=short_circuit,
parser_args=parser_args, parser_args=parser_args,
exclude_check=exclude,
config=self._search_config,
degree=degree,
exclusion_callback=self._get_exclusion_callback(),
config=self._config,
degree=self._degree,
) )


if article.size < 20: # Auto-fail very small articles
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))
return result
if self._article.size < 20: # Auto-fail very small articles
return workspace.get_result()


if not no_links: if not no_links:
workspace.enqueue(parser.get_links())
workspace.enqueue(self._parser.get_links())
num_queries = 0 num_queries = 0
if not no_searches: if not no_searches:
chunks = parser.chunk(max_queries)
chunks = self._parser.chunk(max_queries)
for chunk in chunks: for chunk in chunks:
if short_circuit and workspace.finished: if short_circuit and workspace.finished:
workspace.possible_miss = True workspace.possible_miss = True
break break
log = "[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))
workspace.enqueue(searcher.search(chunk))
self._logger.debug(
f"[[{self._page.title}]] -> querying {self._searcher.name} "
f"for {chunk!r}"
)
workspace.enqueue(self._searcher.search(chunk))
num_queries += 1 num_queries += 1
time.sleep(1)
time.sleep(1) # TODO: Check whether this is needed


workspace.wait() workspace.wait()
result = workspace.get_result(num_queries)
self._logger.info(result.get_log_message(self.title))
return result

def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5):
"""Check the page like :py:meth:`copyvio_check` against specific URLs.

This is essentially a reduced version of :meth:`copyvio_check` - a
copyivo comparison is made using Markov chains and the result is
returned in a :class:`.CopyvioCheckResult` object - but without using a
search engine, since the suspected "violated" URL is supplied from the
start.

Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Tool Labs site. After a
search is done, the resulting URL is stored in a cache for 72 hours so
future checks against that page will not require another set of
time-and-money-consuming search engine queries. However, the comparison
itself (which includes the article's and the source's content) cannot
be stored for data retention reasons, so a fresh comparison is made
using this function.

Since no searching is done, neither :exc:`.UnknownSearchEngineError`
nor :exc:`.SearchQueryError` will be raised.
"""
if not isinstance(urls, list):
urls = [urls]
log = "Starting copyvio compare for [[{0}]] against {1}"
self._logger.info(log.format(self.title, ", ".join(urls)))
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree)
return workspace.get_result(num_queries)

def run_compare(self, urls: list[str]) -> CopyvioCheckResult:
workspace = CopyvioWorkspace( workspace = CopyvioWorkspace(
article,
min_confidence,
max_time,
self._logger,
self._addheaders,
max_time,
self._article,
min_confidence=self._min_confidence,
max_time=self._max_time,
logger=self._logger,
headers=self._headers,
url_timeout=self._max_time,
num_workers=min(len(urls), 8), num_workers=min(len(urls), 8),
short_circuit=False, short_circuit=False,
config=self._search_config,
degree=degree,
config=self._config,
degree=self._degree,
) )

workspace.enqueue(urls) workspace.enqueue(urls)
workspace.wait() workspace.wait()
result = workspace.get_result()
self._logger.info(result.get_log_message(self.title))
return result
return workspace.get_result()
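
The mixin's copyvio_check/copyvio_compare entry points move into this new CopyvioChecker class; page.py (also changed in this commit) presumably keeps thin wrappers around it. Below is a minimal usage sketch of the interface as shown in this hunk, assuming a configured bot whose site search config has valid search-engine credentials and an nltk_dir; the config path, page title, and URL are placeholders.

    import logging

    from earwigbot.bot import Bot
    from earwigbot.wiki.copyvios import CopyvioChecker

    # Assumed setup: a standard Bot with a working config directory.
    bot = Bot("path/to/config")
    site = bot.wiki.get_site()
    page = site.get_page("Example article")

    logger = logging.getLogger("earwigbot.wiki")
    checker = CopyvioChecker(page, min_confidence=0.75, max_time=30, logger=logger)

    # Full check: search-engine queries plus external links from the wikitext.
    result = checker.run_check(max_queries=15)
    logger.info(result.get_log_message(page.title))

    # Comparison against known URLs only; no search engine is involved.
    result = checker.run_compare(["https://example.com/mirror-of-article"])
    if result.violation:
        print(result.url, result.confidence)
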

src/earwigbot/wiki/copyvios/exclusions.py (+51 -35)

@@ -18,15 +18,24 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from __future__ import annotations

__all__ = ["ExclusionsDB"]

import logging
import re import re
import sqlite3 import sqlite3
import threading import threading
import time import time
import typing
import urllib.parse import urllib.parse


from earwigbot import exceptions from earwigbot import exceptions


__all__ = ["ExclusionsDB"]
if typing.TYPE_CHECKING:
from earwigbot.wiki.page import Page
from earwigbot.wiki.site import Site
from earwigbot.wiki.sitesdb import SitesDB


DEFAULT_SOURCES = { DEFAULT_SOURCES = {
"all": [ # Applies to all, but located on enwiki "all": [ # Applies to all, but located on enwiki
@@ -52,26 +61,28 @@ class ExclusionsDB:
""" """
**EarwigBot: Wiki Toolset: Exclusions Database Manager** **EarwigBot: Wiki Toolset: Exclusions Database Manager**


Controls the :file:`exclusions.db` file, which stores URLs excluded from
copyright violation checks on account of being known mirrors, for example.
Controls the :file:`exclusions.db` file, which stores URLs excluded from copyright
violation checks on account of being known mirrors, for example.
""" """


def __init__(self, sitesdb, dbfile, logger):
def __init__(self, sitesdb: SitesDB, dbfile: str, logger: logging.Logger) -> None:
self._sitesdb = sitesdb self._sitesdb = sitesdb
self._dbfile = dbfile self._dbfile = dbfile
self._logger = logger self._logger = logger
self._db_access_lock = threading.Lock() self._db_access_lock = threading.Lock()


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the ExclusionsDB.""" """Return the canonical string representation of the ExclusionsDB."""
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
return res.format(self._sitesdb, self._dbfile, self._logger)
return (
f"ExclusionsDB(sitesdb={self._sitesdb!r}, dbfile={self._dbfile!r}, "
f"logger={self._logger!r})"
)


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the ExclusionsDB.""" """Return a nice string representation of the ExclusionsDB."""
return f"<ExclusionsDB at {self._dbfile}>" return f"<ExclusionsDB at {self._dbfile}>"


def _create(self):
def _create(self) -> None:
"""Initialize the exclusions database with its necessary tables.""" """Initialize the exclusions database with its necessary tables."""
script = """ script = """
CREATE TABLE sources (source_sitename, source_page); CREATE TABLE sources (source_sitename, source_page);
@@ -79,7 +90,7 @@ class ExclusionsDB:
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
""" """
query = "INSERT INTO sources VALUES (?, ?);" query = "INSERT INTO sources VALUES (?, ?);"
sources = []
sources: list[tuple[str, str]] = []
for sitename, pages in DEFAULT_SOURCES.items(): for sitename, pages in DEFAULT_SOURCES.items():
for page in pages: for page in pages:
sources.append((sitename, page)) sources.append((sitename, page))
@@ -88,9 +99,9 @@ class ExclusionsDB:
conn.executescript(script) conn.executescript(script)
conn.executemany(query, sources) conn.executemany(query, sources)


def _load_source(self, site, source):
def _load_source(self, site: Site, source: str) -> set[str]:
"""Load from a specific source and return a set of URLs.""" """Load from a specific source and return a set of URLs."""
urls = set()
urls: set[str] = set()
try: try:
data = site.get_page(source, follow_redirects=True).get() data = site.get_page(source, follow_redirects=True).get()
except exceptions.PageNotFoundError: except exceptions.PageNotFoundError:
@@ -123,7 +134,7 @@ class ExclusionsDB:
urls.add(url) urls.add(url)
return urls return urls


def _update(self, sitename):
def _update(self, sitename: str) -> None:
"""Update the database from listed sources in the index.""" """Update the database from listed sources in the index."""
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?" query1 = "SELECT source_page FROM sources WHERE source_sitename = ?"
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
@@ -140,7 +151,7 @@ class ExclusionsDB:
else: else:
site = self._sitesdb.get_site(sitename) site = self._sitesdb.get_site(sitename)
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: with self._db_access_lock, sqlite3.connect(self._dbfile) as conn:
urls = set()
urls: set[str] = set()
for (source,) in conn.execute(query1, (sitename,)): for (source,) in conn.execute(query1, (sitename,)):
urls |= self._load_source(site, source) urls |= self._load_source(site, source)
for (url,) in conn.execute(query2, (sitename,)): for (url,) in conn.execute(query2, (sitename,)):
@@ -154,7 +165,7 @@ class ExclusionsDB:
else: else:
conn.execute(query7, (sitename, int(time.time()))) conn.execute(query7, (sitename, int(time.time())))


def _get_last_update(self, sitename):
def _get_last_update(self, sitename: str) -> int:
"""Return the UNIX timestamp of the last time the db was updated.""" """Return the UNIX timestamp of the last time the db was updated."""
query = "SELECT update_time FROM updates WHERE update_sitename = ?" query = "SELECT update_time FROM updates WHERE update_sitename = ?"
with self._db_access_lock, sqlite3.connect(self._dbfile) as conn: with self._db_access_lock, sqlite3.connect(self._dbfile) as conn:
@@ -165,28 +176,34 @@ class ExclusionsDB:
return 0 return 0
return result[0] if result else 0 return result[0] if result else 0


def sync(self, sitename, force=False):
"""Update the database if it hasn't been updated recently.
def sync(self, sitename: str, force: bool = False) -> None:
"""
Update the database if it hasn't been updated recently.


This updates the exclusions database for the site *sitename* and "all". This updates the exclusions database for the site *sitename* and "all".


Site-specific lists are considered stale after 48 hours; global lists
after 12 hours.
Site-specific lists are considered stale after 48 hours; global lists after
12 hours.
""" """
max_staleness = 60 * 60 * (12 if sitename == "all" else 48) max_staleness = 60 * 60 * (12 if sitename == "all" else 48)
time_since_update = int(time.time() - self._get_last_update(sitename)) time_since_update = int(time.time() - self._get_last_update(sitename))
if force or time_since_update > max_staleness: if force or time_since_update > max_staleness:
log = "Updating stale database: {0} (last updated {1} seconds ago)"
self._logger.info(log.format(sitename, time_since_update))
self._logger.info(
f"Updating stale database: {sitename} (last updated "
f"{time_since_update} seconds ago)"
)
self._update(sitename) self._update(sitename)
else: else:
log = "Database for {0} is still fresh (last updated {1} seconds ago)"
self._logger.debug(log.format(sitename, time_since_update))
self._logger.debug(
f"Database for {sitename} is still fresh (last updated "
f"{time_since_update} seconds ago)"
)
if sitename != "all": if sitename != "all":
self.sync("all", force=force) self.sync("all", force=force)


def check(self, sitename, url):
"""Check whether a given URL is in the exclusions database.
def check(self, sitename: str, url: str) -> bool:
"""
Check whether a given URL is in the exclusions database.


Return ``True`` if the URL is in the database, or ``False`` otherwise. Return ``True`` if the URL is in the database, or ``False`` otherwise.
""" """
@@ -216,19 +233,18 @@ class ExclusionsDB:
else: else:
matches = normalized.startswith(excl) matches = normalized.startswith(excl)
if matches: if matches:
log = "Exclusion detected in {0} for {1}"
self._logger.debug(log.format(sitename, url))
self._logger.debug(f"Exclusion detected in {sitename} for {url}")
return True return True


log = f"No exclusions in {sitename} for {url}"
self._logger.debug(log)
self._logger.debug(f"No exclusions in {sitename} for {url}")
return False return False


def get_mirror_hints(self, page, try_mobile=True):
"""Return a list of strings that indicate the existence of a mirror.
def get_mirror_hints(self, page: Page, try_mobile: bool = True) -> list[str]:
"""
Return a list of strings that indicate the existence of a mirror.


The source parser checks for the presence of these strings inside of
certain HTML tag attributes (``"href"`` and ``"src"``).
The source parser checks for the presence of these strings inside of certain
HTML tag attributes (``"href"`` and ``"src"``).
""" """
site = page.site site = page.site
path = urllib.parse.urlparse(page.url).path path = urllib.parse.urlparse(page.url).path
@@ -238,10 +254,10 @@ class ExclusionsDB:
if try_mobile: if try_mobile:
fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain)
if fragments: if fragments:
roots.append("{}.m.{}.{}".format(*fragments.groups()))
roots.append(f"{fragments[1]}.m.{fragments[2]}.{fragments[3]}")


general = [ general = [
root + site._script_path + "/" + script
root + site.script_path + "/" + script
for root in roots for root in roots
for script in scripts for script in scripts
] ]
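
A short sketch of how the now-typed ExclusionsDB interface is consumed, continuing the setup from the previous sketch; the checker obtains the instance from the site's search config, so this assumes one was configured there.

    # Continuing the sketch above: the ExclusionsDB instance is expected to live
    # in the site's search config (as CopyvioChecker._exclusions_db assumes).
    exclusions_db = site._search_config.get("exclusions_db")

    if exclusions_db is not None:
        # Refresh the site-specific list (stale after 48 h) and, via recursion,
        # the global "all" list (stale after 12 h).
        exclusions_db.sync(site.name)

        # True if the URL matches an exclusion entry for this site.
        if exclusions_db.check(site.name, "https://example.com/some/page"):
            print("URL is excluded from copyvio checks")

        # Strings whose presence in href/src attributes suggests a wiki mirror.
        hints = exclusions_db.get_mirror_hints(page)
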


src/earwigbot/wiki/copyvios/markov.py (+45 -31)

@@ -18,29 +18,44 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


__all__ = [
"DEFAULT_DEGREE",
"EMPTY",
"EMPTY_INTERSECTION",
"MarkovChain",
"MarkovChainIntersection",
]

import re import re
from collections.abc import Iterable
from enum import Enum


__all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", "MarkovChainIntersection"]
DEFAULT_DEGREE = 5




class MarkovChain:
"""Implements a basic ngram Markov chain of words."""

class Sentinel(Enum):
START = -1 START = -1
END = -2 END = -2


def __init__(self, text, degree=5):

RawChain = dict[tuple[str | Sentinel, ...], int]


class MarkovChain:
"""Implements a basic ngram Markov chain of words."""

def __init__(self, text: str, degree: int = DEFAULT_DEGREE) -> None:
self.text = text self.text = text
self.degree = degree # 2 for bigrams, 3 for trigrams, etc. self.degree = degree # 2 for bigrams, 3 for trigrams, etc.
self.chain = self._build() self.chain = self._build()
self.size = self._get_size() self.size = self._get_size()


def _build(self):
def _build(self) -> RawChain:
"""Build and return the Markov chain from the input text.""" """Build and return the Markov chain from the input text."""
padding = self.degree - 1 padding = self.degree - 1
words = re.sub(r"[^\w\s-]", "", self.text.lower(), flags=re.UNICODE).split()
words = ([self.START] * padding) + words + ([self.END] * padding)
chain = {}
words = re.sub(r"[^\w\s-]", "", self.text.lower()).split()
words = ([Sentinel.START] * padding) + words + ([Sentinel.END] * padding)
chain: RawChain = {}


for i in range(len(words) - self.degree + 1): for i in range(len(words) - self.degree + 1):
phrase = tuple(words[i : i + self.degree]) phrase = tuple(words[i : i + self.degree])
@@ -50,15 +65,15 @@ class MarkovChain:
chain[phrase] = 1 chain[phrase] = 1
return chain return chain


def _get_size(self):
def _get_size(self) -> int:
"""Return the size of the Markov chain: the total number of nodes.""" """Return the size of the Markov chain: the total number of nodes."""
return sum(self.chain.values()) return sum(self.chain.values())


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the MarkovChain.""" """Return the canonical string representation of the MarkovChain."""
return f"MarkovChain(text={self.text!r})" return f"MarkovChain(text={self.text!r})"


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the MarkovChain.""" """Return a nice string representation of the MarkovChain."""
return f"<MarkovChain of size {self.size}>" return f"<MarkovChain of size {self.size}>"


@@ -66,61 +81,60 @@ class MarkovChain:
class MarkovChainIntersection(MarkovChain): class MarkovChainIntersection(MarkovChain):
"""Implements the intersection of two chains (i.e., their shared nodes).""" """Implements the intersection of two chains (i.e., their shared nodes)."""


def __init__(self, mc1, mc2):
def __init__(self, mc1: MarkovChain, mc2: MarkovChain) -> None:
self.mc1, self.mc2 = mc1, mc2 self.mc1, self.mc2 = mc1, mc2
self.chain = self._build() self.chain = self._build()
self.size = self._get_size() self.size = self._get_size()


def _build(self):
def _build(self) -> RawChain:
"""Build and return the Markov chain from the input chains.""" """Build and return the Markov chain from the input chains."""
c1 = self.mc1.chain c1 = self.mc1.chain
c2 = self.mc2.chain c2 = self.mc2.chain
chain = {}
chain: RawChain = {}


for phrase in c1: for phrase in c1:
if phrase in c2: if phrase in c2:
chain[phrase] = min(c1[phrase], c2[phrase]) chain[phrase] = min(c1[phrase], c2[phrase])
return chain return chain


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the intersection.""" """Return the canonical string representation of the intersection."""
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})"
return res.format(self.mc1, self.mc2)
return f"MarkovChainIntersection(mc1={self.mc1!r}, mc2={self.mc2!r})"


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the intersection.""" """Return a nice string representation of the intersection."""
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>"
return res.format(self.size, self.mc1, self.mc2)
return (
f"<MarkovChainIntersection of size {self.size} ({self.mc1} ^ {self.mc2})>"
)




class MarkovChainUnion(MarkovChain): class MarkovChainUnion(MarkovChain):
"""Implemented the union of multiple chains.""" """Implemented the union of multiple chains."""


def __init__(self, chains):
def __init__(self, chains: Iterable[MarkovChain]) -> None:
self.chains = list(chains) self.chains = list(chains)
self.chain = self._build() self.chain = self._build()
self.size = self._get_size() self.size = self._get_size()


def _build(self):
def _build(self) -> RawChain:
"""Build and return the Markov chain from the input chains.""" """Build and return the Markov chain from the input chains."""
union = {}
union: RawChain = {}
for chain in self.chains: for chain in self.chains:
for phrase, count in chain.chain.iteritems():
for phrase, count in chain.chain.items():
if phrase in union: if phrase in union:
union[phrase] += count union[phrase] += count
else: else:
union[phrase] = count union[phrase] = count
return union return union


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the union.""" """Return the canonical string representation of the union."""
res = "MarkovChainUnion(chains={!r})"
return res.format(self.chains)
return f"MarkovChainUnion(chains={self.chains!r})"


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the union.""" """Return a nice string representation of the union."""
res = "<MarkovChainUnion of size {} ({})>"
return res.format(self.size, "| ".join(str(chain) for chain in self.chains))
chains = " | ".join(str(chain) for chain in self.chains)
return f"<MarkovChainUnion of size {self.size} ({chains})>"




EMPTY = MarkovChain("") EMPTY = MarkovChain("")
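
The confidence math itself lives in workers.py (also changed in this commit but not excerpted here), so the following is only a rough illustration of how the chain classes compose; the similarity ratio is illustrative, not the project's actual scoring formula.

    from earwigbot.wiki.copyvios.markov import (
        DEFAULT_DEGREE,
        MarkovChain,
        MarkovChainIntersection,
    )

    article = MarkovChain("The quick brown fox jumps over the lazy dog.", degree=DEFAULT_DEGREE)
    source = MarkovChain("A quick brown fox jumped over a lazy dog.", degree=DEFAULT_DEGREE)
    delta = MarkovChainIntersection(article, source)

    # Fraction of the article's n-gram nodes also present in the source chain;
    # a stand-in for the real confidence computed by the workers.
    similarity = delta.size / article.size if article.size else 0.0
    print(article, source, delta, round(similarity, 3))
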


src/earwigbot/wiki/copyvios/parsers.py (+143 -86)

@@ -18,44 +18,34 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from __future__ import annotations

__all__ = ["ArticleParser", "get_parser"]

import io import io
import json import json
import os.path import os.path
import re import re
import typing
import urllib.parse import urllib.parse
import urllib.request import urllib.request
from abc import ABC, abstractmethod
from collections.abc import Callable
from typing import Any, ClassVar, Literal, TypedDict


import mwparserfromhell import mwparserfromhell


from earwigbot.exceptions import ParserExclusionError, ParserRedirectError from earwigbot.exceptions import ParserExclusionError, ParserRedirectError


__all__ = ["ArticleTextParser", "get_parser"]


class _BaseTextParser:
"""Base class for a parser that handles text."""

TYPE = None

def __init__(self, text, url=None, args=None):
self.text = text
self.url = url
self._args = args or {}

def __repr__(self):
"""Return the canonical string representation of the text parser."""
return f"{self.__class__.__name__}(text={self.text!r})"
if typing.TYPE_CHECKING:
import bs4


def __str__(self):
"""Return a nice string representation of the text parser."""
name = self.__class__.__name__
return f"<{name} of text with size {len(self.text)}>"
from earwigbot.wiki.copyvios.workers import OpenedURL




class ArticleTextParser(_BaseTextParser):
class ArticleParser:
"""A parser that can strip and chunk wikicode article text.""" """A parser that can strip and chunk wikicode article text."""


TYPE = "Article"
TEMPLATE_MERGE_THRESHOLD = 35 TEMPLATE_MERGE_THRESHOLD = 35
NLTK_DEFAULT = "english" NLTK_DEFAULT = "english"
NLTK_LANGS = { NLTK_LANGS = {
@@ -78,7 +68,18 @@ class ArticleTextParser(_BaseTextParser):
"tr": "turkish", "tr": "turkish",
} }


def _merge_templates(self, code):
def __init__(self, text: str, lang: str, nltk_dir: str) -> None:
self.text = text
self._lang = lang
self._nltk_dir = nltk_dir

def __repr__(self) -> str:
return f"{self.__class__.__name__}(text={self.text!r})"

def __str__(self) -> str:
return f"<{self.__class__.__name__} of text with size {len(self.text)}>"

def _merge_templates(self, code: mwparserfromhell.wikicode.Wikicode) -> None:
"""Merge template contents in to wikicode when the values are long.""" """Merge template contents in to wikicode when the values are long."""
for template in code.filter_templates(recursive=code.RECURSE_OTHERS): for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
chunks = [] chunks = []
@@ -92,23 +93,25 @@ class ArticleTextParser(_BaseTextParser):
else: else:
code.remove(template) code.remove(template)


def _get_tokenizer(self):
def _get_tokenizer(self) -> Any:
"""Return a NLTK punctuation tokenizer for the article's language.""" """Return a NLTK punctuation tokenizer for the article's language."""
import nltk import nltk


def datafile(lang):
def datafile(lang: str) -> str:
return "file:" + os.path.join( return "file:" + os.path.join(
self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle"
self._nltk_dir, "tokenizers", "punkt", lang + ".pickle"
) )


lang = self.NLTK_LANGS.get(self._args.get("lang"), self.NLTK_DEFAULT)
lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT)
try: try:
nltk.data.load(datafile(self.NLTK_DEFAULT)) nltk.data.load(datafile(self.NLTK_DEFAULT))
except LookupError: except LookupError:
nltk.download("punkt", self._args["nltk_dir"])
nltk.download("punkt", self._nltk_dir)
return nltk.data.load(datafile(lang)) return nltk.data.load(datafile(lang))


def _get_sentences(self, min_query, max_query, split_thresh):
def _get_sentences(
self, min_query: int, max_query: int, split_thresh: int
) -> list[str]:
"""Split the article text into sentences of a certain length.""" """Split the article text into sentences of a certain length."""


def cut_sentence(words): def cut_sentence(words):
@@ -138,24 +141,27 @@ class ArticleTextParser(_BaseTextParser):
sentences.extend(cut_sentence(sentence.split())) sentences.extend(cut_sentence(sentence.split()))
return [sen for sen in sentences if len(sen) >= min_query] return [sen for sen in sentences if len(sen) >= min_query]


def strip(self):
"""Clean the page's raw text by removing templates and formatting.
def strip(self) -> str:
"""
Clean the page's raw text by removing templates and formatting.


Return the page's text with all HTML and wikicode formatting removed,
including templates, tables, and references. It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes), original capitalization, and so forth. HTML entities are
replaced by their unicode equivalents.
Return the page's text with all HTML and wikicode formatting removed, including
templates, tables, and references. It retains punctuation (spacing, paragraphs,
periods, commas, (semi)-colons, parentheses, quotes), original capitalization,
and so forth. HTML entities are replaced by their unicode equivalents.


The actual stripping is handled by :py:mod:`mwparserfromhell`. The actual stripping is handled by :py:mod:`mwparserfromhell`.
""" """


def remove(code, node):
"""Remove a node from a code object, ignoring ValueError.
def remove(
code: mwparserfromhell.wikicode.Wikicode, node: mwparserfromhell.nodes.Node
) -> None:
"""
Remove a node from a code object, ignoring ValueError.


Sometimes we will remove a node that contains another node we wish
to remove, and we fail when we try to remove the inner one. Easiest
solution is to just ignore the exception.
Sometimes we will remove a node that contains another node we wish to
remove, and we fail when we try to remove the inner one. Easiest solution
is to just ignore the exception.
""" """
try: try:
code.remove(node) code.remove(node)
@@ -181,26 +187,32 @@ class ArticleTextParser(_BaseTextParser):
self.clean = re.sub(r"\n\n+", "\n", clean).strip() self.clean = re.sub(r"\n\n+", "\n", clean).strip()
return self.clean return self.clean


def chunk(self, max_chunks, min_query=8, max_query=128, split_thresh=32):
"""Convert the clean article text into a list of web-searchable chunks.

No greater than *max_chunks* will be returned. Each chunk will only be
a sentence or two long at most (no more than *max_query*). The idea is
to return a sample of the article text rather than the whole, so we'll
pick and choose from parts of it, especially if the article is large
and *max_chunks* is low, so we don't end up just searching for just the
first paragraph.

This is implemented using :py:mod:`nltk` (https://nltk.org/). A base
directory (*nltk_dir*) is required to store nltk's punctuation
database, and should be passed as an argument to the constructor. It is
typically located in the bot's working directory.
def chunk(
self,
max_chunks: int,
min_query: int = 8,
max_query: int = 128,
split_thresh: int = 32,
) -> list[str]:
"""
Convert the clean article text into a list of web-searchable chunks.

No greater than *max_chunks* will be returned. Each chunk will only be a
sentence or two long at most (no more than *max_query*). The idea is to return
a sample of the article text rather than the whole, so we'll pick and choose
from parts of it, especially if the article is large and *max_chunks* is low,
so we don't end up just searching for just the first paragraph.

This is implemented using :py:mod:`nltk` (https://nltk.org/). A base directory
(*nltk_dir*) is required to store nltk's punctuation database, and should be
passed as an argument to the constructor. It is typically located in the bot's
working directory.
""" """
sentences = self._get_sentences(min_query, max_query, split_thresh) sentences = self._get_sentences(min_query, max_query, split_thresh)
if len(sentences) <= max_chunks: if len(sentences) <= max_chunks:
return sentences return sentences


chunks = []
chunks: list[str] = []
while len(chunks) < max_chunks: while len(chunks) < max_chunks:
if len(chunks) % 5 == 0: if len(chunks) % 5 == 0:
chunk = sentences.pop(0) # Pop from beginning chunk = sentences.pop(0) # Pop from beginning
@@ -216,7 +228,8 @@ class ArticleTextParser(_BaseTextParser):
return chunks return chunks


def get_links(self): def get_links(self):
"""Return a list of all external links in the article.
"""
Return a list of all external links in the article.


The list is restricted to things that we suspect we can parse: i.e., The list is restricted to things that we suspect we can parse: i.e.,
those with schemes of ``http`` and ``https``. those with schemes of ``http`` and ``https``.
@@ -226,14 +239,42 @@ class ArticleTextParser(_BaseTextParser):
return [str(link.url) for link in links if link.url.startswith(schemes)] return [str(link.url) for link in links if link.url.startswith(schemes)]




class _HTMLParser(_BaseTextParser):
class ParserArgs(TypedDict, total=False):
mirror_hints: list[str]
open_url: Callable[[str], OpenedURL | None]


class SourceParser(ABC):
"""Base class for a parser that handles text."""

TYPE: ClassVar[str]

def __init__(self, text: bytes, url: str, args: ParserArgs | None = None) -> None:
self.text = text
self.url = url
self._args = args or {}

def __repr__(self) -> str:
"""Return the canonical string representation of the text parser."""
return f"{self.__class__.__name__}(text={self.text!r})"

def __str__(self) -> str:
"""Return a nice string representation of the text parser."""
return f"<{self.__class__.__name__} of text with size {len(self.text)}>"

@abstractmethod
def parse(self) -> str: ...


class HTMLParser(SourceParser):
"""A parser that can extract the text from an HTML document.""" """A parser that can extract the text from an HTML document."""


TYPE = "HTML" TYPE = "HTML"
hidden_tags = ["script", "style"] hidden_tags = ["script", "style"]


def _fail_if_mirror(self, soup):
"""Look for obvious signs that the given soup is a wiki mirror.
def _fail_if_mirror(self, soup: bs4.BeautifulSoup) -> None:
"""
Look for obvious signs that the given soup is a wiki mirror.


If so, raise ParserExclusionError, which is caught in the workers and If so, raise ParserExclusionError, which is caught in the workers and
causes this source to excluded. causes this source to excluded.
@@ -242,13 +283,14 @@ class _HTMLParser(_BaseTextParser):
return return


def func(attr): def func(attr):
assert "mirror_hints" in self._args
return attr and any(hint in attr for hint in self._args["mirror_hints"]) return attr and any(hint in attr for hint in self._args["mirror_hints"])


if soup.find_all(href=func) or soup.find_all(src=func): if soup.find_all(href=func) or soup.find_all(src=func):
raise ParserExclusionError() raise ParserExclusionError()


@staticmethod @staticmethod
def _get_soup(text):
def _get_soup(text: bytes) -> bs4.BeautifulSoup:
"""Parse some text using BeautifulSoup.""" """Parse some text using BeautifulSoup."""
import bs4 import bs4


@@ -257,11 +299,11 @@ class _HTMLParser(_BaseTextParser):
except ValueError: except ValueError:
return bs4.BeautifulSoup(text) return bs4.BeautifulSoup(text)


def _clean_soup(self, soup):
def _clean_soup(self, soup: bs4.element.Tag) -> str:
"""Clean a BeautifulSoup tree of invisible tags.""" """Clean a BeautifulSoup tree of invisible tags."""
import bs4 import bs4


def is_comment(text):
def is_comment(text: bs4.element.Tag) -> bool:
return isinstance(text, bs4.element.Comment) return isinstance(text, bs4.element.Comment)


for comment in soup.find_all(text=is_comment): for comment in soup.find_all(text=is_comment):
@@ -272,7 +314,7 @@ class _HTMLParser(_BaseTextParser):


return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings)


def _open(self, url, **kwargs):
def _open(self, url: str, **kwargs: Any) -> bytes | None:
"""Try to read a URL. Return None if it couldn't be read.""" """Try to read a URL. Return None if it couldn't be read."""
opener = self._args.get("open_url") opener = self._args.get("open_url")
if not opener: if not opener:
@@ -280,13 +322,13 @@ class _HTMLParser(_BaseTextParser):
result = opener(url, **kwargs) result = opener(url, **kwargs)
return result.content if result else None return result.content if result else None


def _load_from_blogspot(self, url):
def _load_from_blogspot(self, url: urllib.parse.ParseResult) -> str:
"""Load dynamic content from Blogger Dynamic Views.""" """Load dynamic content from Blogger Dynamic Views."""
match = re.search(r"'postId': '(\d+)'", self.text)
match = re.search(rb"'postId': '(\d+)'", self.text)
if not match: if not match:
return "" return ""
post_id = match.group(1) post_id = match.group(1)
url = f"https://{url.netloc}/feeds/posts/default/{post_id}?"
feed_url = f"https://{url.netloc}/feeds/posts/default/{post_id}?"
params = { params = {
"alt": "json", "alt": "json",
"v": "2", "v": "2",
@@ -294,7 +336,7 @@ class _HTMLParser(_BaseTextParser):
"rewriteforssl": "true", "rewriteforssl": "true",
} }
raw = self._open( raw = self._open(
url + urllib.parse.urlencode(params),
feed_url + urllib.parse.urlencode(params),
allow_content_types=["application/json"], allow_content_types=["application/json"],
) )
if raw is None: if raw is None:
@@ -308,19 +350,24 @@ class _HTMLParser(_BaseTextParser):
except KeyError: except KeyError:
return "" return ""
soup = self._get_soup(text) soup = self._get_soup(text)
if not soup.body:
return ""
return self._clean_soup(soup.body) return self._clean_soup(soup.body)


def parse(self):
"""Return the actual text contained within an HTML document.
def parse(self) -> str:
"""
Return the actual text contained within an HTML document.


Implemented using :py:mod:`BeautifulSoup <bs4>` Implemented using :py:mod:`BeautifulSoup <bs4>`
(https://www.crummy.com/software/BeautifulSoup/).
(https://pypi.org/project/beautifulsoup4/).
""" """
import bs4

url = urllib.parse.urlparse(self.url) if self.url else None url = urllib.parse.urlparse(self.url) if self.url else None
soup = self._get_soup(self.text) soup = self._get_soup(self.text)
if not soup.body: if not soup.body:
# No <body> tag present in HTML ->
# no scrapable content (possibly JS or <iframe> magic):
# No <body> tag present in HTML -> # no scrapable content
# (possibly JS or <iframe> magic):
return "" return ""


self._fail_if_mirror(soup) self._fail_if_mirror(soup)
@@ -328,7 +375,7 @@ class _HTMLParser(_BaseTextParser):


if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"): if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
playback = body.find(id="playback") playback = body.find(id="playback")
if playback and "src" in playback.attrs:
if isinstance(playback, bs4.element.Tag) and "src" in playback.attrs:
raise ParserRedirectError(playback.attrs["src"]) raise ParserRedirectError(playback.attrs["src"])


content = self._clean_soup(body) content = self._clean_soup(body)
@@ -339,7 +386,7 @@ class _HTMLParser(_BaseTextParser):
return content return content




class _PDFParser(_BaseTextParser):
class PDFParser(SourceParser):
"""A parser that can extract text from a PDF file.""" """A parser that can extract text from a PDF file."""


TYPE = "PDF" TYPE = "PDF"
@@ -348,7 +395,7 @@ class _PDFParser(_BaseTextParser):
("\u2022", " "), ("\u2022", " "),
] ]


def parse(self):
def parse(self) -> str:
"""Return extracted text from the PDF.""" """Return extracted text from the PDF."""
from pdfminer import converter, pdfinterp, pdfpage from pdfminer import converter, pdfinterp, pdfpage


@@ -358,7 +405,7 @@ class _PDFParser(_BaseTextParser):
interp = pdfinterp.PDFPageInterpreter(manager, conv) interp = pdfinterp.PDFPageInterpreter(manager, conv)


try: try:
pages = pdfpage.PDFPage.get_pages(io.StringIO(self.text))
pages = pdfpage.PDFPage.get_pages(io.BytesIO(self.text))
for page in pages: for page in pages:
interp.process_page(page) interp.process_page(page)
except Exception: # pylint: disable=broad-except except Exception: # pylint: disable=broad-except
@@ -372,12 +419,12 @@ class _PDFParser(_BaseTextParser):
return re.sub(r"\n\n+", "\n", value).strip() return re.sub(r"\n\n+", "\n", value).strip()




class _PlainTextParser(_BaseTextParser):
class PlainTextParser(SourceParser):
"""A parser that can unicode-ify and strip text from a plain text page.""" """A parser that can unicode-ify and strip text from a plain text page."""


TYPE = "Text" TYPE = "Text"


def parse(self):
def parse(self) -> str:
"""Unicode-ify and strip whitespace from the plain text document.""" """Unicode-ify and strip whitespace from the plain text document."""
from bs4.dammit import UnicodeDammit from bs4.dammit import UnicodeDammit


@@ -385,15 +432,25 @@ class _PlainTextParser(_BaseTextParser):
return converted.strip() if converted else "" return converted.strip() if converted else ""




_CONTENT_TYPES = {
"text/html": _HTMLParser,
"application/xhtml+xml": _HTMLParser,
"application/pdf": _PDFParser,
"application/x-pdf": _PDFParser,
"text/plain": _PlainTextParser,
_CONTENT_TYPES: dict[str, type[SourceParser]] = {
"text/html": HTMLParser,
"application/xhtml+xml": HTMLParser,
"application/pdf": PDFParser,
"application/x-pdf": PDFParser,
"text/plain": PlainTextParser,
} }




def get_parser(content_type):
@typing.overload
def get_parser(content_type: str) -> type[SourceParser] | None: ...


@typing.overload
def get_parser(
content_type: Literal["text/plain"] = "text/plain",
) -> type[SourceParser]: ...


def get_parser(content_type: str = "text/plain") -> type[SourceParser] | None:
"""Return the parser most able to handle a given content type, or None.""" """Return the parser most able to handle a given content type, or None."""
return _CONTENT_TYPES.get(content_type) return _CONTENT_TYPES.get(content_type)
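
A sketch of the renamed parser layer, continuing the earlier setup: ArticleParser handles the wikitext side, while get_parser dispatches on content type to a SourceParser subclass. The HTML payload and the nltk_dir path are placeholders; in practice the workers supply the downloaded bytes and an open_url callback via ParserArgs.

    from earwigbot.wiki.copyvios.parsers import ArticleParser, get_parser

    # Wikitext side: strip formatting, then build web-searchable chunks.
    article = ArticleParser(page.get(), lang=site.lang, nltk_dir="nltk_data")
    clean_text = article.strip()
    chunks = article.chunk(max_chunks=5)

    # Source side: pick a parser class by content type and extract plain text.
    parser_cls = get_parser("text/html")
    if parser_cls is not None:
        html = b"<html><body><p>Some candidate source text.</p></body></html>"
        source_text = parser_cls(html, url="https://example.com/post").parse()
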

src/earwigbot/wiki/copyvios/result.py (+71 -57)

@@ -18,13 +18,26 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from __future__ import annotations

__all__ = ["CopyvioSource", "CopyvioCheckResult"]

import time
import typing
import urllib.parse import urllib.parse
from threading import Event from threading import Event
from time import time
from typing import Any


from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION
from earwigbot.wiki.copyvios.markov import (
EMPTY,
EMPTY_INTERSECTION,
MarkovChain,
MarkovChainIntersection,
)


__all__ = ["CopyvioSource", "CopyvioCheckResult"]
if typing.TYPE_CHECKING:
from earwigbot.wiki.copyvios.parsers import ParserArgs
from earwigbot.wiki.copyvios.workers import CopyvioWorkspace




class CopyvioSource: class CopyvioSource:
@@ -45,13 +58,13 @@ class CopyvioSource:


def __init__( def __init__(
self, self,
workspace,
url,
headers=None,
timeout=5,
parser_args=None,
search_config=None,
):
workspace: CopyvioWorkspace,
url: str,
headers: list[tuple[str, str]] | None = None,
timeout: float = 5,
parser_args: ParserArgs | None = None,
search_config: dict[str, Any] | None = None,
) -> None:
self.workspace = workspace self.workspace = workspace
self.url = url self.url = url
self.headers = headers self.headers = headers
@@ -68,54 +81,57 @@ class CopyvioSource:
self._event2 = Event() self._event2 = Event()
self._event2.set() self._event2.set()


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the source.""" """Return the canonical string representation of the source."""
res = (
"CopyvioSource(url={0!r}, confidence={1!r}, skipped={2!r}, "
"excluded={3!r})"
return (
f"CopyvioSource(url={self.url!r}, confidence={self.confidence!r}, "
f"skipped={self.skipped!r}, excluded={self.excluded!r})"
) )
return res.format(self.url, self.confidence, self.skipped, self.excluded)


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the source.""" """Return a nice string representation of the source."""
if self.excluded: if self.excluded:
return f"<CopyvioSource ({self.url}, excluded)>" return f"<CopyvioSource ({self.url}, excluded)>"
if self.skipped: if self.skipped:
return f"<CopyvioSource ({self.url}, skipped)>" return f"<CopyvioSource ({self.url}, skipped)>"
res = "<CopyvioSource ({0} with {1} conf)>"
return res.format(self.url, self.confidence)
return f"<CopyvioSource ({self.url} with {self.confidence} conf)>"


@property @property
def domain(self):
def domain(self) -> str | None:
"""The source URL's domain name, or None.""" """The source URL's domain name, or None."""
return urllib.parse.urlparse(self.url).netloc or None return urllib.parse.urlparse(self.url).netloc or None


def start_work(self):
def start_work(self) -> None:
"""Mark this source as being worked on right now.""" """Mark this source as being worked on right now."""
self._event2.clear() self._event2.clear()
self._event1.set() self._event1.set()


def update(self, confidence, source_chain, delta_chain):
def update(
self,
confidence: float,
source_chain: MarkovChain,
delta_chain: MarkovChainIntersection,
) -> None:
"""Fill out the confidence and chain information inside this source.""" """Fill out the confidence and chain information inside this source."""
self.confidence = confidence self.confidence = confidence
self.chains = (source_chain, delta_chain) self.chains = (source_chain, delta_chain)


def finish_work(self):
def finish_work(self) -> None:
"""Mark this source as finished.""" """Mark this source as finished."""
self._event2.set() self._event2.set()


def skip(self):
def skip(self) -> None:
"""Deactivate this source without filling in the relevant data.""" """Deactivate this source without filling in the relevant data."""
if self._event1.is_set(): if self._event1.is_set():
return return
self.skipped = True self.skipped = True
self._event1.set() self._event1.set()


def join(self, until):
def join(self, until: float | None = None) -> None:
"""Block until this violation result is filled out.""" """Block until this violation result is filled out."""
for event in [self._event1, self._event2]: for event in [self._event1, self._event2]:
if until:
timeout = until - time()
if until is not None:
timeout = until - time.time()
if timeout <= 0: if timeout <= 0:
return return
event.wait(timeout) event.wait(timeout)
@@ -144,16 +160,15 @@ class CopyvioCheckResult:


def __init__( def __init__(
self, self,
violation,
sources,
queries,
check_time,
article_chain,
possible_miss,
included_sources=None,
unified_confidence=None,
violation: bool,
sources: list[CopyvioSource],
queries: int,
check_time: float,
article_chain: MarkovChain,
possible_miss: bool,
included_sources: list[CopyvioSource] | None = None,
unified_confidence: float | None = None,
): ):
assert isinstance(sources, list)
self.violation = violation self.violation = violation
self.sources = sources self.sources = sources
self.queries = queries self.queries = queries
@@ -163,48 +178,47 @@ class CopyvioCheckResult:
self.included_sources = included_sources if included_sources else [] self.included_sources = included_sources if included_sources else []
self.unified_confidence = unified_confidence self.unified_confidence = unified_confidence


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the result.""" """Return the canonical string representation of the result."""
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})"
return res.format(self.violation, self.sources, self.queries, self.time)
return (
f"CopyvioCheckResult(violation={self.violation!r}, "
f"sources={self.sources!r}, queries={self.queries!r}, time={self.time!r})"
)


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the result.""" """Return a nice string representation of the result."""
res = "<CopyvioCheckResult ({0} with best {1})>"
return res.format(self.violation, self.best)
return f"<CopyvioCheckResult ({self.violation} with best {self.best})>"


@property @property
def best(self):
def best(self) -> CopyvioSource | None:
"""The best known source, or None if no sources exist.""" """The best known source, or None if no sources exist."""
return self.sources[0] if self.sources else None return self.sources[0] if self.sources else None


@property @property
def confidence(self):
def confidence(self) -> float:
"""The confidence of the best source, or 0 if no sources exist.""" """The confidence of the best source, or 0 if no sources exist."""
if self.unified_confidence is not None: if self.unified_confidence is not None:
return self.unified_confidence return self.unified_confidence
if self.best:
if self.best is not None:
return self.best.confidence return self.best.confidence
return 0.0 return 0.0


@property @property
def url(self):
def url(self) -> str | None:
"""The URL of the best source, or None if no sources exist.""" """The URL of the best source, or None if no sources exist."""
return self.best.url if self.best else None return self.best.url if self.best else None


def get_log_message(self, title):
def get_log_message(self, title: str) -> str:
"""Build a relevant log message for this copyvio check result.""" """Build a relevant log message for this copyvio check result."""
if not self.sources: if not self.sources:
log = "No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
return log.format(title, self.queries, self.time)
log = "{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)"
return (
f"No violation for [[{title}]] (no sources; {self.queries} queries; "
f"{self.time} seconds)"
)

is_vio = "Violation detected" if self.violation else "No violation" is_vio = "Violation detected" if self.violation else "No violation"
return log.format(
is_vio,
title,
self.url,
self.confidence,
len(self.sources),
self.queries,
self.time,
return (
f"{is_vio} for [[{title}]] (best: {self.url} ({self.confidence} "
f"confidence); {len(self.sources)} sources; {self.queries} queries; "
f"{self.time} seconds)"
) )
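
To round out the result API above, a small helper showing how a CopyvioCheckResult and its CopyvioSource entries are typically read; the function name is illustrative, not part of the library.

    from earwigbot.wiki.copyvios.result import CopyvioCheckResult

    def summarize(result: CopyvioCheckResult, title: str) -> None:
        # One-line summary in the same format the checker logs.
        print(result.get_log_message(title))
        if result.best is None:
            return
        print(f"best match: {result.url} at {result.confidence:.2%}")
        for source in result.sources:
            state = "excluded" if source.excluded else "skipped" if source.skipped else "checked"
            print(f"  {source.domain or source.url}: {source.confidence:.2%} ({state})")
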

src/earwigbot/wiki/copyvios/search.py (+93 -48)

@@ -18,91 +18,101 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import re
from gzip import GzipFile
from io import StringIO
from json import loads
from urllib.error import URLError
from urllib.parse import urlencode

from earwigbot.exceptions import SearchQueryError

__all__ = [ __all__ = [
"BingSearchEngine", "BingSearchEngine",
"GoogleSearchEngine", "GoogleSearchEngine",
"SearchEngine",
"YandexSearchEngine", "YandexSearchEngine",
"SEARCH_ENGINES",
"get_search_engine",
] ]


import base64
import gzip
import io
import json
import re
import urllib.parse
import urllib.request
from abc import ABC, abstractmethod
from typing import Any
from urllib.error import URLError


class _BaseSearchEngine:
from earwigbot import exceptions


class SearchEngine(ABC):
"""Base class for a simple search engine interface.""" """Base class for a simple search engine interface."""


name = "Base" name = "Base"


def __init__(self, cred, opener):
def __init__(
self, cred: dict[str, str], opener: urllib.request.OpenerDirector
) -> None:
"""Store credentials (*cred*) and *opener* for searching later on.""" """Store credentials (*cred*) and *opener* for searching later on."""
self.cred = cred self.cred = cred
self.opener = opener self.opener = opener
self.count = 5 self.count = 5


def __repr__(self):
def __repr__(self) -> str:
"""Return the canonical string representation of the search engine.""" """Return the canonical string representation of the search engine."""
return f"{self.__class__.__name__}()" return f"{self.__class__.__name__}()"


def __str__(self):
def __str__(self) -> str:
"""Return a nice string representation of the search engine.""" """Return a nice string representation of the search engine."""
return f"<{self.__class__.__name__}>" return f"<{self.__class__.__name__}>"


def _open(self, *args):
def _open(self, url: str) -> bytes:
"""Open a URL (like urlopen) and try to return its contents.""" """Open a URL (like urlopen) and try to return its contents."""
try: try:
response = self.opener.open(*args)
response = self.opener.open(url)
result = response.read() result = response.read()
except (OSError, URLError) as exc: except (OSError, URLError) as exc:
err = SearchQueryError(f"{self.name} Error: {exc}")
err.cause = exc
raise err
raise exceptions.SearchQueryError(f"{self.name} Error: {exc}")


if response.headers.get("Content-Encoding") == "gzip": if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(result)
gzipper = GzipFile(fileobj=stream)
stream = io.BytesIO(result)
gzipper = gzip.GzipFile(fileobj=stream)
result = gzipper.read() result = gzipper.read()


code = response.getcode() code = response.getcode()
if code != 200: if code != 200:
err = "{0} Error: got response code '{1}':\n{2}'"
raise SearchQueryError(err.format(self.name, code, result))
raise exceptions.SearchQueryError(
f"{self.name} Error: got response code '{code}':\n{result}'"
)


return result return result


@staticmethod @staticmethod
def requirements():
def requirements() -> list[str]:
"""Return a list of packages required by this search engine.""" """Return a list of packages required by this search engine."""
return [] return []


def search(self, query):
"""Use this engine to search for *query*.
@abstractmethod
def search(self, query: str) -> list[str]:
"""
Use this engine to search for *query*.


Not implemented in this base class; overridden in subclasses. Not implemented in this base class; overridden in subclasses.
""" """
raise NotImplementedError()
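For illustration, here is a minimal sketch of what a concrete subclass of this interface could look like, assuming the module imports shown above; the ``ExampleSearchEngine`` name, its endpoint, and its response shape are hypothetical and not part of this commit:

```python
# Hypothetical SearchEngine subclass; the endpoint and JSON shape are assumptions.
class ExampleSearchEngine(SearchEngine):
    """An illustrative search engine interface (not part of this commit)."""

    name = "Example"

    def search(self, query: str) -> list[str]:
        # Build the query string and fetch raw bytes via the shared opener.
        params = urllib.parse.urlencode({"q": query, "count": self.count})
        result = self._open("https://search.example.invalid/api?" + params)
        try:
            data = json.loads(result)
        except ValueError:
            raise exceptions.SearchQueryError(f"{self.name} Error: bad JSON")
        return [item["url"] for item in data.get("results", [])]
```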




class BingSearchEngine(_BaseSearchEngine):
class BingSearchEngine(SearchEngine):
"""A search engine interface with Bing Search (via Azure Marketplace).""" """A search engine interface with Bing Search (via Azure Marketplace)."""


name = "Bing" name = "Bing"


def __init__(self, cred, opener):
def __init__(
self, cred: dict[str, str], opener: urllib.request.OpenerDirector
) -> None:
super().__init__(cred, opener) super().__init__(cred, opener)


key = self.cred["key"] key = self.cred["key"]
auth = (key + ":" + key).encode("base64").replace("\n", "")
self.opener.addheaders.append(("Authorization", "Basic " + auth))
auth = base64.b64encode(f"{key}:{key}".encode()).decode()
self.opener.addheaders.append(("Authorization", f"Basic {auth}"))


def search(self, query: str) -> list[str]: def search(self, query: str) -> list[str]:
"""Do a Bing web search for *query*.
"""
Do a Bing web search for *query*.


Returns a list of URLs ranked by relevance (as determined by Bing). Returns a list of URLs ranked by relevance (as determined by Bing).
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
@@ -112,20 +122,19 @@ class BingSearchEngine(_BaseSearchEngine):
params = { params = {
"$format": "json", "$format": "json",
"$top": str(self.count), "$top": str(self.count),
"Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'",
"Query": "'\"" + query.replace('"', "") + "\"'",
"Market": "'en-US'", "Market": "'en-US'",
"Adult": "'Off'", "Adult": "'Off'",
"Options": "'DisableLocationDetection'", "Options": "'DisableLocationDetection'",
"WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'", "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'",
} }


result = self._open(url + urlencode(params))
result = self._open(url + urllib.parse.urlencode(params))


try: try:
res = loads(result)
res = json.loads(result)
except ValueError: except ValueError:
err = "Bing Error: JSON could not be decoded"
raise SearchQueryError(err)
raise exceptions.SearchQueryError("Bing Error: JSON could not be decoded")


try: try:
results = res["d"]["results"] results = res["d"]["results"]
@@ -134,13 +143,14 @@ class BingSearchEngine(_BaseSearchEngine):
return [result["Url"] for result in results] return [result["Url"] for result in results]




class GoogleSearchEngine(_BaseSearchEngine):
class GoogleSearchEngine(SearchEngine):
"""A search engine interface with Google Search.""" """A search engine interface with Google Search."""


name = "Google" name = "Google"


def search(self, query: str) -> list[str]: def search(self, query: str) -> list[str]:
"""Do a Google web search for *query*.
"""
Do a Google web search for *query*.


Returns a list of URLs ranked by relevance (as determined by Google). Returns a list of URLs ranked by relevance (as determined by Google).
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
@@ -157,13 +167,13 @@ class GoogleSearchEngine(_BaseSearchEngine):
"fields": "items(link)", "fields": "items(link)",
} }


result = self._open(url + urlencode(params))
result = self._open(url + urllib.parse.urlencode(params))


try: try:
res = loads(result)
res = json.loads(result)
except ValueError: except ValueError:
err = "Google Error: JSON could not be decoded" err = "Google Error: JSON could not be decoded"
raise SearchQueryError(err)
raise exceptions.SearchQueryError(err)


try: try:
return [item["link"] for item in res["items"]] return [item["link"] for item in res["items"]]
@@ -171,7 +181,7 @@ class GoogleSearchEngine(_BaseSearchEngine):
return [] return []




class YandexSearchEngine(_BaseSearchEngine):
class YandexSearchEngine(SearchEngine):
"""A search engine interface with Yandex Search.""" """A search engine interface with Yandex Search."""


name = "Yandex" name = "Yandex"
@@ -181,7 +191,8 @@ class YandexSearchEngine(_BaseSearchEngine):
return ["lxml.etree"] return ["lxml.etree"]


def search(self, query: str) -> list[str]: def search(self, query: str) -> list[str]:
"""Do a Yandex web search for *query*.
"""
Do a Yandex web search for *query*.


Returns a list of URLs ranked by relevance (as determined by Yandex). Returns a list of URLs ranked by relevance (as determined by Yandex).
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
@@ -201,17 +212,51 @@ class YandexSearchEngine(_BaseSearchEngine):
"groupby": f"mode=flat.groups-on-page={self.count}", "groupby": f"mode=flat.groups-on-page={self.count}",
} }


result = self._open(url + urlencode(params))
result = self._open(url + urllib.parse.urlencode(params))


try: try:
data = lxml.etree.fromstring(result) # type: ignore
data = lxml.etree.fromstring(result)
return [elem.text for elem in data.xpath(".//url")] return [elem.text for elem in data.xpath(".//url")]
except lxml.etree.Error as exc: except lxml.etree.Error as exc:
raise SearchQueryError("Yandex XML parse error: " + str(exc))
raise exceptions.SearchQueryError(f"Yandex XML parse error: {exc}")




SEARCH_ENGINES = {
SEARCH_ENGINES: dict[str, type[SearchEngine]] = {
"Bing": BingSearchEngine, "Bing": BingSearchEngine,
"Google": GoogleSearchEngine, "Google": GoogleSearchEngine,
"Yandex": YandexSearchEngine, "Yandex": YandexSearchEngine,
} }


def get_search_engine(
search_config: dict[str, Any], headers: list[tuple[str, str]]
) -> SearchEngine:
"""Return a function that can be called to do web searches.

The function takes one argument, a search query, and returns a list of URLs, ranked
by importance. The underlying logic depends on the *engine* argument within our
config; for example, if *engine* is "Yahoo! BOSS", we'll use YahooBOSSSearchEngine
for querying.

Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to
us, and UnsupportedSearchEngineError if we are missing a required package or
module, like oauth2 for "Yahoo! BOSS".
"""
engine = search_config["engine"]
if engine not in SEARCH_ENGINES:
raise exceptions.UnknownSearchEngineError(engine)

klass = SEARCH_ENGINES[engine]
credentials = search_config["credentials"]
opener = urllib.request.build_opener()
opener.addheaders = headers

for dep in klass.requirements():
try:
__import__(dep).__name__
except (ModuleNotFoundError, AttributeError):
e = "Missing a required dependency ({}) for the {} engine"
e = e.format(dep, engine)
raise exceptions.UnsupportedSearchEngineError(e)

return klass(credentials, opener)
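A hedged usage sketch for the factory above; the config values and credential keys are placeholders, since the real keys are engine-specific:

```python
# Minimal usage sketch; values are placeholders, credential keys are engine-specific.
search_config = {
    "engine": "Google",
    "credentials": {"key": "<api-key>"},  # placeholder
}
headers = [("User-Agent", "EarwigBot/0.4")]

engine = get_search_engine(search_config, headers)
urls = engine.search("some distinctive phrase from the article")  # ranked list[str]
```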

+ 155
- 125
src/earwigbot/wiki/copyvios/workers.py View File

@@ -18,59 +18,61 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


from __future__ import annotations

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

import base64 import base64
import collections import collections
import dataclasses
import functools import functools
import gzip
import io
import logging
import math
import queue
import struct
import threading
import time import time
import urllib.parse import urllib.parse
from collections import deque
from gzip import GzipFile
import urllib.request
from collections.abc import Callable, Container
from dataclasses import dataclass
from http.client import HTTPException from http.client import HTTPException
from io import StringIO
from logging import getLogger
from math import log
from queue import Empty, Queue
from struct import error as struct_error
from threading import Lock, Thread
from typing import Any
from urllib.error import URLError from urllib.error import URLError
from urllib.request import Request, build_opener


from earwigbot import importer
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
from earwigbot.wiki.copyvios.markov import ( from earwigbot.wiki.copyvios.markov import (
DEFAULT_DEGREE,
MarkovChain, MarkovChain,
MarkovChainIntersection, MarkovChainIntersection,
MarkovChainUnion, MarkovChainUnion,
) )
from earwigbot.wiki.copyvios.parsers import get_parser
from earwigbot.wiki.copyvios.parsers import ParserArgs, SourceParser, get_parser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource


tldextract = importer.new("tldextract")

__all__ = ["globalize", "localize", "CopyvioWorkspace"]

INCLUDE_THRESHOLD = 0.15 INCLUDE_THRESHOLD = 0.15


_MAX_REDIRECTS = 3 _MAX_REDIRECTS = 3
_MAX_RAW_SIZE = 20 * 1024**2 _MAX_RAW_SIZE = 20 * 1024**2


_is_globalized = False _is_globalized = False
_global_queues = None
_global_workers = []
_global_queues: _CopyvioQueues | None = None
_global_workers: list[_CopyvioWorker] = []


_OpenedURL = collections.namedtuple("_OpenedURL", ["content", "parser_class"])


def globalize(num_workers: int = 8) -> None:
"""
Cause all copyvio checks to be done by one global set of workers.


def globalize(num_workers=8):
"""Cause all copyvio checks to be done by one global set of workers.

This is useful when checks are being done through a web interface where
large numbers of simultaneous requests could be problematic. The global
workers are spawned when the function is called, run continuously, and
intelligently handle multiple checks.
This is useful when checks are being done through a web interface where large
numbers of simultaneous requests could be problematic. The global workers are
spawned when the function is called, run continuously, and intelligently handle
multiple checks.


This function is not thread-safe and should only be called when no checks
are being done. It has no effect if it has already been called.
This function is not thread-safe and should only be called when no checks are being
done. It has no effect if it has already been called.
""" """
global _is_globalized, _global_queues global _is_globalized, _global_queues
if _is_globalized: if _is_globalized:
@@ -84,19 +86,20 @@ def globalize(num_workers=8):
_is_globalized = True _is_globalized = True




def localize():
def localize() -> None:
"""Return to using page-specific workers for copyvio checks. """Return to using page-specific workers for copyvio checks.


This disables changes made by :func:`globalize`, including stopping the
global worker threads.
This disables changes made by :func:`globalize`, including stopping the global
worker threads.


This function is not thread-safe and should only be called when no checks
are being done.
This function is not thread-safe and should only be called when no checks are
being done.
""" """
global _is_globalized, _global_queues, _global_workers global _is_globalized, _global_queues, _global_workers
if not _is_globalized: if not _is_globalized:
return return


assert _global_queues is not None
for i in range(len(_global_workers)): for i in range(len(_global_workers)):
_global_queues.unassigned.put((StopIteration, None)) _global_queues.unassigned.put((StopIteration, None))
_global_queues = None _global_queues = None
@@ -104,30 +107,50 @@ def localize():
_is_globalized = False _is_globalized = False
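A short sketch of the intended lifecycle, assuming a long-running web service that calls `Page.copyvio_check()` from its request handlers:

```python
# Illustrative lifecycle: spawn shared workers once, serve checks, then tear down.
from earwigbot.wiki.copyvios import globalize, localize

globalize(num_workers=8)  # all subsequent checks share this worker pool
try:
    # ... handle requests, each calling page.copyvio_check(...) ...
    pass
finally:
    localize()  # stop the global workers; checks revert to per-check workers
```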




@dataclass(frozen=True)
class OpenedURL:
content: bytes
parser_class: type[SourceParser]


SourceQueue = collections.deque[CopyvioSource]
UnassignedQueue = queue.Queue[
tuple[str, SourceQueue] | tuple[type[StopIteration], None]
]


@dataclass(frozen=True)
class _CopyvioQueues: class _CopyvioQueues:
"""Stores data necessary to maintain the various queues during a check.""" """Stores data necessary to maintain the various queues during a check."""


def __init__(self):
self.lock = Lock()
self.sites = {}
self.unassigned = Queue()
lock: threading.Lock = dataclasses.field(default_factory=threading.Lock)
sites: dict[str, SourceQueue] = dataclasses.field(default_factory=dict)
unassigned: UnassignedQueue = dataclasses.field(default_factory=queue.Queue)




class _CopyvioWorker: class _CopyvioWorker:
"""A multithreaded URL opener/parser instance.""" """A multithreaded URL opener/parser instance."""


def __init__(self, name, queues, until=None):
def __init__(
self, name: str, queues: _CopyvioQueues, until: float | None = None
) -> None:
self._name = name self._name = name
self._queues = queues self._queues = queues
self._until = until self._until = until


self._site = None
self._queue = None
self._search_config = None
self._opener = build_opener()
self._logger = getLogger("earwigbot.wiki.cvworker." + name)
self._site: str | None = None
self._queue: SourceQueue | None = None
self._search_config: dict[str, Any] | None = None
self._opener = urllib.request.build_opener()
self._logger = logging.getLogger("earwigbot.wiki.cvworker." + name)


def _try_map_proxy_url(self, url, parsed, extra_headers, is_error=False):
def _try_map_proxy_url(
self,
url: str,
parsed: urllib.parse.ParseResult,
extra_headers: dict[str, str],
is_error: bool = False,
) -> tuple[str, bool]:
if not self._search_config or "proxies" not in self._search_config: if not self._search_config or "proxies" not in self._search_config:
return url, False return url, False
for proxy_info in self._search_config["proxies"]: for proxy_info in self._search_config["proxies"]:
@@ -152,17 +175,20 @@ class _CopyvioWorker:
return url, True return url, True
return url, False return url, False


def _open_url_raw(self, url, timeout=5, allow_content_types=None):
def _open_url_raw(
self,
url: str,
timeout: float = 5,
allow_content_types: Container[str] | None = None,
) -> OpenedURL | None:
"""Open a URL, without parsing it. """Open a URL, without parsing it.


None will be returned for URLs that cannot be read for whatever reason. None will be returned for URLs that cannot be read for whatever reason.
""" """
parsed = urllib.parse.urlparse(url) parsed = urllib.parse.urlparse(url)
if not isinstance(url, str):
url = url.encode("utf8")
extra_headers = {}
extra_headers: dict[str, str] = {}
url, _ = self._try_map_proxy_url(url, parsed, extra_headers) url, _ = self._try_map_proxy_url(url, parsed, extra_headers)
request = Request(url, headers=extra_headers)
request = urllib.request.Request(url, headers=extra_headers)
try: try:
response = self._opener.open(request, timeout=timeout) response = self._opener.open(request, timeout=timeout)
except (OSError, URLError, HTTPException, ValueError): except (OSError, URLError, HTTPException, ValueError):
@@ -170,14 +196,14 @@ class _CopyvioWorker:
url, parsed, extra_headers, is_error=True url, parsed, extra_headers, is_error=True
) )
if not remapped: if not remapped:
self._logger.exception("Failed to fetch URL: %s", url)
self._logger.exception(f"Failed to fetch URL: {url}")
return None return None
self._logger.info("Failed to fetch URL, trying proxy remap: %s", url)
request = Request(url, headers=extra_headers)
self._logger.info(f"Failed to fetch URL, trying proxy remap: {url}")
request = urllib.request.Request(url, headers=extra_headers)
try: try:
response = self._opener.open(request, timeout=timeout) response = self._opener.open(request, timeout=timeout)
except (OSError, URLError, HTTPException, ValueError): except (OSError, URLError, HTTPException, ValueError):
self._logger.exception("Failed to fetch URL after proxy remap: %s", url)
self._logger.exception(f"Failed to fetch URL after proxy remap: {url}")
return None return None


try: try:
@@ -193,7 +219,7 @@ class _CopyvioWorker:
): ):
return None return None
if not parser_class: if not parser_class:
parser_class = get_parser("text/plain")
parser_class = get_parser()
if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024**2: if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024**2:
return None return None


@@ -207,28 +233,27 @@ class _CopyvioWorker:
return None return None


if response.headers.get("Content-Encoding") == "gzip": if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(content)
gzipper = GzipFile(fileobj=stream)
stream = io.BytesIO(content)
gzipper = gzip.GzipFile(fileobj=stream)
try: try:
content = gzipper.read() content = gzipper.read()
except (OSError, struct_error):
except (OSError, struct.error):
return None return None


if len(content) > _MAX_RAW_SIZE: if len(content) > _MAX_RAW_SIZE:
return None return None
return _OpenedURL(content, parser_class)
return OpenedURL(content, parser_class)


def _open_url(self, source, redirects=0):
def _open_url(self, source: CopyvioSource, redirects: int = 0) -> str | None:
"""Open a URL and return its parsed content, or None. """Open a URL and return its parsed content, or None.


First, we will decompress the content if the headers contain "gzip" as
its content encoding. Then, we will return the content stripped using
an HTML parser if the headers indicate it is HTML, or return the
content directly if it is plain text. If we don't understand the
content type, we'll return None.
First, we will decompress the content if the headers contain "gzip" as its
content encoding. Then, we will return the content stripped using an HTML
parser if the headers indicate it is HTML, or return the content directly if it
is plain text. If we don't understand the content type, we'll return None.


If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
If a URLError was raised while opening the URL or an IOError was raised while
decompressing, None will be returned.
""" """
self._search_config = source.search_config self._search_config = source.search_config
if source.headers: if source.headers:
@@ -238,9 +263,9 @@ class _CopyvioWorker:
if result is None: if result is None:
return None return None


args = source.parser_args.copy() if source.parser_args else {}
args: ParserArgs = source.parser_args.copy() if source.parser_args else {}
args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
parser = result.parser_class(result.content, url=source.url, args=args)
parser = result.parser_class(result.content, source.url, args=args)
try: try:
return parser.parse() return parser.parse()
except ParserRedirectError as exc: except ParserRedirectError as exc:
@@ -249,30 +274,31 @@ class _CopyvioWorker:
source.url = exc.url.decode("utf8") source.url = exc.url.decode("utf8")
return self._open_url(source, redirects=redirects + 1) return self._open_url(source, redirects=redirects + 1)


def _acquire_new_site(self):
def _acquire_new_site(self) -> None:
"""Block for a new unassigned site queue.""" """Block for a new unassigned site queue."""
if self._until: if self._until:
timeout = self._until - time.time() timeout = self._until - time.time()
if timeout <= 0: if timeout <= 0:
raise Empty
raise queue.Empty()
else: else:
timeout = None timeout = None


self._logger.debug("Waiting for new site queue") self._logger.debug("Waiting for new site queue")
site, queue = self._queues.unassigned.get(timeout=timeout)
if site is StopIteration:
site, q = self._queues.unassigned.get(timeout=timeout)
if isinstance(site, type) and issubclass(site, StopIteration):
raise StopIteration raise StopIteration
self._logger.debug(f"Acquired new site queue: {site}") self._logger.debug(f"Acquired new site queue: {site}")
self._site = site self._site = site
self._queue = queue
self._queue = q


def _dequeue(self):
def _dequeue(self) -> CopyvioSource:
"""Remove a source from one of the queues.""" """Remove a source from one of the queues."""
if not self._site: if not self._site:
self._acquire_new_site() self._acquire_new_site()
assert self._site is not None
assert self._queue is not None


logmsg = "Fetching source URL from queue {0}"
self._logger.debug(logmsg.format(self._site))
self._logger.debug(f"Fetching source URL from queue {self._site}")
self._queues.lock.acquire() self._queues.lock.acquire()
try: try:
source = self._queue.popleft() source = self._queue.popleft()
@@ -294,11 +320,11 @@ class _CopyvioWorker:
self._queues.lock.release() self._queues.lock.release()
return source return source


def _handle_once(self):
"""Handle a single source from one of the queues."""
def _handle_once(self) -> bool:
"""Handle a single source from one of the queues. Return if we should exit."""
try: try:
source = self._dequeue() source = self._dequeue()
except Empty:
except queue.Empty:
self._logger.debug("Exiting: queue timed out") self._logger.debug("Exiting: queue timed out")
return False return False
except StopIteration: except StopIteration:
@@ -320,12 +346,11 @@ class _CopyvioWorker:
source.workspace.compare(source, chain) source.workspace.compare(source, chain)
return True return True


def _run(self):
def _run(self) -> None:
"""Main entry point for the worker thread. """Main entry point for the worker thread.


We will keep fetching URLs from the queues and handling them until
either we run out of time, or we get an exit signal that the queue is
now empty.
We will keep fetching URLs from the queues and handling them until either we
run out of time, or we get an exit signal that the queue is now empty.
""" """
while True: while True:
try: try:
@@ -335,9 +360,9 @@ class _CopyvioWorker:
self._logger.exception("Uncaught exception in worker") self._logger.exception("Uncaught exception in worker")
time.sleep(5) # Delay if we get stuck in a busy loop time.sleep(5) # Delay if we get stuck in a busy loop


def start(self):
def start(self) -> None:
"""Start the copyvio worker in a new thread.""" """Start the copyvio worker in a new thread."""
thread = Thread(target=self._run, name="cvworker-" + self._name)
thread = threading.Thread(target=self._run, name="cvworker-" + self._name)
thread.daemon = True thread.daemon = True
thread.start() thread.start()


@@ -347,20 +372,20 @@ class CopyvioWorkspace:


def __init__( def __init__(
self, self,
article,
min_confidence,
max_time,
logger,
headers,
url_timeout=5,
num_workers=8,
short_circuit=True,
parser_args=None,
exclude_check=None,
config=None,
degree=5,
):
self.sources = []
article: MarkovChain,
min_confidence: float,
max_time: float,
logger: logging.Logger,
headers: list[tuple[str, str]],
url_timeout: float = 5,
num_workers: int = 8,
short_circuit: bool = True,
parser_args: ParserArgs | None = None,
exclusion_callback: Callable[[str], bool] | None = None,
config: dict[str, Any] | None = None,
degree: int = DEFAULT_DEGREE,
) -> None:
self.sources: list[CopyvioSource] = []
self.finished = False self.finished = False
self.possible_miss = False self.possible_miss = False


@@ -369,8 +394,8 @@ class CopyvioWorkspace:
self._min_confidence = min_confidence self._min_confidence = min_confidence
self._start_time = time.time() self._start_time = time.time()
self._until = (self._start_time + max_time) if max_time > 0 else None self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = set()
self._finish_lock = Lock()
self._handled_urls: set[str] = set()
self._finish_lock = threading.Lock()
self._short_circuit = short_circuit self._short_circuit = short_circuit
self._source_args = { self._source_args = {
"workspace": self, "workspace": self,
@@ -379,10 +404,11 @@ class CopyvioWorkspace:
"parser_args": parser_args, "parser_args": parser_args,
"search_config": config, "search_config": config,
} }
self._exclude_check = exclude_check
self._exclusion_callback = exclusion_callback
self._degree = degree self._degree = degree


if _is_globalized: if _is_globalized:
assert _global_queues is not None
self._queues = _global_queues self._queues = _global_queues
else: else:
self._queues = _CopyvioQueues() self._queues = _CopyvioQueues()
@@ -391,28 +417,27 @@ class CopyvioWorkspace:
name = f"local-{id(self) % 10000:04}.{i}" name = f"local-{id(self) % 10000:04}.{i}"
_CopyvioWorker(name, self._queues, self._until).start() _CopyvioWorker(name, self._queues, self._until).start()


def _calculate_confidence(self, delta):
def _calculate_confidence(self, delta: MarkovChainIntersection) -> float:
"""Return the confidence of a violation as a float between 0 and 1.""" """Return the confidence of a violation as a float between 0 and 1."""


def conf_with_article_and_delta(article, delta):
def conf_with_article_and_delta(article: float, delta: float) -> float:
"""Calculate confidence using the article and delta chain sizes.""" """Calculate confidence using the article and delta chain sizes."""
# This piecewise function exhibits exponential growth until it
# reaches the default "suspect" confidence threshold, at which
# point it transitions to polynomial growth with a limit of 1 as
# (delta / article) approaches 1.
# This piecewise function exhibits exponential growth until it reaches the
# default "suspect" confidence threshold, at which point it transitions to
# polynomial growth with a limit of 1 as (delta / article) approaches 1.
# A graph can be viewed here: https://goo.gl/mKPhvr # A graph can be viewed here: https://goo.gl/mKPhvr
ratio = delta / article ratio = delta / article
if ratio <= 0.52763: if ratio <= 0.52763:
return -log(1 - ratio)
return -math.log(1 - ratio)
else: else:
return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009 return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009


def conf_with_delta(delta):
def conf_with_delta(delta: float) -> float:
"""Calculate confidence using just the delta chain size.""" """Calculate confidence using just the delta chain size."""
# This piecewise function was derived from experimental data using # This piecewise function was derived from experimental data using
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9),
# and (1000, 0.95), with a limit of 1 as delta approaches infinity.
# A graph can be viewed here: https://goo.gl/lVl7or
# reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9), and
# (1000, 0.95), with a limit of 1 as delta approaches infinity. A graph can
# be viewed here: https://goo.gl/lVl7or
if delta <= 100: if delta <= 100:
return delta / (delta + 100) return delta / (delta + 100)
elif delta <= 250: elif delta <= 250:
@@ -430,7 +455,7 @@ class CopyvioWorkspace:
) )
) )
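For reference, the exponential-to-polynomial piece visible above can be reproduced standalone; this is a sketch of the same formula (the remaining pieces of `conf_with_delta` are elided by this hunk), not new behavior:

```python
import math

def conf_with_article_and_delta(article: float, delta: float) -> float:
    # Exponential growth until ratio ~= 0.52763, then polynomial growth
    # approaching 1 as (delta / article) approaches 1.
    ratio = delta / article
    if ratio <= 0.52763:
        return -math.log(1 - ratio)
    return (-0.8939 * (ratio**2)) + (1.8948 * ratio) - 0.0009

# e.g. an article chain of 1000 n-grams sharing 600 with a source:
conf_with_article_and_delta(1000, 600)  # ~= 0.814
```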


def _finish_early(self):
def _finish_early(self) -> None:
"""Finish handling links prematurely (if we've hit min_confidence).""" """Finish handling links prematurely (if we've hit min_confidence)."""
self._logger.debug("Confidence threshold met; skipping remaining sources") self._logger.debug("Confidence threshold met; skipping remaining sources")
with self._queues.lock: with self._queues.lock:
@@ -438,7 +463,7 @@ class CopyvioWorkspace:
source.skip() source.skip()
self.finished = True self.finished = True


def enqueue(self, urls):
def enqueue(self, urls: list[str]) -> None:
"""Put a list of URLs into the various worker queues.""" """Put a list of URLs into the various worker queues."""
for url in urls: for url in urls:
with self._queues.lock: with self._queues.lock:
@@ -449,7 +474,7 @@ class CopyvioWorkspace:
source = CopyvioSource(url=url, **self._source_args) source = CopyvioSource(url=url, **self._source_args)
self.sources.append(source) self.sources.append(source)


if self._exclude_check and self._exclude_check(url):
if self._exclusion_callback and self._exclusion_callback(url):
self._logger.debug(f"enqueue(): exclude {url}") self._logger.debug(f"enqueue(): exclude {url}")
source.excluded = True source.excluded = True
source.skip() source.skip()
@@ -460,32 +485,37 @@ class CopyvioWorkspace:
continue continue


try: try:
import tldextract

key = tldextract.extract(url).registered_domain key = tldextract.extract(url).registered_domain
except ImportError: # Fall back on very naive method
except ModuleNotFoundError: # Fall back on very naive method
from urllib.parse import urlparse from urllib.parse import urlparse


key = ".".join(urlparse(url).netloc.split(".")[-2:]) key = ".".join(urlparse(url).netloc.split(".")[-2:])


logmsg = "enqueue(): {0} {1} -> {2}"
logmsg = f"enqueue(): %s {key} -> {url}"
if key in self._queues.sites: if key in self._queues.sites:
self._logger.debug(logmsg.format("append", key, url))
self._logger.debug(logmsg % "append")
self._queues.sites[key].append(source) self._queues.sites[key].append(source)
else: else:
self._logger.debug(logmsg.format("new", key, url))
self._queues.sites[key] = queue = deque()
queue.append(source)
self._queues.unassigned.put((key, queue))
self._logger.debug(logmsg % "new")
q: SourceQueue = collections.deque()
q.append(source)
self._queues.sites[key] = q
self._queues.unassigned.put((key, q))
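A sketch of the per-site keying used above, showing both the tldextract path and the naive fallback:

```python
# Sketch of the queue-key derivation; mirrors the logic above.
def site_key(url: str) -> str:
    try:
        import tldextract
        return tldextract.extract(url).registered_domain  # e.g. "example.co.uk"
    except ModuleNotFoundError:
        from urllib.parse import urlparse
        return ".".join(urlparse(url).netloc.split(".")[-2:])  # naive: last two labels

# site_key("https://news.example.co.uk/story") -> "example.co.uk" with tldextract,
# but "co.uk" with the naive fallback.
```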


def compare(self, source, source_chain):
def compare(self, source: CopyvioSource, source_chain: MarkovChain | None) -> None:
"""Compare a source to the article; call _finish_early if necessary.""" """Compare a source to the article; call _finish_early if necessary."""
if source_chain: if source_chain:
delta = MarkovChainIntersection(self._article, source_chain) delta = MarkovChainIntersection(self._article, source_chain)
conf = self._calculate_confidence(delta) conf = self._calculate_confidence(delta)
else: else:
delta = None
conf = 0.0 conf = 0.0
self._logger.debug(f"compare(): {source.url} -> {conf}") self._logger.debug(f"compare(): {source.url} -> {conf}")
with self._finish_lock: with self._finish_lock:
if source_chain: if source_chain:
assert delta is not None
source.update(conf, source_chain, delta) source.update(conf, source_chain, delta)
source.finish_work() source.finish_work()
if not self.finished and conf >= self._min_confidence: if not self.finished and conf >= self._min_confidence:
@@ -494,7 +524,7 @@ class CopyvioWorkspace:
else: else:
self.finished = True self.finished = True


def wait(self):
def wait(self) -> None:
"""Wait for the workers to finish handling the sources.""" """Wait for the workers to finish handling the sources."""
self._logger.debug(f"Waiting on {len(self.sources)} sources") self._logger.debug(f"Waiting on {len(self.sources)} sources")
for source in self.sources: for source in self.sources:
@@ -505,7 +535,7 @@ class CopyvioWorkspace:
for i in range(self._num_workers): for i in range(self._num_workers):
self._queues.unassigned.put((StopIteration, None)) self._queues.unassigned.put((StopIteration, None))


def get_result(self, num_queries=0):
def get_result(self, num_queries: int = 0) -> CopyvioCheckResult:
"""Return a CopyvioCheckResult containing the results of this check.""" """Return a CopyvioCheckResult containing the results of this check."""
self.sources.sort( self.sources.sort(
key=lambda s: ( key=lambda s: (


+ 107
- 3
src/earwigbot/wiki/page.py View File

@@ -35,14 +35,14 @@ import mwparserfromhell


from earwigbot import exceptions from earwigbot import exceptions
from earwigbot.exceptions import APIError from earwigbot.exceptions import APIError
from earwigbot.wiki.copyvios import CopyvioMixIn
from earwigbot.wiki.copyvios import DEFAULT_DEGREE, CopyvioChecker, CopyvioCheckResult


if typing.TYPE_CHECKING: if typing.TYPE_CHECKING:
from earwigbot.wiki.site import Site from earwigbot.wiki.site import Site
from earwigbot.wiki.user import User from earwigbot.wiki.user import User




class Page(CopyvioMixIn):
class Page:
""" """
**EarwigBot: Wiki Toolset: Page** **EarwigBot: Wiki Toolset: Page**


@@ -110,7 +110,6 @@ class Page(CopyvioMixIn):
__init__() will not do any API queries, but it will use basic namespace logic __init__() will not do any API queries, but it will use basic namespace logic
to determine our namespace ID and if we are a talkpage. to determine our namespace ID and if we are a talkpage.
""" """
super().__init__(site)
self._site = site self._site = site
self._title = title.strip() self._title = title.strip()
self._follow_redirects = self._keep_following = follow_redirects self._follow_redirects = self._keep_following = follow_redirects
@@ -873,3 +872,108 @@ class Page(CopyvioMixIn):
return False return False


return True return True

def copyvio_check(
self,
min_confidence: float = 0.75,
max_queries: int = 15,
max_time: float = -1,
no_searches: bool = False,
no_links: bool = False,
short_circuit: bool = True,
degree: int = DEFAULT_DEGREE,
) -> CopyvioCheckResult:
"""
Check the page for copyright violations.

Returns a :class:`.CopyvioCheckResult` object with information on the results
of the check.

*min_confidence* is the minimum amount of confidence we must have in the
similarity between a source text and the article in order for us to consider it
a suspected violation. This is a number between 0 and 1.

*max_queries* is self-explanatory; we will never make more than this number of
queries in a given check.

*max_time* can be set to prevent copyvio checks from taking longer than a set
amount of time (generally around a minute), which can be useful if checks are
called through a web server with timeouts. We will stop checking new URLs as
soon as this limit is reached.

Setting *no_searches* to ``True`` will cause only URLs in the wikitext of the
page to be checked; no search engine queries will be made. Setting *no_links*
to ``True`` does the opposite: URLs in the wikitext will be ignored and only
search engine queries will be made. Setting both to ``True`` is pointless.

Normally, the checker short-circuits once it finds a URL that meets
*min_confidence*, skipping any remaining URLs and web queries; setting
*short_circuit* to ``False`` prevents this.

The *degree* controls the n-gram word size used in comparing similarity. It
should usually be a number between 3 and 5.

Raises :exc:`.CopyvioCheckError` or subclasses
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on errors.
"""
self._logger.info(f"Starting copyvio check for [[{self.title}]]")
checker = CopyvioChecker(
self,
min_confidence=min_confidence,
max_time=max_time,
degree=degree,
logger=self._logger,
)

result = checker.run_check(
max_queries=max_queries,
no_searches=no_searches,
no_links=no_links,
short_circuit=short_circuit,
)
self._logger.info(result.get_log_message(self.title))
return result
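A hedged usage sketch; the site and page title are placeholders:

```python
# Illustrative call of copyvio_check(); site/title are placeholders.
page = site.get_page("Example article")
result = page.copyvio_check(min_confidence=0.75, max_queries=10, max_time=45)
print(result.get_log_message(page.title))
if result.violation:
    print(f"Suspected violation (confidence {result.confidence:.2f})")
```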

def copyvio_compare(
self,
urls: list[str] | str,
min_confidence: float = 0.75,
max_time: float = 30,
degree: int = DEFAULT_DEGREE,
) -> CopyvioCheckResult:
"""
Check the page, like :py:meth:`copyvio_check`, against specific URLs.

This is essentially a reduced version of :meth:`copyvio_check` - a copyvio
comparison is made using Markov chains and the result is returned in a
:class:`.CopyvioCheckResult` object - but without using a search engine, since
the suspected "violated" URL is supplied from the start.

One use case is to generate a result when the URL is retrieved from a cache,
like the one used in EarwigBot's Toolforge site. After a search is done, the
resulting URL is stored in a cache for 72 hours so future checks against that
page will not require another set of time-and-money-consuming search engine
queries. However, the comparison itself (which includes the article's and the
source's content) cannot be stored for data retention reasons, so a fresh
comparison is made using this function.

Since no searching is done, neither :exc:`.UnknownSearchEngineError` nor
:exc:`.SearchQueryError` will be raised.
"""
if not isinstance(urls, list):
urls = [urls]
self._logger.info(
f"Starting copyvio compare for [[{self.title}]] against {', '.join(urls)}"
)
checker = CopyvioChecker(
self,
min_confidence=min_confidence,
max_time=max_time,
degree=degree,
logger=self._logger,
)

result = checker.run_compare(urls)
self._logger.info(result.get_log_message(self.title))
return result
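And a matching sketch for the cache-driven comparison path; `page` is as in the previous sketch and the cached URL is a placeholder:

```python
# Re-check the page against a URL recovered from a cache; the URL is a placeholder.
cached_url = "https://example.org/suspected-source"
result = page.copyvio_compare(cached_url, min_confidence=0.75, max_time=30)
print(result.get_log_message(page.title))
```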
