From dfae10cf12e46d6ad3c33312fdc7bd4649be47c3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 3 Nov 2024 23:09:01 -0500
Subject: [PATCH] Fixes for nltk usage in copyvios

---
 pyproject.toml                         |  2 +-
 src/earwigbot/wiki/copyvios/parsers.py | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e34ed43..18e76be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ copyvios = [
     "beautifulsoup4 >= 4.9.3",  # Parsing/scraping HTML
     "charset_normalizer >= 3.3.2",  # Encoding detection for BeautifulSoup
     "lxml >= 4.6.3",  # Faster parser for BeautifulSoup
-    "nltk >= 3.6.1",  # Parsing sentences to split article content
+    "nltk >= 3.9.1",  # Parsing sentences to split article content
     "pdfminer >= 20191125",  # Extracting text from PDF files
     "tldextract >= 3.1.0",  # Getting domains for the multithreaded workers
 ]
diff --git a/src/earwigbot/wiki/copyvios/parsers.py b/src/earwigbot/wiki/copyvios/parsers.py
index dc8fcad..c66b9e2 100644
--- a/src/earwigbot/wiki/copyvios/parsers.py
+++ b/src/earwigbot/wiki/copyvios/parsers.py
@@ -95,19 +95,17 @@ class ArticleParser:
 
     def _get_tokenizer(self) -> Any:
         """Return a NLTK punctuation tokenizer for the article's language."""
-        import nltk
+        os.environ["NLTK_DATA"] = self._nltk_dir  # Yuck!
 
-        def datafile(lang: str) -> str:
-            return "file:" + os.path.join(
-                self._nltk_dir, "tokenizers", "punkt", lang + ".pickle"
-            )
+        import nltk
+        from nltk.tokenize import PunktTokenizer
 
         lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT)
         try:
-            nltk.data.load(datafile(self.NLTK_DEFAULT))
+            PunktTokenizer(lang)
         except LookupError:
-            nltk.download("punkt", self._nltk_dir)
-        return nltk.data.load(datafile(lang))
+            nltk.download("punkt_tab", self._nltk_dir)
+        return PunktTokenizer(lang)
 
     def _get_sentences(
         self, min_query: int, max_query: int, split_thresh: int
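
Note (not part of the patch): a minimal standalone sketch of the lookup-then-download flow the hunk above introduces, assuming nltk >= 3.9 is installed. The directory name below is hypothetical; the bot itself uses self._nltk_dir from its configuration.

    import os

    nltk_dir = "/tmp/nltk-data"          # hypothetical path; the bot passes self._nltk_dir
    os.environ["NLTK_DATA"] = nltk_dir   # must be set before nltk is imported

    import nltk
    from nltk.tokenize import PunktTokenizer

    try:
        tokenizer = PunktTokenizer("english")
    except LookupError:
        # nltk >= 3.9 ships sentence models as "punkt_tab" rather than pickled "punkt"
        nltk.download("punkt_tab", nltk_dir)
        tokenizer = PunktTokenizer("english")

    print(tokenizer.tokenize("First sentence. Second sentence."))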