diff --git a/pyproject.toml b/pyproject.toml
index e34ed43..18e76be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ copyvios = [
     "beautifulsoup4 >= 4.9.3",  # Parsing/scraping HTML
     "charset_normalizer >= 3.3.2",  # Encoding detection for BeautifulSoup
     "lxml >= 4.6.3",  # Faster parser for BeautifulSoup
-    "nltk >= 3.6.1",  # Parsing sentences to split article content
+    "nltk >= 3.9.1",  # Parsing sentences to split article content
     "pdfminer >= 20191125",  # Extracting text from PDF files
     "tldextract >= 3.1.0",  # Getting domains for the multithreaded workers
 ]
diff --git a/src/earwigbot/wiki/copyvios/parsers.py b/src/earwigbot/wiki/copyvios/parsers.py
index dc8fcad..c66b9e2 100644
--- a/src/earwigbot/wiki/copyvios/parsers.py
+++ b/src/earwigbot/wiki/copyvios/parsers.py
@@ -95,19 +95,24 @@ class ArticleParser:
 
     def _get_tokenizer(self) -> Any:
-        """Return a NLTK punctuation tokenizer for the article's language."""
-        import nltk
+        """Return a NLTK punctuation tokenizer for the article's language.
+
+        If the tokenizer data cannot be found, the ``punkt_tab`` package is
+        downloaded into ``self._nltk_dir`` and loading is retried once.
+        """
+        # nltk reads NLTK_DATA when it is imported, so set it beforehand.
+        os.environ["NLTK_DATA"] = self._nltk_dir  # Yuck!
 
-        def datafile(lang: str) -> str:
-            return "file:" + os.path.join(
-                self._nltk_dir, "tokenizers", "punkt", lang + ".pickle"
-            )
+        import nltk
+        from nltk.tokenize import PunktTokenizer
 
         lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT)
         try:
-            nltk.data.load(datafile(self.NLTK_DEFAULT))
+            # Return the probe result directly instead of discarding it and
+            # loading the same tokenizer data a second time.
+            return PunktTokenizer(lang)
         except LookupError:
-            nltk.download("punkt", self._nltk_dir)
-        return nltk.data.load(datafile(lang))
+            nltk.download("punkt_tab", self._nltk_dir)
+            return PunktTokenizer(lang)
 
     def _get_sentences(
         self, min_query: int, max_query: int, split_thresh: int