Fixes for nltk usage in copyvios

2 months ago · dfae10cf12
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ copyvios = [
    "beautifulsoup4 >= 4.9.3",  # Parsing/scraping HTML
    "charset_normalizer >= 3.3.2",  # Encoding detection for BeautifulSoup
    "lxml >= 4.6.3",  # Faster parser for BeautifulSoup
    "nltk >= 3.6.1",  # Parsing sentences to split article content
    "nltk >= 3.9.1",  # Parsing sentences to split article content
    "pdfminer >= 20191125",  # Extracting text from PDF files
    "tldextract >= 3.1.0",  # Getting domains for the multithreaded workers
 ]
--- a/src/earwigbot/wiki/copyvios/parsers.py
+++ b/src/earwigbot/wiki/copyvios/parsers.py
@@ -95,19 +95,17 @@ class ArticleParser:

    def _get_tokenizer(self) -> Any:
        """Return a NLTK punctuation tokenizer for the article's language.

        The article language (``self._lang``) is mapped to an NLTK language
        name through ``self.NLTK_LANGS``, falling back to
        ``self.NLTK_DEFAULT`` when there is no entry for it. If the Punkt
        data for that language cannot be found, the ``punkt_tab`` package is
        downloaded into ``self._nltk_dir`` and the tokenizer is built again.

        Returns:
            A ``nltk.tokenize.PunktTokenizer`` for the selected language.

        Raises:
            LookupError: If the tokenizer data is still unavailable after the
                download attempt.
        """
        # NLTK builds its data search path from NLTK_DATA when the package is
        # imported, so this must be set before the imports below.
        os.environ["NLTK_DATA"] = self._nltk_dir  # Yuck!

        import nltk
        from nltk.tokenize import PunktTokenizer

        lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT)
        try:
            # Probe only: constructing the tokenizer raises LookupError when
            # the punkt_tab data for this language is not installed.
            PunktTokenizer(lang)
        except LookupError:
            # punkt_tab is the non-pickle replacement for the old "punkt"
            # models, which recent NLTK releases no longer load.
            nltk.download("punkt_tab", self._nltk_dir)
        return PunktTokenizer(lang)

    def _get_sentences(
        self, min_query: int, max_query: int, split_thresh: int