Browse Source

Fixes for nltk usage in copyvios

main
Ben Kurtovic 1 month ago
parent
commit
dfae10cf12
2 changed files with 7 additions and 9 deletions
  1. +1
    -1
      pyproject.toml
  2. +6
    -8
      src/earwigbot/wiki/copyvios/parsers.py

+ 1
- 1
pyproject.toml View File

@@ -37,7 +37,7 @@ copyvios = [
"beautifulsoup4 >= 4.9.3", # Parsing/scraping HTML
"charset_normalizer >= 3.3.2", # Encoding detection for BeautifulSoup
"lxml >= 4.6.3", # Faster parser for BeautifulSoup
- "nltk >= 3.6.1", # Parsing sentences to split article content
+ "nltk >= 3.9.1", # Parsing sentences to split article content
"pdfminer >= 20191125", # Extracting text from PDF files
"tldextract >= 3.1.0", # Getting domains for the multithreaded workers
]


+ 6
- 8
src/earwigbot/wiki/copyvios/parsers.py View File

@@ -95,19 +95,17 @@ class ArticleParser:

def _get_tokenizer(self) -> Any:
"""Return a NLTK punctuation tokenizer for the article's language."""
- import nltk
+ os.environ["NLTK_DATA"] = self._nltk_dir # Yuck!

- def datafile(lang: str) -> str:
- return "file:" + os.path.join(
- self._nltk_dir, "tokenizers", "punkt", lang + ".pickle"
- )
+ import nltk
+ from nltk.tokenize import PunktTokenizer

lang = self.NLTK_LANGS.get(self._lang, self.NLTK_DEFAULT)
try:
- nltk.data.load(datafile(self.NLTK_DEFAULT))
+ PunktTokenizer(lang)
except LookupError:
- nltk.download("punkt", self._nltk_dir)
- return nltk.data.load(datafile(lang))
+ nltk.download("punkt_tab", self._nltk_dir)
+ return PunktTokenizer(lang)

def _get_sentences(
self, min_query: int, max_query: int, split_thresh: int


Loading…
Cancel
Save