fix: 1.x - nltk upgrade, use nltk.download('punkt_tab') (#8256)
* Use nltk.download('punkt_tab'), pin nltk>=3.9.1

* Add reno note

* Pin urllib3<2.0.0

* Exp with deps

* Updates

* Updates

* Pin python-pptx<=1.0

* Skip tests with old nltk pickle model files

* Update reno note

* Ignore tokenizer_model_folder nltk parameter

* Add upgrade section for the release note

* mypy fixes

* Update pyproject.toml

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update release notes

* Use PunktTokenizer instead of nltk.data.load

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
vblagoje and anakin87 authored Aug 29, 2024
1 parent 883cd46 commit 8c95fab
Showing 4 changed files with 26 additions and 8 deletions.
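
The central change, per the commit message, replaces pickle-based punkt loading via nltk.data.load with the PunktTokenizer class (new in nltk 3.9), backed by the pickle-free punkt_tab data package. A minimal sketch of the old and new usage, assuming nltk>=3.9.1 is installed:

import nltk
from nltk.tokenize.punkt import PunktTokenizer

# Old (nltk < 3.9): pickle-based model files, affected by CVE-2024-39705
# nltk.download("punkt")
# tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# New (nltk >= 3.9.1): tab-separated parameter files, no pickle deserialization
nltk.download("punkt_tab")
tokenizer = PunktTokenizer("english")
print(tokenizer.tokenize("Dr. Smith arrived. He was late."))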
16 changes: 12 additions & 4 deletions haystack/nodes/preprocessor/preprocessor.py
@@ -28,6 +28,7 @@
 
 with LazyImport("Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk'") as nltk_import:
     import nltk
+    from nltk.tokenize.punkt import PunktTokenizer
 
 iso639_to_nltk = {
     "ru": "russian",
@@ -120,10 +121,18 @@ def __init__(
             nltk.data.find("tokenizers/punkt")
         except LookupError:
             try:
-                nltk.download("punkt")
+                nltk.download("punkt_tab")
             except FileExistsError as error:
                 logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
                 pass
+
+        if tokenizer_model_folder is not None:
+            warnings.warn(
+                "Custom NLTK tokenizers are no longer allowed. "
+                "The 'tokenizer_model_folder' parameter will be ignored. "
+                "Please use the built-in nltk tokenizers instead by specifying the `language` parameter."
+            )
+            self.tokenizer_model_folder = None
         self.clean_whitespace = clean_whitespace
         self.clean_header_footer = clean_header_footer
         self.clean_empty_lines = clean_empty_lines
@@ -134,7 +143,6 @@
         self.split_respect_sentence_boundary = split_respect_sentence_boundary
         self.tokenizer = tokenizer
         self.language = language
-        self.tokenizer_model_folder = tokenizer_model_folder
         self.print_log: Set[str] = set()
         self.id_hash_keys = id_hash_keys
         self.progress_bar = progress_bar
@@ -922,14 +930,14 @@ def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokeni
 
         # Use a default NLTK model
         elif language_name is not None:
-            sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
+            sentence_tokenizer = PunktTokenizer(language_name)
         else:
             logger.error(
                 "PreProcessor couldn't find the default sentence tokenizer model for %s. "
                 " Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
                 self.language,
             )
-            sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
+            sentence_tokenizer = PunktTokenizer() # default english model
 
         return sentence_tokenizer
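
A rough sketch (not the exact Haystack code) of what the loader above does: map an ISO 639 code through iso639_to_nltk to a language-specific punkt_tab model, falling back to English as in the logger.error branch. The abbreviated mapping below is illustrative only:

from typing import Optional

from nltk.tokenize.punkt import PunktTokenizer

iso639_to_nltk = {"ru": "russian", "en": "english", "de": "german"}  # abbreviated

def load_sentence_tokenizer(language: Optional[str]) -> PunktTokenizer:
    language_name = iso639_to_nltk.get(language) if language else None
    if language_name is not None:
        return PunktTokenizer(language_name)
    # Fallback mirrors the logger.error branch above: use the default English model
    return PunktTokenizer()

sentences = load_sentence_tokenizer("de").tokenize("Erster Satz. Zweiter Satz.")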

8 changes: 4 additions & 4 deletions pyproject.toml
@@ -87,7 +87,7 @@ dependencies = [
 [project.optional-dependencies]
 inference = [
     "transformers[torch,sentencepiece]==4.39.3",
-    "sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
+    "sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
     "huggingface-hub>=0.5.0",
 ]
 elasticsearch = [
@@ -147,13 +147,13 @@ crawler = [
     "selenium>=4.11.0"
 ]
 preprocessing = [
-    "nltk",
+    "nltk>=3.9.1",
     "langdetect", # for language classification
 ]
 file-conversion = [
     "azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table exctrator)
     "python-docx",
-    "python-pptx",
+    "python-pptx<=1.0",
     "tika", # Apache Tika (text & metadata extractor)
     "beautifulsoup4",
     "markdown",
@@ -191,7 +191,7 @@ colab = [
 dev = [
     "pre-commit",
     # Type check
-    "mypy",
+    "mypy==1.10.0",
     # Test
     "pytest",
     "pytest-cov",
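
For anyone verifying a local environment against the pins above, a small hedged check (assumes the packaging library is available):

import nltk
from packaging.version import Version

# pyproject.toml now requires nltk>=3.9.1; older releases are affected by CVE-2024-39705
assert Version(nltk.__version__) >= Version("3.9.1"), "nltk is older than 3.9.1"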
9 changes: 9 additions & 0 deletions releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml
@@ -0,0 +1,9 @@
+fixes:
+  - |
+    Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705.
+upgrade:
+  - |
+    Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Due to these security vulnerabilities, it is not possible to use custom NLTK tokenizer models with the new version (for example in PreProcessor). Users can still use built-in nltk tokenizers by specifying the language parameter in the PreProcessor. See the PreProcessor documentation for more details.
+enhancements:
+  - |
+    Pins sentence-transformers<=3.0.0,>=2.3.1 and python-pptx<=1.0 to avoid some minor typing incompatibilities with the newer versions of the respective libraries.
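
As a sketch of the migration the upgrade note describes (assuming the farm-haystack 1.x PreProcessor API), drop the now-ignored tokenizer_model_folder and select a built-in model via the language parameter:

from haystack import Document
from haystack.nodes import PreProcessor

# Before (no longer supported; the parameter is now ignored with a warning):
# preprocessor = PreProcessor(tokenizer_model_folder="my_tokenizers/", language="pt")

# After: rely on the built-in punkt_tab models via the language parameter
preprocessor = PreProcessor(
    split_by="sentence", split_length=1, split_respect_sentence_boundary=False, language="de"
)
docs = preprocessor.process([Document(content="Erster Satz. Zweiter Satz.")])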
1 change: 1 addition & 0 deletions test/nodes/test_preprocessor.py
@@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_leng
 
 @pytest.mark.unit
 @pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
+@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9, can't load this model pt anymore")
 def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path):
     split_length, expected_documents_count = split_length_and_results
 
