From 7cab90f9fb14a7727225c5ca6cadafd9bf2c2f54 Mon Sep 17 00:00:00 2001 From: Simon Suo Date: Fri, 3 Nov 2023 16:45:26 -0700 Subject: [PATCH] Fix nltk bug in multi-threaded environments (#8668) wip --- llama_index/text_splitter/utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/llama_index/text_splitter/utils.py b/llama_index/text_splitter/utils.py index 11ccc62ef403b..e959d723f1428 100644 --- a/llama_index/text_splitter/utils.py +++ b/llama_index/text_splitter/utils.py @@ -1,7 +1,10 @@ +import logging from typing import Callable, List from llama_index.text_splitter.types import TextSplitter +logger = logging.getLogger(__name__) + def truncate_text(text: str, text_splitter: TextSplitter) -> str: """Truncate text to fit within the chunk size.""" @@ -46,7 +49,14 @@ def split_by_sentence_tokenizer() -> Callable[[str], List[str]]: try: nltk.data.find("tokenizers/punkt") except LookupError: - nltk.download("punkt", download_dir=nltk_data_dir) + try: + nltk.download("punkt", download_dir=nltk_data_dir) + except FileExistsError: + logger.info( + "Tried to re-download NLTK files but already exists. " + "This could happen in multi-theaded deployments, " + "should be benign" + ) tokenizer = nltk.tokenize.PunktSentenceTokenizer()