From 7cab90f9fb14a7727225c5ca6cadafd9bf2c2f54 Mon Sep 17 00:00:00 2001
From: Simon Suo <simonsdsuo@gmail.com>
Date: Fri, 3 Nov 2023 16:45:26 -0700
Subject: [PATCH] Fix nltk bug in multi-threaded environments (#8668)

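Catch the FileExistsError that nltk.download("punkt") can raise when the
data is already present (for example, when several threads attempt the
download at once), log it at info level, and continue instead of failing.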
---
 llama_index/text_splitter/utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llama_index/text_splitter/utils.py b/llama_index/text_splitter/utils.py
index 11ccc62ef403b..e959d723f1428 100644
--- a/llama_index/text_splitter/utils.py
+++ b/llama_index/text_splitter/utils.py
@@ -1,7 +1,10 @@
+import logging
 from typing import Callable, List
 
 from llama_index.text_splitter.types import TextSplitter
 
+logger = logging.getLogger(__name__)
+
 
 def truncate_text(text: str, text_splitter: TextSplitter) -> str:
     """Truncate text to fit within the chunk size."""
@@ -46,7 +49,14 @@ def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
-        nltk.download("punkt", download_dir=nltk_data_dir)
+        try:
+            nltk.download("punkt", download_dir=nltk_data_dir)
+        except FileExistsError:
+            logger.info(
+                "Tried to re-download NLTK files but already exists. "
+                "This could happen in multi-theaded deployments, "
+                "should be benign"
+            )
 
     tokenizer = nltk.tokenize.PunktSentenceTokenizer()