Skip to content

Commit

Permalink
Fix nltk bug in multi-threaded environments (run-llama#8668)
Browse files: browse the repository at this point in the history
wip
  • Loading branch information
Disiok authored Nov 3, 2023
1 parent d494329 commit 7cab90f
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion llama_index/text_splitter/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import logging
from typing import Callable, List

from llama_index.text_splitter.types import TextSplitter

logger = logging.getLogger(__name__)


def truncate_text(text: str, text_splitter: TextSplitter) -> str:
"""Truncate text to fit within the chunk size."""
Expand Down Expand Up @@ -46,7 +49,14 @@ def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
try:
nltk.data.find("tokenizers/punkt")
except LookupError:
nltk.download("punkt", download_dir=nltk_data_dir)
try:
nltk.download("punkt", download_dir=nltk_data_dir)
except FileExistsError:
logger.info(
"Tried to re-download NLTK files but already exists. "
"This could happen in multi-theaded deployments, "
"should be benign"
)

tokenizer = nltk.tokenize.PunktSentenceTokenizer()

Expand Down

0 comments on commit 7cab90f

Please sign in to comment.