From cbd47667ef1866d2c2fa27d1ea3c0d9a0c2cfd17 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 6 Jan 2024 19:31:19 -0500 Subject: [PATCH] correct name of doc/docx loader --- src/constants.py | 3 ++- src/document_processor.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/constants.py b/src/constants.py index 7268cfbf..65b3cf90 100644 --- a/src/constants.py +++ b/src/constants.py @@ -273,7 +273,8 @@ DOCUMENT_LOADERS = { ".pdf": "PyMuPDFLoader", - ".docx": "Docx2txtLoader", + ".docx": "UnstructuredWordDocumentLoader", + ".doc": "UnstructuredWordDocumentLoader", ".txt": "TextLoader", ".enex": "EverNoteLoader", ".epub": "UnstructuredEPubLoader", diff --git a/src/document_processor.py b/src/document_processor.py index bdf14e9e..3742fa78 100644 --- a/src/document_processor.py +++ b/src/document_processor.py @@ -7,7 +7,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.document_loaders import ( PyMuPDFLoader, - Docx2txtLoader, + UnstructuredWordDocumentLoader, TextLoader, EverNoteLoader, UnstructuredEPubLoader, @@ -67,7 +67,7 @@ def load_single_document(file_path: Path) -> Document: elif file_extension == ".epub": loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast") elif file_extension == ".docx": - loader = Docx2txtLoader(str(file_path), mode="single", strategy="fast") + loader = UnstructuredWordDocumentLoader(str(file_path), mode="single", strategy="fast") elif file_extension == ".rtf": loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast") elif file_extension == ".odt":