From 9866ca0507f5f6c0de8241388d7af664a371cc91 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sun, 7 Jan 2024 13:10:45 -0500 Subject: [PATCH] revert Turns out unstructured doc/docx requires LibreOffice...reverting, fixing bug, and removing .doc support. --- src/constants.py | 3 +-- src/document_processor.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/constants.py b/src/constants.py index 65b3cf90..7268cfbf 100644 --- a/src/constants.py +++ b/src/constants.py @@ -273,8 +273,7 @@ DOCUMENT_LOADERS = { ".pdf": "PyMuPDFLoader", - ".docx": "UnstructuredWordDocumentLoader", - ".doc": "UnstructuredWordDocumentLoader", + ".docx": "Docx2txtLoader", ".txt": "TextLoader", ".enex": "EverNoteLoader", ".epub": "UnstructuredEPubLoader", diff --git a/src/document_processor.py b/src/document_processor.py index 3742fa78..3f6e9dde 100644 --- a/src/document_processor.py +++ b/src/document_processor.py @@ -7,7 +7,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.document_loaders import ( PyMuPDFLoader, - UnstructuredWordDocumentLoader, + Docx2txtLoader, TextLoader, EverNoteLoader, UnstructuredEPubLoader, @@ -67,7 +67,7 @@ def load_single_document(file_path: Path) -> Document: elif file_extension == ".epub": loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast") elif file_extension == ".docx": - loader = UnstructuredWordDocumentLoader(str(file_path), mode="single", strategy="fast") + loader = Docx2txtLoader(str(file_path)) elif file_extension == ".rtf": loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast") elif file_extension == ".odt":