From 9866ca0507f5f6c0de8241388d7af664a371cc91 Mon Sep 17 00:00:00 2001
From: BBC-Esq <bbc@chintellalaw.com>
Date: Sun, 7 Jan 2024 13:10:45 -0500
Subject: [PATCH] revert

Turns out unstructured doc/docx requires LibreOffice... reverting, fixing bug, and removing .doc support.
---
 src/constants.py          | 3 +--
 src/document_processor.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/constants.py b/src/constants.py
index 65b3cf90..7268cfbf 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -273,8 +273,7 @@
 
 DOCUMENT_LOADERS = {
     ".pdf": "PyMuPDFLoader",
-    ".docx": "UnstructuredWordDocumentLoader",
-    ".doc": "UnstructuredWordDocumentLoader",
+    ".docx": "Docx2txtLoader",
     ".txt": "TextLoader",
     ".enex": "EverNoteLoader",
     ".epub": "UnstructuredEPubLoader",
diff --git a/src/document_processor.py b/src/document_processor.py
index 3742fa78..3f6e9dde 100644
--- a/src/document_processor.py
+++ b/src/document_processor.py
@@ -7,7 +7,7 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import (
     PyMuPDFLoader,
-    UnstructuredWordDocumentLoader,
+    Docx2txtLoader,
     TextLoader,
     EverNoteLoader,
     UnstructuredEPubLoader,
@@ -67,7 +67,7 @@ def load_single_document(file_path: Path) -> Document:
         elif file_extension == ".epub":
             loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast")
         elif file_extension == ".docx":
-            loader = UnstructuredWordDocumentLoader(str(file_path), mode="single", strategy="fast")
+            loader = Docx2txtLoader(str(file_path))
         elif file_extension == ".rtf":
             loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast")
         elif file_extension == ".odt":