Skip to content

Commit

Permalink
revert
Browse files Browse the repository at this point in the history
Turns out unstructured doc/docx requires LibreOffice... reverting, fixing bug, and removing .doc support.
  • Loading branch information
BBC-Esq authored Jan 7, 2024
1 parent 7810bfa commit 9866ca0
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 4 deletions.
3 changes: 1 addition & 2 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,7 @@

DOCUMENT_LOADERS = {
".pdf": "PyMuPDFLoader",
".docx": "UnstructuredWordDocumentLoader",
".doc": "UnstructuredWordDocumentLoader",
".docx": "Docx2txtLoader",
".txt": "TextLoader",
".enex": "EverNoteLoader",
".epub": "UnstructuredEPubLoader",
Expand Down
4 changes: 2 additions & 2 deletions src/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
PyMuPDFLoader,
UnstructuredWordDocumentLoader,
Docx2txtLoader,
TextLoader,
EverNoteLoader,
UnstructuredEPubLoader,
Expand Down Expand Up @@ -67,7 +67,7 @@ def load_single_document(file_path: Path) -> Document:
elif file_extension == ".epub":
loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".docx":
loader = UnstructuredWordDocumentLoader(str(file_path), mode="single", strategy="fast")
loader = Docx2txtLoader(str(file_path))
elif file_extension == ".rtf":
loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".odt":
Expand Down

0 comments on commit 9866ca0

Please sign in to comment.