Commit v3.0.0

BBC-Esq authored Dec 27, 2023
1 parent b36e176 commit 0668baf
Showing 25 changed files with 1,220 additions and 174 deletions.
2 changes: 0 additions & 2 deletions src/bark_module.py
@@ -116,8 +116,6 @@ def play_audio_thread(self):
finally:
p.terminate()

# print attributes of the BarkModel instance
# print(dir(self.model))
self.release_resources()

def process_text_thread(self):
50 changes: 45 additions & 5 deletions src/choose_documents.py
@@ -1,22 +1,62 @@
import subprocess
import os
from pathlib import Path
from PySide6.QtWidgets import QApplication, QFileDialog
from PySide6.QtWidgets import QApplication, QFileDialog, QDialog, QVBoxLayout, QTextEdit, QPushButton, QHBoxLayout
import sys

def choose_documents_directory():
allowed_extensions = ['.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx', '.rtf', '.odt',
'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
current_dir = Path(__file__).parent.resolve()
docs_folder = current_dir / "Docs_for_DB"
images_folder = current_dir / "Images_for_DB"
file_dialog = QFileDialog()
file_dialog.setFileMode(QFileDialog.ExistingFiles)
file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents for Database", str(current_dir))
file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))

if file_paths:
docs_folder.mkdir(parents=True, exist_ok=True)
incompatible_files = []
compatible_files = []

for file_path in file_paths:
symlink_target = docs_folder / Path(file_path).name
symlink_target.symlink_to(file_path)
extension = Path(file_path).suffix.lower()
if extension in allowed_extensions:
if extension in ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']:
target_folder = images_folder
else:
target_folder = docs_folder
target_folder.mkdir(parents=True, exist_ok=True)
symlink_target = target_folder / Path(file_path).name
symlink_target.symlink_to(file_path)
else:
incompatible_files.append(Path(file_path).name)

if incompatible_files:
dialog = QDialog()
dialog.setWindowTitle("Incompatible Files Detected")
layout = QVBoxLayout()

text_edit = QTextEdit()
text_edit.setReadOnly(True)
text_edit.setText("One or more of the selected files are not compatible with the database. Click 'OK' to add only the compatible files or 'Cancel' to back out:\n\n" + "\n".join(incompatible_files))
layout.addWidget(text_edit)

button_box = QHBoxLayout()
ok_button = QPushButton("OK")
cancel_button = QPushButton("Cancel")
button_box.addWidget(ok_button)
button_box.addWidget(cancel_button)
layout.addLayout(button_box)

dialog.setLayout(layout)

ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)

user_choice = dialog.exec()

if user_choice == QDialog.Rejected:
return

def see_documents_directory():
current_dir = Path(__file__).parent.resolve()
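
The routing rule added above can be summarized as a small standalone helper. The following is a minimal sketch for illustration only (route_file is a hypothetical name, not part of the commit); it returns the folder a given file would be symlinked into, or None when the extension is unsupported:

from pathlib import Path

IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'}
DOCUMENT_EXTENSIONS = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg',
                       '.csv', '.xls', '.xlsx', '.rtf', '.odt'}

def route_file(file_path: str, docs_folder: Path, images_folder: Path) -> Path | None:
    # Mirror the dialog's logic: images and documents are separated by extension.
    extension = Path(file_path).suffix.lower()
    if extension in IMAGE_EXTENSIONS:
        return images_folder
    if extension in DOCUMENT_EXTENSIONS:
        return docs_folder
    return None  # incompatible; the GUI collects these names for the warning dialog

For example, route_file("photo.JPG", Path("Docs_for_DB"), Path("Images_for_DB")) returns the images folder, matching the symlink target chosen in choose_documents_directory().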
9 changes: 0 additions & 9 deletions src/constants.py
@@ -17,15 +17,6 @@
},
'model': 'BAAI/bge-base-en-v1.5'
},
{
'details': {
'description': 'Well rounded & slight RAG improvement.',
'dimensions': 768,
'max_sequence': 512,
'size_mb': 438
},
'model': 'BAAI/llm-embedder'
},
{
'details': {
'description': 'Well rounded & customizable.',
5 changes: 2 additions & 3 deletions src/create_database.py
@@ -38,11 +38,10 @@ def main():
EMBEDDING_MODEL_NAME = config_data.get("EMBEDDING_MODEL_NAME")

my_cprint(f"Loading documents.", "white")
documents = load_documents(SOURCE_DIRECTORY) # First invocation of document_processor.py script
documents = load_documents(SOURCE_DIRECTORY) # invoke document_processor.py; returns a list of document objects
my_cprint(f"Successfully loaded documents.", "white")

texts = split_documents(documents) # Second invocation of document_processor.py script
my_cprint(f"Successfully split documents.", "white")
texts = split_documents(documents) # invoke document_processor.py again; returns a list of split document objects

embeddings = get_embeddings(EMBEDDING_MODEL_NAME, config_data)
my_cprint("Embedding model loaded.", "green")
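
For context, a minimal sketch (assuming it runs from src/ with document_processor.py importable and Docs_for_DB populated; not the commit's code) of how the two document_processor.py invocations compose before embedding:

from pathlib import Path
from document_processor import load_documents, split_documents

SOURCE_DIRECTORY = Path(__file__).parent / "Docs_for_DB"

documents = load_documents(SOURCE_DIRECTORY)  # one Document object per source file
texts = split_documents(documents)            # each Document split into chunk-sized Documents
print(f"{len(documents)} files -> {len(texts)} chunks")

The chunks in texts are what get passed to the embedding model; splitting first keeps each embedded passage within the model's max_sequence limit.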
87 changes: 58 additions & 29 deletions src/document_processor.py
@@ -20,19 +20,20 @@
)

from constants import DOCUMENT_LOADERS
from loader_vision_llava import llava_process_images
from loader_vision_cogvlm import cogvlm_process_images

ENABLE_PRINT = True
ROOT_DIRECTORY = Path(__file__).parent
SOURCE_DIRECTORY = ROOT_DIRECTORY / "Docs_for_DB"
INGEST_THREADS = os.cpu_count() or 8

def my_cprint(*args, **kwargs):
if ENABLE_PRINT:
filename = "document_processor.py"
modified_message = f"{filename}: {args[0]}"
cprint(modified_message, *args[1:], **kwargs)

ROOT_DIRECTORY = Path(__file__).parent
SOURCE_DIRECTORY = ROOT_DIRECTORY / "Docs_for_DB"
INGEST_THREADS = os.cpu_count() or 8

for ext, loader_name in DOCUMENT_LOADERS.items():
DOCUMENT_LOADERS[ext] = globals()[loader_name]

@@ -41,19 +42,27 @@ def my_cprint(*args, **kwargs):
UnstructuredODTLoader, UnstructuredMarkdownLoader,
UnstructuredExcelLoader, UnstructuredCSVLoader
)
from pathlib import Path
from langchain.docstore.document import Document
# Other necessary imports...

def process_images_wrapper(config):
chosen_model = config["vision"]["chosen_model"]
if chosen_model == 'llava' or chosen_model == 'bakllava':
return llava_process_images()
elif chosen_model == 'cogvlm':
return cogvlm_process_images()
else:
return []

def load_single_document(file_path: Path) -> Document:
file_extension = file_path.suffix.lower()
loader_class = DOCUMENT_LOADERS.get(file_extension)

if loader_class:
if file_extension == ".txt":
loader = loader_class(str(file_path), encoding='utf-8')
loader = loader_class(str(file_path), encoding='utf-8', autodetect_encoding=True)
elif file_extension == ".epub":
loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".docx":
loader = Docx2txtLoader(str(file_path))  # Docx2txtLoader takes only the file path
elif file_extension == ".rtf":
loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".odt":
@@ -71,40 +80,55 @@ def load_single_document(file_path: Path) -> Document:

document = loader.load()[0]

'''
# Write the content to a .txt file
with open("output_load_single_document.txt", "w", encoding="utf-8") as output_file:
output_file.write(document.page_content)
'''
# with open("output_load_single_document.txt", "w", encoding="utf-8") as output_file:
# output_file.write(document.page_content)

# text extracted before metadata added
return document

def load_document_batch(filepaths):
with ThreadPoolExecutor(len(filepaths)) as exe:
futures = [exe.submit(load_single_document, name) for name in filepaths]
data_list = [future.result() for future in futures]
return (data_list, filepaths)
return (data_list, filepaths) # data_list = list of Document objects created by load_single_document

def load_documents(source_dir: Path) -> list[Document]:
all_files = list(source_dir.iterdir())
paths = [f for f in all_files if f.suffix in DOCUMENT_LOADERS.keys()]

n_workers = min(INGEST_THREADS, max(len(paths), 1))
my_cprint(f"Number of workers assigned: {n_workers}", "white")
chunksize = round(len(paths) / n_workers)

if chunksize == 0:
raise ValueError(f"chunksize must be a non-zero integer, but got {chunksize}. len(paths): {len(paths)}, n_workers: {n_workers}")

docs = []

if paths:
n_workers = min(INGEST_THREADS, max(len(paths), 1))
my_cprint(f"Number of workers assigned: {n_workers}", "white")
chunksize = round(len(paths) / n_workers)

if chunksize == 0:
raise ValueError(f"chunksize must be a non-zero integer, but got {chunksize}. len(paths): {len(paths)}, n_workers: {n_workers}")

with ProcessPoolExecutor(n_workers) as executor:
futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
for future in as_completed(futures):
contents, _ = future.result()
docs.extend(contents)
my_cprint(f"Number of NON-IMAGE files loaded: {len(docs)}", "yellow")

additional_docs = []

with ProcessPoolExecutor(n_workers) as executor:
futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
for future in as_completed(futures):
contents, _ = future.result()
docs.extend(contents)
my_cprint(f"Number of files loaded: {len(docs)}", "white")

return docs # end of first invocation by create_database.py
my_cprint(f"Loading images, if any.", "yellow")

with open("config.yaml", "r") as config_file:
config = yaml.safe_load(config_file)

# Use ProcessPoolExecutor to run the selected image processing function in a separate process
with ProcessPoolExecutor(1) as executor:
future = executor.submit(process_images_wrapper, config)
processed_docs = future.result() # Get the result from the future
additional_docs = processed_docs if processed_docs is not None else []

docs.extend(additional_docs) # Add to pre-existing list

return docs

def split_documents(documents):
my_cprint(f"Splitting documents.", "white")
@@ -131,3 +155,8 @@ def split_documents(documents):
my_cprint(f"Chunks between {lower_bound} and {upper_bound} characters: {count}", "white")

return texts

'''
# document object structure: Document(page_content="[ALL TEXT EXTRACTED]", metadata={'source': '[FULL FILE PATH WITH DOUBLE BACKSLASHES]'})
# list structure: [Document(page_content="...", metadata={'source': '...'}), Document(page_content="...", metadata={'source': '...'})]
'''
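
To make the comment above concrete, here is an illustrative construction of the same structures (hypothetical values, not from the commit; the import path matches the one used in this file):

from langchain.docstore.document import Document

doc = Document(
    page_content="all text extracted from one file",
    metadata={"source": "C:\\Users\\me\\Docs_for_DB\\report.pdf"},  # double backslashes on Windows paths
)
docs = [doc]  # load_documents() returns a list of such Document objects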