Commit v3.0.0

BBC-Esq authored Dec 27, 2023
1 parent b36e176 commit 0668baf
Showing 25 changed files with 1,220 additions and 174 deletions.
2 changes: 0 additions & 2 deletions src/bark_module.py
@@ -116,8 +116,6 @@ def play_audio_thread(self):
finally:
p.terminate()

# print attributes of the BarkModel instance
# print(dir(self.model))
self.release_resources()

def process_text_thread(self):
50 changes: 45 additions & 5 deletions src/choose_documents.py
@@ -1,22 +1,62 @@
import subprocess
import os
from pathlib import Path
from PySide6.QtWidgets import QApplication, QFileDialog
from PySide6.QtWidgets import QApplication, QFileDialog, QDialog, QVBoxLayout, QTextEdit, QPushButton, QHBoxLayout
import sys

def choose_documents_directory():
allowed_extensions = ['.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx', '.rtf', '.odt',
'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
current_dir = Path(__file__).parent.resolve()
docs_folder = current_dir / "Docs_for_DB"
images_folder = current_dir / "Images_for_DB"
file_dialog = QFileDialog()
file_dialog.setFileMode(QFileDialog.ExistingFiles)
file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents for Database", str(current_dir))
file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))

if file_paths:
docs_folder.mkdir(parents=True, exist_ok=True)
incompatible_files = []
compatible_files = []

for file_path in file_paths:
symlink_target = docs_folder / Path(file_path).name
symlink_target.symlink_to(file_path)
extension = Path(file_path).suffix.lower()
if extension in allowed_extensions:
if extension in ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']:
target_folder = images_folder
else:
target_folder = docs_folder
target_folder.mkdir(parents=True, exist_ok=True)
symlink_target = target_folder / Path(file_path).name
symlink_target.symlink_to(file_path)
else:
incompatible_files.append(Path(file_path).name)

if incompatible_files:
dialog = QDialog()
dialog.setWindowTitle("Incompatible Files Detected")
layout = QVBoxLayout()

text_edit = QTextEdit()
text_edit.setReadOnly(True)
text_edit.setText("One or more of the selected files are not compatible with the database. Click 'OK' to add only the compatible files or 'Cancel' to back out:\n\n" + "\n".join(incompatible_files))
layout.addWidget(text_edit)

button_box = QHBoxLayout()
ok_button = QPushButton("OK")
cancel_button = QPushButton("Cancel")
button_box.addWidget(ok_button)
button_box.addWidget(cancel_button)
layout.addLayout(button_box)

dialog.setLayout(layout)

ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)

user_choice = dialog.exec()

if user_choice == QDialog.Rejected:
return

def see_documents_directory():
current_dir = Path(__file__).parent.resolve()
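
The routing rule added above can be summarized as a small standalone helper. The following is a minimal sketch for illustration only (route_file is a hypothetical name, not part of the commit); it returns the folder a given file would be symlinked into, or None when the extension is unsupported:

from pathlib import Path

IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'}
DOCUMENT_EXTENSIONS = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg',
                       '.csv', '.xls', '.xlsx', '.rtf', '.odt'}

def route_file(file_path: str, docs_folder: Path, images_folder: Path) -> Path | None:
    # Mirror the dialog's logic: images and documents are separated by extension.
    extension = Path(file_path).suffix.lower()
    if extension in IMAGE_EXTENSIONS:
        return images_folder
    if extension in DOCUMENT_EXTENSIONS:
        return docs_folder
    return None  # incompatible; the GUI collects these names for the warning dialog

For example, route_file("photo.JPG", Path("Docs_for_DB"), Path("Images_for_DB")) returns the images folder, matching the symlink target chosen in choose_documents_directory().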
9 changes: 0 additions & 9 deletions src/constants.py
@@ -17,15 +17,6 @@
},
'model': 'BAAI/bge-base-en-v1.5'
},
{
'details': {
'description': 'Well rounded & slight RAG improvement.',
'dimensions': 768,
'max_sequence': 512,
'size_mb': 438
},
'model': 'BAAI/llm-embedder'
},
{
'details': {
'description': 'Well rounded & customizable.',
5 changes: 2 additions & 3 deletions src/create_database.py
@@ -38,11 +38,10 @@ def main():
EMBEDDING_MODEL_NAME = config_data.get("EMBEDDING_MODEL_NAME")

my_cprint(f"Loading documents.", "white")
documents = load_documents(SOURCE_DIRECTORY) # First invocation of document_processor.py script
documents = load_documents(SOURCE_DIRECTORY) # invoke document_processor.py; returns a list of document objects
my_cprint(f"Successfully loaded documents.", "white")

texts = split_documents(documents) # Second invocation of document_processor.py script
my_cprint(f"Successfully split documents.", "white")
texts = split_documents(documents) # invoke document_processor.py again; returns a list of split document objects

embeddings = get_embeddings(EMBEDDING_MODEL_NAME, config_data)
my_cprint("Embedding model loaded.", "green")
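
For context, a minimal sketch (assuming it runs from src/ with document_processor.py importable and Docs_for_DB populated; not the commit's code) of how the two document_processor.py invocations compose before embedding:

from pathlib import Path
from document_processor import load_documents, split_documents

SOURCE_DIRECTORY = Path(__file__).parent / "Docs_for_DB"

documents = load_documents(SOURCE_DIRECTORY)  # one Document object per source file
texts = split_documents(documents)            # each Document split into chunk-sized Documents
print(f"{len(documents)} files -> {len(texts)} chunks")

The chunks in texts are what get passed to the embedding model; splitting first keeps each embedded passage within the model's max_sequence limit.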
87 changes: 58 additions & 29 deletions src/document_processor.py
@@ -20,19 +20,20 @@
)

from constants import DOCUMENT_LOADERS
from loader_vision_llava import llava_process_images
from loader_vision_cogvlm import cogvlm_process_images

ENABLE_PRINT = True
ROOT_DIRECTORY = Path(__file__).parent
SOURCE_DIRECTORY = ROOT_DIRECTORY / "Docs_for_DB"
INGEST_THREADS = os.cpu_count() or 8

def my_cprint(*args, **kwargs):
if ENABLE_PRINT:
filename = "document_processor.py"
modified_message = f"{filename}: {args[0]}"
cprint(modified_message, *args[1:], **kwargs)

ROOT_DIRECTORY = Path(__file__).parent
SOURCE_DIRECTORY = ROOT_DIRECTORY / "Docs_for_DB"
INGEST_THREADS = os.cpu_count() or 8

for ext, loader_name in DOCUMENT_LOADERS.items():
DOCUMENT_LOADERS[ext] = globals()[loader_name]

@@ -41,19 +42,27 @@ def my_cprint(*args, **kwargs):
UnstructuredODTLoader, UnstructuredMarkdownLoader,
UnstructuredExcelLoader, UnstructuredCSVLoader
)
from pathlib import Path
from langchain.docstore.document import Document
# Other necessary imports...

def process_images_wrapper(config):
chosen_model = config["vision"]["chosen_model"]
if chosen_model == 'llava' or chosen_model == 'bakllava':
return llava_process_images()
elif chosen_model == 'cogvlm':
return cogvlm_process_images()
else:
return []

def load_single_document(file_path: Path) -> Document:
file_extension = file_path.suffix.lower()
loader_class = DOCUMENT_LOADERS.get(file_extension)

if loader_class:
if file_extension == ".txt":
loader = loader_class(str(file_path), encoding='utf-8')
loader = loader_class(str(file_path), encoding='utf-8', autodetect_encoding=True)
elif file_extension == ".epub":
loader = UnstructuredEPubLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".docx":
loader = Docx2txtLoader(str(file_path))  # Docx2txtLoader takes only the file path
elif file_extension == ".rtf":
loader = UnstructuredRTFLoader(str(file_path), mode="single", strategy="fast")
elif file_extension == ".odt":
@@ -71,40 +80,55 @@ def load_single_document(file_path: Path) -> Document:

document = loader.load()[0]

'''
# Write the content to a .txt file
with open("output_load_single_document.txt", "w", encoding="utf-8") as output_file:
output_file.write(document.page_content)
'''
# with open("output_load_single_document.txt", "w", encoding="utf-8") as output_file:
# output_file.write(document.page_content)

# text extracted before metadata added
return document

def load_document_batch(filepaths):
with ThreadPoolExecutor(len(filepaths)) as exe:
futures = [exe.submit(load_single_document, name) for name in filepaths]
data_list = [future.result() for future in futures]
return (data_list, filepaths)
return (data_list, filepaths) # data_list = list of Document objects created by load_single_document

def load_documents(source_dir: Path) -> list[Document]:
all_files = list(source_dir.iterdir())
paths = [f for f in all_files if f.suffix in DOCUMENT_LOADERS.keys()]

n_workers = min(INGEST_THREADS, max(len(paths), 1))
my_cprint(f"Number of workers assigned: {n_workers}", "white")
chunksize = round(len(paths) / n_workers)

if chunksize == 0:
raise ValueError(f"chunksize must be a non-zero integer, but got {chunksize}. len(paths): {len(paths)}, n_workers: {n_workers}")

docs = []

if paths:
n_workers = min(INGEST_THREADS, max(len(paths), 1))
my_cprint(f"Number of workers assigned: {n_workers}", "white")
chunksize = round(len(paths) / n_workers)

if chunksize == 0:
raise ValueError(f"chunksize must be a non-zero integer, but got {chunksize}. len(paths): {len(paths)}, n_workers: {n_workers}")

with ProcessPoolExecutor(n_workers) as executor:
futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
for future in as_completed(futures):
contents, _ = future.result()
docs.extend(contents)
my_cprint(f"Number of NON-IMAGE files loaded: {len(docs)}", "yellow")

additional_docs = []

with ProcessPoolExecutor(n_workers) as executor:
futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
for future in as_completed(futures):
contents, _ = future.result()
docs.extend(contents)
my_cprint(f"Number of files loaded: {len(docs)}", "white")

return docs # end of first invocation by create_database.py
my_cprint(f"Loading images, if any.", "yellow")

with open("config.yaml", "r") as config_file:
config = yaml.safe_load(config_file)

# Use ProcessPoolExecutor to run the selected image processing function in a separate process
with ProcessPoolExecutor(1) as executor:
future = executor.submit(process_images_wrapper, config)
processed_docs = future.result() # Get the result from the future
additional_docs = processed_docs if processed_docs is not None else []

docs.extend(additional_docs) # Add to pre-existing list

return docs

def split_documents(documents):
my_cprint(f"Splitting documents.", "white")
@@ -131,3 +155,8 @@ def split_documents(documents):
my_cprint(f"Chunks between {lower_bound} and {upper_bound} characters: {count}", "white")

return texts

'''
# document object structure: Document(page_content="[ALL TEXT EXTRACTED]", metadata={'source': '[FULL FILE PATH WITH DOUBLE BACKSLASHES]'})
# list structure: [Document(page_content="...", metadata={'source': '...'}), Document(page_content="...", metadata={'source': '...'})]
'''
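
To make the comment above concrete, here is an illustrative construction of the same structures (hypothetical values, not from the commit; the import path matches the one used in this file):

from langchain.docstore.document import Document

doc = Document(
    page_content="all text extracted from one file",
    metadata={"source": "C:\\Users\\me\\Docs_for_DB\\report.pdf"},  # double backslashes on Windows paths
)
docs = [doc]  # load_documents() returns a list of such Document objects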