-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
676 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import os

# Global settings and variables
ENABLE_PRINT = False  # presumably gates diagnostic printing in consumers — confirm
GLOBAL_VAR = True

# Directories — resolved relative to this file so the current working
# directory does not matter.
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"

# Ingest settings — worker count falls back to 8 when cpu_count() is None
# (it can return None on some platforms).
INGEST_THREADS = os.cpu_count() or 8

# Document mappings: file extension -> loader class name used for ingestion.
DOCUMENT_MAP = {
    ".pdf": "PDFMinerLoader",
    ".docx": "Docx2txtLoader",
    ".txt": "TextLoader",
    ".json": "JSONLoader",
    ".enex": "EverNoteLoader",
    ".eml": "UnstructuredEmailLoader",
    ".msg": "UnstructuredEmailLoader",
    ".csv": "UnstructuredCSVLoader",
    ".xls": "UnstructuredExcelLoader",
    ".xlsx": "UnstructuredExcelLoader",
    ".rtf": "UnstructuredRTFLoader",
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
import logging | ||
import os | ||
import shutil | ||
import yaml | ||
import gc | ||
from langchain.docstore.document import Document | ||
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings | ||
from langchain.vectorstores import Chroma | ||
from chromadb.config import Settings | ||
from document_processor import load_documents, split_documents | ||
import torch | ||
from utilities import validate_symbolic_links | ||
from termcolor import cprint | ||
# from memory_profiler import profile | ||
|
||
# Diagnostic toggles: ENABLE_PRINT gates my_cprint output,
# ENABLE_CUDA_PRINT gates print_cuda_memory reports.
ENABLE_PRINT = True
ENABLE_CUDA_PRINT = False

# Reset the peak-memory counter so later reports reflect this run only.
# Guarded so the module can still be imported on CPU-only machines, where
# the unconditional call would raise at import time.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
|
||
def my_cprint(*args, **kwargs):
    """Print a colored message via termcolor.cprint when ENABLE_PRINT is on.

    args[0] is the message text; remaining positional args (e.g. the color
    name) and all kwargs are forwarded to cprint unchanged. The message is
    prefixed with this script's filename for log provenance.
    """
    if ENABLE_PRINT:
        filename = "create_database.py"
        # Bug fix: the prefix previously hard-coded the literal "(unknown)"
        # and left `filename` unused — use the filename as intended.
        modified_message = f"{filename}: {args[0]}"
        cprint(modified_message, *args[1:], **kwargs)
|
||
def print_cuda_memory():
    """Report CUDA peak/allocated/reserved memory in MB via my_cprint.

    No-op unless the module-level ENABLE_CUDA_PRINT flag is set.
    """
    if not ENABLE_CUDA_PRINT:
        return

    megabyte = 1024 ** 2
    reports = (
        (f"Max CUDA memory allocated: {torch.cuda.max_memory_allocated() / megabyte:.2f} MB", "green"),
        (f"Total CUDA memory allocated: {torch.cuda.memory_allocated() / megabyte:.2f} MB", "yellow"),
        (f"Total CUDA memory reserved: {torch.cuda.memory_reserved() / megabyte:.2f} MB", "yellow"),
    )
    for message, color in reports:
        my_cprint(message, color)
|
||
print_cuda_memory() | ||
|
||
# Paths resolved relative to this script's own directory, so the tool works
# regardless of the current working directory.
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"  # input documents to ingest
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/Vector_DB"   # Chroma database output
INGEST_THREADS = os.cpu_count() or 8  # fallback when cpu_count() returns None

# Chroma client configuration: local duckdb+parquet persistence with
# anonymized telemetry disabled.
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)
|
||
# @profile | ||
def main():
    """Build the Chroma vector database from documents in SOURCE_DIRECTORY.

    Reads config.yaml (next to this script) for the embedding model name,
    loads and splits the source documents, embeds them, writes a fresh
    database to PERSIST_DIRECTORY (any previous one is deleted), then
    releases the embedding model and CUDA memory.
    """
    print_cuda_memory()

    # Read the embedding configuration shipped next to this script.
    with open(os.path.join(ROOT_DIRECTORY, "config.yaml"), 'r', encoding="utf-8") as stream:
        config_data = yaml.safe_load(stream)

    EMBEDDING_MODEL_NAME = config_data.get("EMBEDDING_MODEL_NAME")

    # calls document_processor.py
    my_cprint("Loading documents.", "cyan")
    documents = load_documents(SOURCE_DIRECTORY)
    my_cprint("Successfully loaded documents.", "cyan")

    # calls document_processor.py
    texts = split_documents(documents)
    print_cuda_memory()

    embeddings = get_embeddings(EMBEDDING_MODEL_NAME, config_data)
    my_cprint("Embedding model loaded.", "green")
    print_cuda_memory()

    # Rebuild the database from scratch on every run.
    if os.path.exists(PERSIST_DIRECTORY):
        shutil.rmtree(PERSIST_DIRECTORY)
    os.makedirs(PERSIST_DIRECTORY)

    my_cprint("Creating database.", "cyan")
    db = Chroma.from_documents(
        texts, embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    print_cuda_memory()

    # Persist the database to disk.
    my_cprint("Persisting database.", "cyan")
    db.persist()
    my_cprint("Database persisted.", "cyan")
    print_cuda_memory()

    # Free the embedding model: drop the underlying client, then the
    # wrapper, release cached CUDA blocks, and force a GC pass. The staged
    # print_cuda_memory() calls show where memory is actually returned.
    del embeddings.client
    print_cuda_memory()

    del embeddings
    print_cuda_memory()

    torch.cuda.empty_cache()
    print_cuda_memory()

    gc.collect()
    my_cprint("Embedding model removed from memory.", "red")
    print_cuda_memory()
|
||
# @profile | ||
def get_embeddings(EMBEDDING_MODEL_NAME, config_data, normalize_embeddings=False):
    """Instantiate the embedding wrapper for EMBEDDING_MODEL_NAME.

    The wrapper class is picked by substring of the model name:
    "instructor" -> HuggingFaceInstructEmbeddings, "bge" ->
    HuggingFaceBgeEmbeddings, anything else -> HuggingFaceEmbeddings.
    The compute device is read from
    config_data['Compute_Device']['database_creation'].
    """
    my_cprint("Creating embeddings.", "cyan")
    print_cuda_memory()

    device = config_data['Compute_Device']['database_creation']
    # Keyword arguments shared by all three wrapper classes.
    shared_kwargs = {
        "model_name": EMBEDDING_MODEL_NAME,
        "model_kwargs": {"device": device},
        "encode_kwargs": {"normalize_embeddings": normalize_embeddings},
    }

    if "instructor" in EMBEDDING_MODEL_NAME:
        instructor_cfg = config_data['embedding-models']['instructor']
        return HuggingFaceInstructEmbeddings(
            embed_instruction=instructor_cfg.get('embed_instruction'),
            query_instruction=instructor_cfg.get('query_instruction'),
            **shared_kwargs,
        )

    if "bge" in EMBEDDING_MODEL_NAME:
        bge_cfg = config_data['embedding-models']['bge']
        return HuggingFaceBgeEmbeddings(
            query_instruction=bge_cfg.get('query_instruction'),
            **shared_kwargs,
        )

    return HuggingFaceEmbeddings(**shared_kwargs)
|
||
if __name__ == "__main__":
    # Configure root logging before the build so log records carry
    # timestamp, level, and file:line provenance.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
        level=logging.INFO
    )
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.