Skip to content

Commit

Permalink
v2.6.1
Browse files Browse the repository at this point in the history
  • Loading branch information
BBC-Esq authored Nov 9, 2023
1 parent 74f9c59 commit 60b9722
Show file tree
Hide file tree
Showing 15 changed files with 676 additions and 51 deletions.
27 changes: 27 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os

# Global settings and variables
# Global settings and variables
ENABLE_PRINT = False  # master switch for diagnostic console output
GLOBAL_VAR = True     # NOTE(review): purpose not evident from this file — confirm where it is read

# Directories
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))  # folder containing this file
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"  # where documents staged for ingestion live

# Ingest settings
INGEST_THREADS = os.cpu_count() or 8  # worker count; falls back to 8 when cpu_count() returns None

# Document mappings
# Maps a file extension to the loader class name (by convention, a LangChain
# document loader) used to ingest files of that type.
DOCUMENT_MAP = {
    ".pdf": "PDFMinerLoader",
    ".docx": "Docx2txtLoader",
    ".txt": "TextLoader",
    ".json": "JSONLoader",
    ".enex": "EverNoteLoader",
    ".eml": "UnstructuredEmailLoader",
    ".msg": "UnstructuredEmailLoader",
    ".csv": "UnstructuredCSVLoader",
    ".xls": "UnstructuredExcelLoader",
    ".xlsx": "UnstructuredExcelLoader",
    ".rtf": "UnstructuredRTFLoader",
}
31 changes: 22 additions & 9 deletions src/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,19 @@ AVAILABLE_MODELS:
- jinaai/jina-embedding-b-en-v1
- jinaai/jina-embedding-s-en-v1
- jinaai/jina-embedding-t-en-v1
- jinaai/jina-embeddings-v2-base-en
- jinaai/jina-embeddings-v2-small-en
COMPUTE_DEVICE: cpu
EMBEDDING_MODEL_NAME:
chunk_overlap: 200
chunk_size: 600
COMPUTE_DEVICE: cuda
Compute_Device:
available:
- cuda
- cpu
database_creation: cuda
database_query: cpu
EMBEDDING_MODEL_NAME: C:/PATH/Scripts/ChromaDB-Plugin-for-LM-Studio/v2_6 - working/Embedding_Models/sentence-transformers--gtr-t5-base
database:
contexts: 15
chunk_overlap: 150
chunk_size: 500
contexts: 10
device: null
similarity: 0.9
embedding-models:
bge:
Expand All @@ -41,7 +46,7 @@ embedding-models:
server:
api_key: ''
connection_str: http://localhost:1234/v1
model_max_tokens: 512
model_max_tokens: -1
model_temperature: 0.1
prefix: '[INST]'
suffix: '[/INST]'
Expand All @@ -51,7 +56,15 @@ styles:
frame: 'background-color: #161b22;'
input: 'background-color: #2e333b; color: light gray; font: 13pt "Segoe UI Historic";'
text: 'background-color: #092327; color: light gray; font: 12pt "Segoe UI Historic";'
transcriber:
transcribe_file:
device: cpu
file: C:/PATH/Scripts/ChromaDB-Plugin-for-LM-Studio/v2_6 - working/test.mp3
language: Option 1
model: base.en
quant: int8
timestamps: true
translate: false
transcriber:
device: cuda
model: base.en
quant: float32
152 changes: 152 additions & 0 deletions src/create_database - backup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import logging
import os
import shutil
import yaml
import gc
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from chromadb.config import Settings
from document_processor import load_documents, split_documents
import torch
from utilities import validate_symbolic_links
from termcolor import cprint
# from memory_profiler import profile

# Module-level diagnostic toggles.
ENABLE_PRINT = True        # gate for my_cprint status messages
ENABLE_CUDA_PRINT = False  # gate for CUDA memory statistics in print_cuda_memory

# NOTE(review): runs at import time; torch.cuda calls can fail on CUDA-less
# hosts — confirm this module is only imported where a GPU is available.
torch.cuda.reset_peak_memory_stats()

def my_cprint(*args, **kwargs):
    """Print a colorized status message prefixed with this script's filename.

    Thin wrapper over termcolor.cprint: args[0] is the message text and any
    remaining positional/keyword arguments (e.g. the color name) are passed
    through unchanged. Does nothing when ENABLE_PRINT is False.
    """
    if ENABLE_PRINT:
        filename = "create_database.py"
        # Bug fix: the filename variable was assigned but never used — the
        # message was prefixed with a hard-coded placeholder instead.
        modified_message = f"{filename}: {args[0]}"
        cprint(modified_message, *args[1:], **kwargs)

def print_cuda_memory():
    """Log peak, currently-allocated, and reserved CUDA memory via my_cprint.

    No-op unless the module-level ENABLE_CUDA_PRINT flag is True.
    """
    # Guard clause: skip all torch.cuda queries when reporting is disabled.
    if not ENABLE_CUDA_PRINT:
        return

    max_allocated_memory = torch.cuda.max_memory_allocated()
    memory_allocated = torch.cuda.memory_allocated()
    reserved_memory = torch.cuda.memory_reserved()

    # Values are reported in MiB (bytes / 1024**2).
    my_cprint(f"Max CUDA memory allocated: {max_allocated_memory / (1024**2):.2f} MB", "green")
    my_cprint(f"Total CUDA memory allocated: {memory_allocated / (1024**2):.2f} MB", "yellow")
    my_cprint(f"Total CUDA memory reserved: {reserved_memory / (1024**2):.2f} MB", "yellow")

# Report baseline CUDA memory at import time (no-op while ENABLE_CUDA_PRINT is False).
print_cuda_memory()

# Directory layout: documents are read from Docs_for_DB and the vector store
# is written to Vector_DB, both relative to this file's location.
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/Vector_DB"
# Worker count for document ingestion; falls back to 8 when cpu_count() returns None.
INGEST_THREADS = os.cpu_count() or 8

# Chroma client configuration: DuckDB+Parquet backend persisted on disk,
# with anonymized telemetry disabled.
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)

# @profile
# @profile
def main():
    """Build and persist a Chroma vector database from SOURCE_DIRECTORY.

    Reads the embedding model name from config.yaml, loads and splits the
    source documents, embeds them, writes the database to PERSIST_DIRECTORY
    (replacing any existing one), then tears the embedding model down to
    release GPU memory.
    """
    print_cuda_memory()

    # Pull run configuration (model path, device choices, etc.) from config.yaml.
    with open(os.path.join(ROOT_DIRECTORY, "config.yaml"), 'r') as stream:
        config_data = yaml.safe_load(stream)

    EMBEDDING_MODEL_NAME = config_data.get("EMBEDDING_MODEL_NAME")

    # calls document_processor.py
    my_cprint(f"Loading documents.", "cyan")
    documents = load_documents(SOURCE_DIRECTORY)
    my_cprint(f"Successfully loaded documents.", "cyan")

    # calls document_processor.py — chunk documents for embedding
    texts = split_documents(documents)
    print_cuda_memory()

    # calls get_embeddings function (below) to instantiate the embedding model
    embeddings = get_embeddings(EMBEDDING_MODEL_NAME, config_data)
    my_cprint("Embedding model loaded.", "green")
    print_cuda_memory()

    # Rebuild the persistence directory from scratch so stale vectors never linger.
    if os.path.exists(PERSIST_DIRECTORY):
        shutil.rmtree(PERSIST_DIRECTORY)
    os.makedirs(PERSIST_DIRECTORY)

    my_cprint("Creating database.", "cyan")

    db = Chroma.from_documents(
        texts, embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    print_cuda_memory()

    # persist database
    my_cprint("Persisting database.", "cyan")
    db.persist()
    my_cprint("Database persisted.", "cyan")
    print_cuda_memory()

    # Teardown sequence: drop the underlying model client, then the wrapper,
    # then empty the CUDA cache and force a GC pass so GPU memory is returned.
    del embeddings.client
    # my_cprint("Deleted embeddings.client.", "red")
    print_cuda_memory()

    del embeddings
    # my_cprint("Deleted embeddings variable.", "red")
    print_cuda_memory()

    torch.cuda.empty_cache()
    # my_cprint("CUDA cache emptied.", "red")
    print_cuda_memory()

    gc.collect()
    my_cprint("Embedding model removed from memory.", "red")
    print_cuda_memory()

    # print(torch.cuda.memory_summary())

# @profile
# @profile
def get_embeddings(EMBEDDING_MODEL_NAME, config_data, normalize_embeddings=False):
    """Instantiate the embedding model matching EMBEDDING_MODEL_NAME.

    Dispatches on substrings of the model name: "instructor" selects
    HuggingFaceInstructEmbeddings, "bge" selects HuggingFaceBgeEmbeddings,
    and anything else falls back to plain HuggingFaceEmbeddings. The compute
    device comes from config_data['Compute_Device']['database_creation'].
    """
    my_cprint("Creating embeddings.", "cyan")
    print_cuda_memory()

    compute_device = config_data['Compute_Device']['database_creation']

    # kwargs shared by every embedding-class constructor below
    model_kwargs = {"device": compute_device}
    encode_kwargs = {"normalize_embeddings": normalize_embeddings}

    if "instructor" in EMBEDDING_MODEL_NAME:
        instructor_cfg = config_data['embedding-models']['instructor']
        return HuggingFaceInstructEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
            embed_instruction=instructor_cfg.get('embed_instruction'),
            query_instruction=instructor_cfg.get('query_instruction'),
        )

    if "bge" in EMBEDDING_MODEL_NAME:
        bge_cfg = config_data['embedding-models']['bge']
        return HuggingFaceBgeEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            model_kwargs=model_kwargs,
            query_instruction=bge_cfg.get('query_instruction'),
            encode_kwargs=encode_kwargs,
        )

    # default: generic sentence-transformer style model
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

if __name__ == "__main__":
    # Configure root logging with timestamps and source locations, then run the build.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
        level=logging.INFO
    )
    main()
5 changes: 2 additions & 3 deletions src/create_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
ENABLE_PRINT = True
ENABLE_CUDA_PRINT = False

torch.cuda.reset_peak_memory_stats()
# torch.cuda.reset_peak_memory_stats()

def my_cprint(*args, **kwargs):
if ENABLE_PRINT:
Expand Down Expand Up @@ -96,7 +96,6 @@ def main():
gc.collect()
my_cprint("Embedding model removed from memory.", "red")
print_cuda_memory()


# @profile
def get_embeddings(EMBEDDING_MODEL_NAME, config_data, normalize_embeddings=False):
Expand Down Expand Up @@ -140,4 +139,4 @@ def get_embeddings(EMBEDDING_MODEL_NAME, config_data, normalize_embeddings=False
format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
level=logging.INFO
)
main()
main()
39 changes: 21 additions & 18 deletions src/gui.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from PySide6.QtWidgets import (
QApplication, QWidget, QPushButton, QVBoxLayout, QTabWidget,
QTextEdit, QSplitter, QFrame, QStyleFactory, QLabel, QHBoxLayout
QTextEdit, QSplitter, QFrame, QStyleFactory, QLabel, QGridLayout
)
from PySide6.QtCore import Qt, QThread, Signal, QUrl
from PySide6.QtWebEngineWidgets import QWebEngineView
Expand Down Expand Up @@ -37,34 +37,35 @@ def init_ui(self):
self.setGeometry(300, 300, 975, 975)
self.setMinimumSize(550, 610)

# Left panel setup with grid layout
self.left_frame = QFrame()
left_vbox = QVBoxLayout()
grid_layout = QGridLayout()

# Tab widget spanning two columns
tab_widget = create_tabs()
left_vbox.addWidget(tab_widget)

grid_layout.addWidget(tab_widget, 0, 0, 1, 2) # Span two columns

# Button definitions and positions in the grid
button_data = [
("Download Embedding Model", lambda: download_embedding_model(self)),
("Set Embedding Model Directory", select_embedding_model_directory),
("Choose Documents for Database", choose_documents_directory),
("Create Vector Database", self.on_create_button_clicked)
]

# Create two rows of buttons
for i in range(0, len(button_data), 2):
button_row = QHBoxLayout()
for j in range(2):
if i + j < len(button_data):
text, handler = button_data[i+j]
button = QPushButton(text)
button.setStyleSheet(styles.get('button', ''))
button.clicked.connect(handler)
button_row.addWidget(button)
left_vbox.addLayout(button_row)

self.left_frame.setLayout(left_vbox)
button_positions = [(1, 0), (1, 1), (2, 0), (2, 1)]

# Create and add buttons to the grid layout
for position, (text, handler) in zip(button_positions, button_data):
button = QPushButton(text)
button.setStyleSheet(styles.get('button', ''))
button.clicked.connect(handler)
grid_layout.addWidget(button, *position)

self.left_frame.setLayout(grid_layout)
self.left_frame.setStyleSheet(styles.get('frame', ''))
main_splitter.addWidget(self.left_frame)

# Right panel setup
right_frame = QFrame()
right_vbox = QVBoxLayout()

Expand All @@ -84,8 +85,10 @@ def init_ui(self):

right_vbox.addWidget(submit_questions_button)

# Define widget containing buttons
button_row_widget = create_button_row(self.on_submit_button_clicked, styles.get('button', ''))

# Add widgets from button_module.py
right_vbox.addWidget(button_row_widget)

right_frame.setLayout(right_vbox)
Expand Down
Loading

0 comments on commit 60b9722

Please sign in to comment.