diff --git a/src/database_interactions.py b/src/database_interactions.py index 69761858..c0aee504 100644 --- a/src/database_interactions.py +++ b/src/database_interactions.py @@ -65,31 +65,21 @@ def initialize_vector_model(self, embedding_model_name, config_data): if key in EMBEDDING_MODEL_NAME: encode_kwargs['batch_size'] = value break - - my_cprint(f"Vector model initialized with a batch size of {encode_kwargs['batch_size']}", "blue") if "instructor" in embedding_model_name: encode_kwargs['show_progress_bar'] = True + print(f"Using embedding model path: {embedding_model_name}") - if "xl" in embedding_model_name: - model_version = "xl" - elif "base" in embedding_model_name: - model_version = "base" - else: - model_version = "large" - - model_name = f"hkunlp/instructor-{model_version}" - model = HuggingFaceInstructEmbeddings( - model_name=model_name, - cache_folder=embedding_model_name, + model_name=embedding_model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, ) + elif "bge" in embedding_model_name: query_instruction = config_data['embedding-models']['bge'].get('query_instruction') encode_kwargs['show_progress_bar'] = True - + model = HuggingFaceBgeEmbeddings( model_name=embedding_model_name, model_kwargs=model_kwargs, @@ -97,16 +87,6 @@ def initialize_vector_model(self, embedding_model_name, config_data): encode_kwargs=encode_kwargs ) - elif "nomic" in embedding_model_name: - model_kwargs['trust_remote_code'] = True - encode_kwargs['show_progress_bar'] = True - - model = HuggingFaceBgeEmbeddings( - model_name=embedding_model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs, - embed_instruction = "search_document:" - ) else: model = HuggingFaceEmbeddings( model_name=embedding_model_name, @@ -118,7 +98,7 @@ def initialize_vector_model(self, embedding_model_name, config_data): return model, encode_kwargs def create_database(self, texts, embeddings): - my_cprint("Creating vectors and database...\n\n NOTE:\n\nNOTE: The progress bar only relates to computing vectors, not inserting them into the database. Rest assured, after it reaches 100% it is still working unless you get an error message.\n", "yellow") + my_cprint("Creating vectors and database...\n\nNOTE: The progress bar only relates to computing vectors, not inserting them into the database. Rest assured, after it reaches 100% it is still working unless you get an error message.\n", "yellow") start_time = time.time() @@ -202,7 +182,7 @@ def run(self): json_docs_to_save = documents # load audio documents - audio_documents = self.load_audio_documents() # Now calling the method internally + audio_documents = self.load_audio_documents() documents.extend(audio_documents) if len(audio_documents) > 0: print(f"Loaded {len(audio_documents)} audio transcription(s)...") diff --git a/src/replace_sourcecode.py b/src/replace_sourcecode.py new file mode 100644 index 00000000..5aa34acd --- /dev/null +++ b/src/replace_sourcecode.py @@ -0,0 +1,75 @@ +import hashlib +from pathlib import Path +import shutil +import sys + +class DependencyUpdater: + def __init__(self): + self.site_packages_path = self.get_site_packages_path() + + def get_site_packages_path(self): + paths = sys.path + site_packages_paths = [Path(path) for path in paths if 'site-packages' in path.lower()] + return site_packages_paths[0] if site_packages_paths else None + + def find_dependency_path(self, dependency_path_segments): + current_path = self.site_packages_path + if current_path and current_path.exists(): + for segment in dependency_path_segments: + next_path = next((current_path / child for child in current_path.iterdir() if child.name.lower() == segment.lower()), None) + if next_path is None: + return None + current_path = next_path + return current_path + return None + + @staticmethod + def hash_file(filepath): + hasher = hashlib.sha256() + with open(filepath, 'rb') as afile: + buf = afile.read() + hasher.update(buf) + return hasher.hexdigest() + + @staticmethod + def copy_and_overwrite_if_necessary(source_path, target_path): + if not target_path.exists() or DependencyUpdater.hash_file(source_path) != DependencyUpdater.hash_file(target_path): + shutil.copy(source_path, target_path) + print(f"{source_path} has been successfully copied to {target_path}.") + else: + print(f"{target_path} is already up to date.") + + def update_file_in_dependency(self, source_folder, file_name, dependency_path_segments): + target_path = self.find_dependency_path(dependency_path_segments) + if target_path is None: + print("Target dependency path not found.") + return + + source_path = Path(__file__).parent / source_folder / file_name + if not source_path.exists(): + print(f"{file_name} not found in {source_folder}.") + return + + target_file = None + for child in target_path.iterdir(): + if child.is_file() and child.name.lower() == file_name.lower(): + target_file = child + break + + if target_file: + target_file_path = target_file + else: + target_file_path = target_path / file_name + + self.copy_and_overwrite_if_necessary(source_path, target_file_path) + +def replace_pdf_file(): + updater = DependencyUpdater() + updater.update_file_in_dependency("user_manual", "pdf.py", ["langchain_community", "document_loaders", "parsers"]) + +def replace_instructor_file(): + updater = DependencyUpdater() + updater.update_file_in_dependency("user_manual", "instructor.py", ["InstructorEmbedding"]) + +if __name__ == "__main__": + replace_pdf_file() diff --git a/src/requirements.txt b/src/requirements.txt index 539b289d..464ff81e 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -5,7 +5,7 @@ ctranslate2==4.1.0 tiledb==0.27.1 tiledb-vector-search==0.2.0 PyMuPDF==1.24 -# InstructorEmbedding==1.0.1 +InstructorEmbedding==1.0.1 sentence-transformers==2.6.1 pandas==2.2.1 docx2txt==0.8 @@ -29,7 +29,7 @@ einops==0.7.0 scipy==1.12.0 timm==0.9.16 sounddevice==0.4.6 -WhisperSpeech==0.8 +# WhisperSpeech==0.8 # required by whisperS2T platformdirs==4.2.0 diff --git a/src/server_connector.py b/src/server_connector.py index 7818504e..5a198c72 100644 --- a/src/server_connector.py +++ b/src/server_connector.py @@ -76,20 +76,9 @@ def initialize_vector_model(config): encode_kwargs = {'normalize_embeddings': False, 'batch_size': 1} if "instructor" in model_path: - if "xl" in model_path: - model_version = "xl" - elif "base" in model_path: - model_version = "base" - elif "large" in model_path: - model_version = "large" - else: - model_version = "default" - - model_name = f"hkunlp/instructor-{model_version}" return HuggingFaceInstructEmbeddings( - model_name=model_name, - cache_folder=model_path, + model_name=model_path, model_kwargs={"device": compute_device}, encode_kwargs=encode_kwargs, ) @@ -99,14 +88,6 @@ def initialize_vector_model(config): return HuggingFaceBgeEmbeddings(model_name=model_path, model_kwargs={"device": compute_device}, query_instruction=query_instruction, encode_kwargs=encode_kwargs) - elif "nomic" in model_path: - model = HuggingFaceBgeEmbeddings( - model_name=model_path, - model_kwargs={"device": compute_device, "trust_remote_code": True}, - encode_kwargs=encode_kwargs, - query_instruction = "search_query: Answer this question.", - ) - else: return HuggingFaceEmbeddings(model_name=model_path, model_kwargs={"device": compute_device}, encode_kwargs=encode_kwargs) diff --git a/src/setup.py b/src/setup.py index 9bce1e32..11d542a8 100644 --- a/src/setup.py +++ b/src/setup.py @@ -4,7 +4,7 @@ import tkinter as tk from tkinter import messagebox import constants as c -from replace_pdf import replace_pdf_file +from replace_sourcecode import replace_pdf_file, replace_instructor_file def tkinter_message_box(title, message, type="info", yes_no=False): root = tk.Tk() @@ -94,8 +94,9 @@ def setup_windows_installation(): os.system("python -m pip install --upgrade pip") install_pytorch(cuda_version_num, proceed) os.system("pip3 install -r requirements.txt") - os.system("pip install git+https://github.com/SilasMarvin/instructor-embedding.git@silas-update-for-newer-sentence-transformers") + #os.system("pip install git+https://github.com/SilasMarvin/instructor-embedding.git@silas-update-for-newer-sentence-transformers") os.system("pip3 install --no-deps -U git+https://github.com/shashikg/WhisperS2T.git") + os.system("pip3 install git+https://github.com/collabora/WhisperSpeech.git") os.system("pip3 install bitsandbytes") major, minor = map(int, sys.version.split()[0].split('.')[:2]) @@ -104,5 +105,6 @@ def setup_windows_installation(): os.system("pip3 install nvidia-ml-py==12.535.133") replace_pdf_file() + replace_instructor_file() setup_windows_installation() diff --git a/src/tts_module.py b/src/tts_module.py index 880c0fff..8a6a0181 100644 --- a/src/tts_module.py +++ b/src/tts_module.py @@ -177,15 +177,7 @@ def __init__(self): def initialize_model(self): s2a_ref = 'collabora/whisperspeech:s2a-q4-base-en+pl.model' - # s2a_ref = 'collabora/whisperspeech:s2a-q4-hq-fast-en+pl.model' # only works with WhisperSpeech repo code - # s2a_ref = 'collabora/whisperspeech:s2a-q4-small-en+pl.model' - # s2a_ref = 'collabora/whisperspeech:s2a-q4-tiny-en+pl.model' - - # t2s_ref = 'collabora/whisperspeech:t2s-base-en+pl.model' - # t2s_ref = 'collabora/whisperspeech:t2s-fast-medium-en+pl+yt.model' - # t2s_ref = 'collabora/whisperspeech:t2s-hq-fast-en+pl.model' - # t2s_ref = 'collabora/whisperspeech:t2s-small-en+pl.model' - t2s_ref = 'collabora/whisperspeech:t2s-tiny-en+pl.model' + t2s_ref = 'collabora/whisperspeech:t2s-base-en+pl.model' self.pipe = Pipeline(s2a_ref=s2a_ref, t2s_ref=t2s_ref) my_cprint(f"Using {s2a_ref} s2a model and {t2s_ref} t2s model.", "green") @@ -284,7 +276,6 @@ def release_resources(self): if torch.cuda.is_available(): torch.cuda.empty_cache() - # Force a garbage collection gc.collect() my_cprint("WhisperSpeech model removed from memory.", "red")