diff --git a/src/database_interactions.py b/src/database_interactions.py index 1be2749d..41630c03 100644 --- a/src/database_interactions.py +++ b/src/database_interactions.py @@ -254,6 +254,8 @@ def run(self): if len(audio_documents) > 0: print(f"Loaded {len(audio_documents)} audio transcription(s)...") + texts = [] # list created to hold split documents + # split documents if isinstance(documents, list) and documents: texts = split_documents(documents) diff --git a/src/document_processor.py b/src/document_processor.py index 0f4d520e..c22e35cb 100644 --- a/src/document_processor.py +++ b/src/document_processor.py @@ -122,29 +122,29 @@ def split_documents(documents): chunk_overlap = config["database"]["chunk_overlap"] text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) - print(f"Text splitter type: {type(text_splitter)}, content: {text_splitter.__dict__}") + # print(f"Text splitter type: {type(text_splitter)}, content: {text_splitter.__dict__}") # Summarize documents before conversion - type_count = defaultdict(int) - exceptions = [] - for i, doc in enumerate(documents): - doc_type = type(doc).__name__ - content_type = type(doc.page_content).__name__ - type_key = f"{doc_type}, content type: {content_type}" - type_count[type_key] += 1 + # type_count = defaultdict(int) + # exceptions = [] + # for i, doc in enumerate(documents): + # doc_type = type(doc).__name__ + # content_type = type(doc.page_content).__name__ + # type_key = f"{doc_type}, content type: {content_type}" + # type_count[type_key] += 1 - if content_type != 'str': - exceptions.append(f"Document {i} has unexpected content type: {content_type}") + # if content_type != 'str': + # exceptions.append(f"Document {i} has unexpected content type: {content_type}") - print("Document summary before conversion:") - print(f"Total documents: {len(documents)}") - for type_key, count in type_count.items(): - print(f"{count} documents of type: {type_key}") + # print("Document 
summary before conversion:") + # print(f"Total documents: {len(documents)}") + # for type_key, count in type_count.items(): + # print(f"{count} documents of type: {type_key}") - if exceptions: - print("\nExceptions found:") - for exception in exceptions: - print(exception) + # if exceptions: + # print("\nExceptions found:") + # for exception in exceptions: + # print(exception) # Convert "page content" within each document object to a string if it isn't already for i, doc in enumerate(documents): @@ -153,26 +153,26 @@ def split_documents(documents): documents[i].page_content = str(doc.page_content) # Summarize documents after conversion - type_count.clear() - exceptions.clear() - for i, doc in enumerate(documents): - doc_type = type(doc).__name__ - content_type = type(doc.page_content).__name__ - type_key = f"{doc_type}, content type: {content_type}" - type_count[type_key] += 1 + # type_count.clear() + # exceptions.clear() + # for i, doc in enumerate(documents): + # doc_type = type(doc).__name__ + # content_type = type(doc.page_content).__name__ + # type_key = f"{doc_type}, content type: {content_type}" + # type_count[type_key] += 1 - if content_type != 'str': - exceptions.append(f"Document {i} has unexpected content type: {content_type}") + # if content_type != 'str': + # exceptions.append(f"Document {i} has unexpected content type: {content_type}") - print("\nDocument summary after conversion:") - print(f"Total documents: {len(documents)}") - for type_key, count in type_count.items(): - print(f"{count} documents of type: {type_key}") + # print("\nDocument summary after conversion:") + # print(f"Total documents: {len(documents)}") + # for type_key, count in type_count.items(): + # print(f"{count} documents of type: {type_key}") - if exceptions: - print("\nExceptions found:") - for exception in exceptions: - print(exception) + # if exceptions: + # print("\nExceptions found:") + # for exception in exceptions: + # print(exception) try: print(f"\nSplitting 
{len(documents)} documents.") diff --git a/src/gui_tabs_settings_database_create.py b/src/gui_tabs_settings_database_create.py index 10af9b55..18e25620 100644 --- a/src/gui_tabs_settings_database_create.py +++ b/src/gui_tabs_settings_database_create.py @@ -6,17 +6,15 @@ class ChunkSettingsTab(QWidget): def __init__(self): super(ChunkSettingsTab, self).__init__() - with open('config.yaml', 'r', encoding='utf-8') as f: config_data = yaml.safe_load(f) self.database_config = config_data['database'] self.compute_device_options = config_data['Compute_Device']['available'] self.database_creation_device = config_data['Compute_Device']['database_creation'] - grid_layout = QGridLayout() - + # Device selection and current setting - self.device_label = QLabel("Create Device:") + self.device_label = QLabel("Device:") grid_layout.addWidget(self.device_label, 0, 0) self.device_combo = QComboBox() self.device_combo.addItems(self.compute_device_options) @@ -26,29 +24,29 @@ def __init__(self): grid_layout.addWidget(self.device_combo, 0, 2) self.current_device_label = QLabel(f"{self.database_creation_device}") grid_layout.addWidget(self.current_device_label, 0, 1) - - # Chunk overlap and current setting - self.chunk_overlap_label = QLabel("Chunk Overlap:") - grid_layout.addWidget(self.chunk_overlap_label, 0, 3) - self.chunk_overlap_edit = QLineEdit() - self.chunk_overlap_edit.setPlaceholderText("Enter new chunk_overlap...") - self.chunk_overlap_edit.setValidator(QIntValidator()) - grid_layout.addWidget(self.chunk_overlap_edit, 0, 5) - current_overlap = self.database_config.get('chunk_overlap', '') - self.current_overlap_label = QLabel(f"{current_overlap}") - grid_layout.addWidget(self.current_overlap_label, 0, 4) - - # Chunk size and current setting - self.chunk_size_label = QLabel("Chunk Size:") - grid_layout.addWidget(self.chunk_size_label, 0, 6) + + # Chunk size and current setting (moved to the left) + self.chunk_size_label = QLabel("Chunk Size (# characters):") + 
grid_layout.addWidget(self.chunk_size_label, 0, 3) self.chunk_size_edit = QLineEdit() self.chunk_size_edit.setPlaceholderText("Enter new chunk_size...") self.chunk_size_edit.setValidator(QIntValidator()) - grid_layout.addWidget(self.chunk_size_edit, 0, 8) + grid_layout.addWidget(self.chunk_size_edit, 0, 5) current_size = self.database_config.get('chunk_size', '') self.current_size_label = QLabel(f"{current_size}") - grid_layout.addWidget(self.current_size_label, 0, 7) - + grid_layout.addWidget(self.current_size_label, 0, 4) + + # Chunk overlap and current setting (moved to the right) + self.chunk_overlap_label = QLabel("Overlap (# characters):") + grid_layout.addWidget(self.chunk_overlap_label, 0, 6) + self.chunk_overlap_edit = QLineEdit() + self.chunk_overlap_edit.setPlaceholderText("Enter new chunk_overlap...") + self.chunk_overlap_edit.setValidator(QIntValidator()) + grid_layout.addWidget(self.chunk_overlap_edit, 0, 8) + current_overlap = self.database_config.get('chunk_overlap', '') + self.current_overlap_label = QLabel(f"{current_overlap}") + grid_layout.addWidget(self.current_overlap_label, 0, 7) + self.setLayout(grid_layout) def update_config(self): diff --git a/src/gui_tabs_settings_database_query.py b/src/gui_tabs_settings_database_query.py index e5d2c780..9d521b31 100644 --- a/src/gui_tabs_settings_database_query.py +++ b/src/gui_tabs_settings_database_query.py @@ -19,7 +19,7 @@ def __init__(self): self.field_data = {} self.label_data = {} - self.query_device_label = QLabel(f"Query Device: {self.database_query_device}") + self.query_device_label = QLabel(f"Device: {self.database_query_device}") self.query_device_combo = QComboBox() self.query_device_combo.addItems(self.compute_device_options) if self.database_query_device in self.compute_device_options: