Commit

v6.8.0
BBC-Esq authored Sep 5, 2024
1 parent 6169b45 commit d4528d6
Showing 5 changed files with 26 additions and 41 deletions.
16 changes: 13 additions & 3 deletions src/extract_metadata.py
@@ -49,9 +49,19 @@ def extract_audio_metadata(file_path):

def add_pymupdf_page_metadata(doc: Document, chunk_size: int = 1200, chunk_overlap: int = 600) -> List[Document]:
"""
Splits and adds page metadata to each chunk of a pdf document. Relies on the custom implementation of pymupdfparser
Called by document_processor.py.
"""
Called by document_processor.py. Chunks the body of text returned by the custom pymupdfparser script.
Uses a helper method named `split_text` to assign the appropriate page metadata to each chunk.
Detailed Process:
1. The method first identifies the positions of the custom page markers within the text using a regular expression.
These markers denote the start of a new page (e.g., `[[page1]]`).
2. The text is then cleaned by removing the page markers, resulting in a continuous block of text.
3. The cleaned text is split into chunks based on the specified `chunk_size`. If the chunk size exceeds the
remaining length of the text, the last chunk is adjusted to include the remaining text.
4. For each chunk, the method determines the appropriate page number by finding the nearest preceding page
marker position.
5. The method returns a list of tuples where each tuple contains a chunk of text and the page number associated with that chunk.
"""
def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str, int]]:
page_markers = [(m.start(), int(m.group(1))) for m in re.finditer(r'\[\[page(\d+)\]\]', text)]
clean_text = re.sub(r'\[\[page\d+\]\]', '', text)
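To make the docstring's five steps concrete, here is a minimal, self-contained sketch of how a `split_text` helper along these lines could work, written as a standalone function rather than the nested helper the repository uses. It assumes the `[[pageN]]` marker format shown in the diff, a default of page 1 when no marker precedes a chunk, and that `chunk_overlap` steps the window back between chunks; it is an illustration, not the repository's exact implementation.

```python
import re
from typing import List, Tuple

def split_text(text: str, chunk_size: int = 1200, chunk_overlap: int = 600) -> List[Tuple[str, int]]:
    # 1. Record each [[pageN]] marker, adjusting its offset to the cleaned text
    #    (the text with all markers removed).
    page_markers = []
    removed = 0
    for m in re.finditer(r'\[\[page(\d+)\]\]', text):
        page_markers.append((m.start() - removed, int(m.group(1))))
        removed += len(m.group(0))

    # 2. Strip the markers to get a continuous block of text.
    clean_text = re.sub(r'\[\[page\d+\]\]', '', text)

    chunks = []
    start = 0
    while start < len(clean_text):
        # 3. Take up to chunk_size characters; the final chunk takes whatever remains.
        end = min(start + chunk_size, len(clean_text))

        # 4. The chunk's page is the nearest marker at or before its start (default page 1).
        page = 1
        for pos, num in page_markers:
            if pos <= start:
                page = num
            else:
                break

        # 5. Collect (chunk, page number) pairs.
        chunks.append((clean_text[start:end], page))
        if end == len(clean_text):
            break
        start = max(end - chunk_overlap, start + 1)  # overlap between chunks, but always advance

    return chunks
```

The enclosing `add_pymupdf_page_metadata` presumably then wraps each `(chunk, page)` pair in a `Document` with the page number attached as metadata before returning the list.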
2 changes: 1 addition & 1 deletion src/gui_tabs_settings_vision.py
@@ -79,7 +79,7 @@ def populate_model_combobox(self):
else:
available_models.append(model)
else:
available_models.append(model) # Add non-CUDA models even if CUDA is available
available_models.append(model)
else:
if not requires_cuda:
available_models.append(model)
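For context, the branches shown here sit inside logic that filters which vision models are offered in the combobox; the commit only drops the inline comment. A simplified reconstruction of the filtering pattern follows. The condition names (`requires_cuda`, the CUDA check via `torch.cuda.is_available()`) and the overall loop structure are assumptions based on the visible lines, since the hunk shows only the inner branches.

```python
import torch  # assumption: CUDA availability is detected via torch

def filter_available_models(vision_models: dict) -> list:
    # Hypothetical helper mirroring the visible branches: offer every model when CUDA
    # is present; otherwise keep only models that do not require CUDA.
    cuda_available = torch.cuda.is_available()
    available_models = []
    for model, info in vision_models.items():
        requires_cuda = info.get('requires_cuda', False)
        if cuda_available:
            available_models.append(model)  # non-CUDA models are added even if CUDA is available
        elif not requires_cuda:
            available_models.append(model)
    return available_models
```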
21 changes: 0 additions & 21 deletions src/module_process_images.py
@@ -26,20 +26,6 @@

set_logging_level()

# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")

# datasets_logger = logging.getLogger('datasets')
# datasets_logger.setLevel(logging.WARNING)
# logging.getLogger("transformers").setLevel(logging.CRITICAL)
# logging.getLogger("transformers").setLevel(logging.ERROR)
# logging.getLogger("transformers").setLevel(logging.WARNING)
# logging.getLogger("transformers").setLevel(logging.INFO)
# logging.getLogger("transformers").setLevel(logging.DEBUG)
# logging.getLogger().setLevel(logging.WARNING)

ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']

current_directory = Path(__file__).parent
@@ -202,7 +188,6 @@ def process_single_image(self, raw_image):
model_response = full_response.split("ASSISTANT: ")[-1]
return model_response


class loader_llava_next(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
@@ -247,7 +232,6 @@ def process_single_image(self, raw_image):

return model_response


class loader_falcon(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
@@ -302,8 +286,6 @@ def process_single_image(self, raw_image):

return model_response



class loader_moondream(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
@@ -329,7 +311,6 @@ def process_single_image(self, raw_image):
summary = self.model.answer_question(enc_image, "Describe what this image depicts in as much detail as possible.", self.tokenizer)
return summary


class loader_florence2(BaseLoader):
def __init__(self, config):
super().__init__(config)
@@ -392,7 +373,6 @@ def process_single_image(self, raw_image):

return parsed_answer['<MORE_DETAILED_CAPTION>']


class loader_phi3vision(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
@@ -465,7 +445,6 @@ def process_single_image(self, raw_image):

return response


class loader_minicpm_V_2_6(BaseLoader):
def initialize_model_and_tokenizer(self):
chosen_model = self.config['vision']['chosen_model']
21 changes: 10 additions & 11 deletions src/module_transcribe.py
@@ -112,23 +112,22 @@ def convert_to_wav(self, audio_file):
output_file = f"{Path(audio_file).stem}_converted.wav"
output_path = Path(__file__).parent / output_file

with av.open(audio_file) as container:
stream = next(s for s in container.streams if s.type == 'audio')

resampler = av.AudioResampler(
format='s16',
layout='mono',
rate=16000,
)
with av.open(audio_file) as input_container:
input_stream = input_container.streams.audio[0]

output_container = av.open(str(output_path), mode='w')
output_stream = output_container.add_stream('pcm_s16le', rate=16000)
output_stream.layout = 'mono'
output_stream.channels = 1

resampler = av.AudioResampler(format='s16', layout='mono', rate=16000)

# Determine optimal chunk size (adjust as needed)
chunk_size = 1024 * 32 # 32KB chunks

for frame in container.decode(audio=0):
for frame in input_container.decode(audio=0):
frame.pts = None
resampled_frames = resampler.resample(frame)
if resampled_frames is not None:
if resampled_frames:
for resampled_frame in resampled_frames:
for packet in output_stream.encode(resampled_frame):
output_container.mux(packet)
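The rewritten hunk follows the standard PyAV decode → resample → encode loop. Since the diff is cut off before the end of the method, here is a self-contained sketch of the full pattern as a standalone function; the trailing resampler/encoder flush and the `output_container.close()` call are assumptions about the unshown tail, not lines from the commit.

```python
from pathlib import Path
import av

def convert_to_wav(audio_file: str) -> Path:
    # Decode any audio container, resample to 16 kHz mono 16-bit PCM, and write a WAV file.
    output_path = Path(__file__).parent / f"{Path(audio_file).stem}_converted.wav"

    with av.open(audio_file) as input_container:
        output_container = av.open(str(output_path), mode='w')
        output_stream = output_container.add_stream('pcm_s16le', rate=16000)
        output_stream.layout = 'mono'

        resampler = av.AudioResampler(format='s16', layout='mono', rate=16000)

        for frame in input_container.decode(audio=0):
            frame.pts = None  # let the muxer assign timestamps
            for resampled_frame in resampler.resample(frame) or []:
                for packet in output_stream.encode(resampled_frame):
                    output_container.mux(packet)

        # Assumed tail (not visible in the diff): flush buffered frames, then close the output.
        for resampled_frame in resampler.resample(None) or []:
            for packet in output_stream.encode(resampled_frame):
                output_container.mux(packet)
        for packet in output_stream.encode(None):
            output_container.mux(packet)
        output_container.close()

    return output_path
```

Fixing the output at 16 kHz mono s16 presumably matches what the downstream Whisper-style transcription model expects, which would explain why those parameters are hard-coded here.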
7 changes: 2 additions & 5 deletions src/setup_windows.py
@@ -253,7 +253,6 @@ def install_libraries(libraries):
"protobuf==5.27.2",
"psutil==6.0.0",
"pyarrow==17.0.0",
"pyarrow-hotfix==0.6",
"pycparser==2.22",
"pydantic==2.7.4",
"pydantic_core==2.18.4",
@@ -317,10 +316,8 @@ def install_libraries(libraries):
"zipp==3.19.2"
]

# matplotlib==3.9.2
# pyparsing==3.1.2
# cycler==0.12.1
# kiwisolver==1.4.5
# pip install matplotlib==3.9.2 pyparsing==3.1.2 cycler==0.12.1 kiwisolver==1.4.5 --no-deps
# matplotlib will still show conflicts re missing libraries, but these are not needed to run my specific plots

full_install_libraries = [
"pyside6==6.7.2",
