Added examples, TimeShift and GetTempo nodes
christian-byrne committed Jul 19, 2024
1 parent a5107dd commit b7d6600
Showing 12 changed files with 487 additions and 123 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,3 +19,4 @@ samples
lut
video-editing/
video-editing/**/**
testing-all-nodes-megaworkflow.json
44 changes: 29 additions & 15 deletions README.md
@@ -2,7 +2,6 @@
![workflow picture](./wiki/pics/Selection_016.png)



https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf


@@ -26,51 +25,66 @@ https://github.com/user-attachments/assets/c5af418e-7137-4c36-b86e-3352cf558ea8



#### *`Separating Song Vocals`*

<details>
#### *`Replacing BGM with StableAudio-Generated BGM`*

<details>

<summary> &nbsp; Show </summary>

[workflow.json](./wiki/workflows/isolate-vocals-song.json)
*For example, to replace copyrighted BGM with new music that has the same mood*.

https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf
*NOTE*: To load videos into the LoadAudio node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your ComfyUI install so that it also accepts the video's file extension (e.g., `.mp4`)
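
A minimal sketch of the kind of edit meant here, assuming the referenced line is the extension whitelist inside `LoadAudio.INPUT_TYPES` (the exact line and extension list vary between ComfyUI versions):

```python
# comfy_extras/nodes_audio.py -- assumed shape of the referenced line:
# a filter that only lists files with whitelisted audio extensions.
# Appending ".mp4" lets video files show up in the node's file picker.
files = [
    f
    for f in os.listdir(input_dir)
    if os.path.isfile(os.path.join(input_dir, f))
    and f.endswith((".wav", ".mp3", ".ogg", ".flac", ".aiff", ".aif", ".mp4"))
]
```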

[workflow json](./wiki/workflows/replace-bgm.json)

https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a

</details>


#### *`Remixing Songs with StableAudio`*

#### *`Replacing BGM with Generated BGM`*

<details>

<summary> &nbsp; Show </summary>

*For example, to replace copyrighted BGM with new music that has the same mood*.
- [workflow json](./wiki/workflows/remix-songs.json)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_iaepj_00001_.flac)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)

*NOTE*: To load videos into the LoadAudio node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your ComfyUI install so that it also accepts the video's file extension (e.g., `.mp4`)
</details>

[workflow json](./wiki/workflows/replace-bgm.json)

https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a
#### *`Separating Song Vocals`*

<details>

<summary> &nbsp; Show </summary>

[workflow.json](./wiki/workflows/isolate-vocals-song.json)

https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf

</details>


#### *`Remixing Songs with StableAudio`*


#### *`Extracting Instrumentals from Songs`*


<details>

<summary> &nbsp; Show </summary>

- [workflow json](./wiki/workflows/remix-songs.json)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_ksudt_00002_.flac)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)
- [workflow json](./wiki/workflows/extract-instrumental.json)

</details>


&nbsp;

# Requirements

4 changes: 4 additions & 0 deletions __init__.py
@@ -3,6 +3,8 @@
from .src.crop import AudioCrop
from .src.combine import AudioCombine
from .src.combine_video_with_audio import AudioVideoCombine
from .src.time_shift import TimeShift
from .src.get_tempo import GetTempo


NODE_CLASS_MAPPINGS = {
@@ -11,4 +13,6 @@
    "AudioCombine": AudioCombine,
    "AudioTempoMatch": TempoMatch,
    "AudioVideoCombine": AudioVideoCombine,
    "AudioSpeedShift": TimeShift,
    "AudioGetTempo": GetTempo,
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "audio-separation-nodes-comfyui"
description = "Separate an audio track into stems (vocals, bass, drums, other), with tools to recombine, tempo-match, and slice/crop audio."
version = "1.2.1"
version = "1.3.0"
license = "LICENSE"
dependencies = ["librosa==0.10.2", "numpy", "torchaudio>=2.3.0", "moviepy"]

29 changes: 29 additions & 0 deletions src/get_tempo.py
@@ -0,0 +1,29 @@
from .utils import estimate_tempo

from typing import Tuple
from ._types import AUDIO


class GetTempo:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "audio": ("AUDIO",),
            },
        }

    FUNCTION = "main"
    RETURN_TYPES = ("STRING", "FLOAT", "INT")
    RETURN_NAMES = ("tempo_string", "tempo_float", "tempo_integer")
    CATEGORY = "audio"

    def main(
        self,
        audio: AUDIO,
    ) -> Tuple[str, float, int]:
        waveform = audio["waveform"].squeeze(0)
        sample_rate = audio["sample_rate"]
        tempo = estimate_tempo(waveform, sample_rate)

        return (f"{int(round(tempo))}", tempo, int(round(tempo)))
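
`estimate_tempo` is imported from `src/utils.py`, which this diff does not show. Judging from the identical method removed from `src/tempo_match.py` later in this commit, it presumably looks roughly like this (a sketch reconstructed from that removed code, not the actual `utils.py`):

```python
import librosa
import numpy as np
import torch


def estimate_tempo(waveform: torch.Tensor, sample_rate: int) -> float:
    """Estimate the tempo (BPM) of a [channels, frames] waveform."""
    if waveform.dim() == 3:
        waveform = waveform.squeeze(0)
    if waveform.dim() != 2:
        raise TypeError(f"Expected waveform to be [channels, frames], got {waveform.shape}")

    # Median-aggregated onset strength is more robust to percussive outliers.
    onset_env = librosa.onset.onset_strength(
        y=waveform.numpy(), sr=sample_rate, aggregate=np.median
    )
    # sparse=False returns one tempo estimate per channel; average them.
    tempo, _ = librosa.beat.beat_track(
        onset_envelope=onset_env, sr=sample_rate, tightness=110, sparse=False, trim=True
    )
    return max(np.mean(tempo.flatten()), 1.0)  # guard against a 0 BPM estimate
```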
5 changes: 0 additions & 5 deletions src/resample.py
@@ -38,21 +38,16 @@ def __init__(
        self.new_freq = self.orig_freq * self.LOWER_CLAMP

        diff = abs(1 - change_ratio)
        print(f"Change Ratio: {change_ratio}, Diff: {diff}")
        if diff > 0.08:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 1)
        elif diff > 0.002:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 2)
        else:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 4)

        print(f"Chunk Size: {self.chunk_size_seconds}")

        # If the frequencies are float, try to convert to int while
        # maintaining ratio (https://github.com/pytorch/audio/issues/1487).
        self.orig_freq, self.new_freq = ChunkResampler.reduce_ratio(orig_freq, new_freq)
        print(f"Orig Freq: {self.orig_freq}, New Freq: {self.new_freq}")

        self.device = comfy.model_management.get_torch_device()
        self.resample = Resample(self.orig_freq, self.new_freq).to(self.device)

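`ChunkResampler.reduce_ratio` is not shown in this diff. Based on the comment and the linked pytorch/audio issue, a hypothetical implementation could reduce the two frequencies to the smallest integer pair with the same ratio, since `Resample(a, b)` depends only on `a/b` and small integers keep the resampling kernel cheap:

```python
from fractions import Fraction
from typing import Tuple

@staticmethod
def reduce_ratio(orig_freq: float, new_freq: float) -> Tuple[int, int]:
    # e.g. (44100.0, 22050.0) -> (2, 1); floats are rationalized first.
    ratio = Fraction(orig_freq).limit_denominator() / Fraction(new_freq).limit_denominator()
    return ratio.numerator, ratio.denominator
```
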
2 changes: 1 addition & 1 deletion src/separation.py
@@ -107,7 +107,7 @@ def separate_sources(
    chunk_fade_shape: str = "linear",
) -> torch.Tensor:
    """
    https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
    From: https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
    Apply model to a given mixture. Use fades and add segments together in order to apply the model segment by segment.
110 changes: 9 additions & 101 deletions src/tempo_match.py
@@ -1,9 +1,4 @@
import math
import numpy as np
import torch
import librosa

import torchaudio.functional as F
from .utils import estimate_tempo, time_shift

from typing import Tuple
from ._types import AUDIO
@@ -23,113 +18,26 @@ def INPUT_TYPES(cls):
    RETURN_TYPES = ("AUDIO", "AUDIO")
    CATEGORY = "audio"

    def time_shift(
        self,
        waveform: torch.Tensor,
        rate: float,
        fft_size: int = 2048,
        hop_size: int = None,
        win_length: int = None,
    ) -> torch.Tensor:
        """
        Args:
            waveform (torch.Tensor): Time-domain input of shape [channels, frames]
            rate (float): rate to shift the waveform by
            fft_size (int): Size of the FFT to be used (power of 2)
            hop_size (int): Hop length for overlap (e.g., fft_size // 4)
            win_length (int): Window size (often equal to fft_size)
        Returns:
            torch.Tensor: Time-domain output of same shape/type as input [channels, frames]
        """
        if hop_size is None:
            hop_size = fft_size // 4
        if win_length is None:
            win_length = fft_size

        window = torch.hann_window(
            win_length, device=waveform.device
        )  # shape: [win_length]

        with torch.no_grad():
            complex_spectrogram = torch.stft(
                waveform,
                n_fft=fft_size,
                hop_length=hop_size,
                win_length=win_length,
                window=window,
                return_complex=True,
            )  # shape: [channels, freq, time]

        if complex_spectrogram.dtype != torch.cfloat:
            raise TypeError(f"Expected complex-valued STFT for phase vocoder, got dtype {complex_spectrogram.dtype}")

        phase_advance = torch.linspace(0, math.pi * hop_size, complex_spectrogram.shape[1])[
            ..., None
        ]  # shape: [freq, 1]

        stretched_spectrogram = F.phase_vocoder(
            complex_spectrogram, rate, phase_advance
        )  # shape: [channels, freq, stretched_time]

        expected_time = math.ceil(complex_spectrogram.shape[2] / rate)
        assert (
            abs(stretched_spectrogram.shape[2] - expected_time) < 3
        ), f"Expected Time: {expected_time}, Stretched Time: {stretched_spectrogram.shape[2]}"

        # Convert back to time basis with inverse STFT
        return torch.istft(
            stretched_spectrogram,
            n_fft=fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window,
        )  # shape: [channels, frames]

    def estimate_tempo(self, waveform: torch.Tensor, sample_rate: int) -> float:
        if waveform.dim() == 3:
            waveform = waveform.squeeze(0)
        if waveform.dim() != 2:
            raise TypeError(f"Expected waveform to be [channels, frames], got {waveform.shape}")

        onset_env = librosa.onset.onset_strength(
            y=waveform.numpy(),
            sr=sample_rate,
            aggregate=np.median,
        )

        tempo, _ = librosa.beat.beat_track(
            onset_envelope=onset_env,
            sr=sample_rate,
            tightness=110,
            sparse=False,
            trim=True,
        )  # [[channel 1 tempo], [channel 2 tempo], ...], _

        mean_tempo = np.mean(tempo.flatten())
        return max(mean_tempo, 1.0)

    def main(
        self,
        audio_1: AUDIO,
        audio_2: AUDIO,
    ) -> Tuple[AUDIO, AUDIO]:
        waveform_1: torch.Tensor = audio_1["waveform"].squeeze(0)
        input_sample_rate_1: int = audio_1["sample_rate"]
        waveform_1 = audio_1["waveform"].squeeze(0)
        input_sample_rate_1 = audio_1["sample_rate"]

        waveform_2: torch.Tensor = audio_2["waveform"].squeeze(0)
        input_sample_rate_2: int = audio_2["sample_rate"]
        waveform_2 = audio_2["waveform"].squeeze(0)
        input_sample_rate_2 = audio_2["sample_rate"]

        tempo_1 = self.estimate_tempo(waveform_1, input_sample_rate_1)
        tempo_2 = self.estimate_tempo(waveform_2, input_sample_rate_2)
        tempo_1 = estimate_tempo(waveform_1, input_sample_rate_1)
        tempo_2 = estimate_tempo(waveform_2, input_sample_rate_2)
        avg_tempo = (tempo_1 + tempo_2) / 2

        rate_1 = avg_tempo / tempo_1
        rate_2 = avg_tempo / tempo_2

        waveform_1 = self.time_shift(waveform_1, rate_1)
        waveform_2 = self.time_shift(waveform_2, rate_2)
        waveform_1 = time_shift(waveform_1, rate_1)
        waveform_2 = time_shift(waveform_2, rate_2)

        return (
            {
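
As a quick sanity check of the rate arithmetic above (made-up tempos, not data from the repo): a 90 BPM track and a 110 BPM track meet at their 100 BPM average, so the phase vocoder speeds one up and slows the other down:

```python
tempo_1, tempo_2 = 90.0, 110.0       # estimated BPM of each input
avg_tempo = (tempo_1 + tempo_2) / 2  # 100.0 BPM shared target
rate_1 = avg_tempo / tempo_1         # ~1.111 -> track 1 plays ~11% faster
rate_2 = avg_tempo / tempo_2         # ~0.909 -> track 2 plays ~9% slower
```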
36 changes: 36 additions & 0 deletions src/time_shift.py
@@ -0,0 +1,36 @@
from .utils import time_shift

from typing import Tuple
from ._types import AUDIO


class TimeShift:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "audio": ("AUDIO",),
                "rate": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}),
            },
        }

    FUNCTION = "main"
    RETURN_TYPES = ("AUDIO",)
    CATEGORY = "audio"

    def main(
        self,
        audio: AUDIO,
        rate: float,
    ) -> Tuple[AUDIO]:
        waveform = audio["waveform"].squeeze(0)
        sample_rate = audio["sample_rate"]
        rate = min(max(rate, 0.1), 10.0)
        shifted = time_shift(waveform, rate)

        return (
            {
                "waveform": shifted.unsqueeze(0),
                "sample_rate": sample_rate,
            },
        )
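
`rate` follows the phase-vocoder convention seen in the code removed from `tempo_match.py`: `rate > 1.0` plays the audio faster (fewer output frames), `rate < 1.0` slower, with pitch preserved. A hypothetical smoke test outside ComfyUI (the import path depends on how the pack is installed):

```python
import torch
from src.time_shift import TimeShift  # hypothetical import path

# 2 seconds of stereo noise at 44.1 kHz in ComfyUI's AUDIO dict format.
audio = {"waveform": torch.randn(1, 2, 88200), "sample_rate": 44100}
(out,) = TimeShift().main(audio, rate=2.0)
print(out["waveform"].shape)  # ~[1, 2, 44100]: twice as fast, half as long
```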
