diff --git a/.gitignore b/.gitignore
index 9a91213..c0f1f11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,4 @@ samples
 lut
 video-editing/
 video-editing/**/**
+testing-all-nodes-megaworkflow.json
\ No newline at end of file
diff --git a/README.md b/README.md
index dc1af72..9d42c9d 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@
 
 ![worklow picture](./wiki/pics/Selection_016.png)
 
-
 https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf
 
 
@@ -26,51 +25,66 @@ https://github.com/user-attachments/assets/c5af418e-7137-4c36-b86e-3352cf558ea8
-#### *`Separating Song Vocals`*
-<details>
+#### *`Replacing BGM with StableAudio-Generated BGM`*
+<details>
+  <summary>&nbsp; Show </summary>
 
-[workflow.json](./wiki/workflows/isolate-vocals-song.json)
+*For example, to replace copyrighted BGM with new music that has the same mood*.
 
-https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf
+*NOTE*: To load videos into the LoadAudio node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your ComfyUI install so that it includes the file extension you need (e.g., `.mp4`).
+
+[workflow json](./wiki/workflows/replace-bgm.json)
+
+https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a
 
 </details>
 
+#### *`Remixing Songs with StableAudio`*
-#### *`Replacing BGM with Generated BGM`*
 <details>
-
+  <summary>&nbsp; Show </summary>
-*For example, to replace copyrighted BGM with new music that has the same mood*.
+- [workflow json](./wiki/workflows/remix-songs.json)
+- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_iaepj_00001_.flac)
+- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)
-*NOTE*: In order to load videos into the LoadAudio Node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your comfy install to include the `.ext` (e.g., `.mp4`)
+
 </details>
-[workflow json](./wiki/workflows/replace-bgm.json)
-https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a
 
+#### *`Separating Song Vocals`*
+
+<details>
+
+  <summary>&nbsp; Show </summary>
+
+[workflow.json](./wiki/workflows/isolate-vocals-song.json)
+
+https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf
 
-#### *`Remixing Songs with StableAudio`*
+</details>
+
+
+#### *`Extracting Instrumentals from Songs`*
+<details>
   <summary>&nbsp; Show </summary>
 
-- [workflow json](./wiki/workflows/remix-songs.json)
-- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_ksudt_00002_.flac)
-- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)
+- [workflow json](./wiki/workflows/extract-instrumental.json)
+</details>
 
 # Requirements
diff --git a/__init__.py b/__init__.py
index 73b1af3..a919586 100644
--- a/__init__.py
+++ b/__init__.py
@@ -3,6 +3,8 @@
 from .src.crop import AudioCrop
 from .src.combine import AudioCombine
 from .src.combine_video_with_audio import AudioVideoCombine
+from .src.time_shift import TimeShift
+from .src.get_tempo import GetTempo
 
 
 NODE_CLASS_MAPPINGS = {
@@ -11,4 +13,6 @@
     "AudioCombine": AudioCombine,
     "AudioTempoMatch": TempoMatch,
     "AudioVideoCombine": AudioVideoCombine,
+    "AudioSpeedShift": TimeShift,
+    "AudioGetTempo": GetTempo,
 }
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 6231549..88b39f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "audio-separation-nodes-comfyui"
 description = "Separate audio track into stems (vocals, bass, drums, other). Along with tools to recombine, tempo match, slice/crop audio"
-version = "1.2.1"
+version = "1.3.0"
 license = "LICENSE"
 dependencies = ["librosa==0.10.2", "numpy", "torchaudio>=2.3.0", "moviepy"]
 
diff --git a/src/get_tempo.py b/src/get_tempo.py
new file mode 100644
index 0000000..cde58b7
--- /dev/null
+++ b/src/get_tempo.py
@@ -0,0 +1,29 @@
+from .utils import estimate_tempo
+
+from typing import Tuple
+from ._types import AUDIO
+
+
+class GetTempo:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+            },
+        }
+
+    FUNCTION = "main"
+    RETURN_TYPES = ("STRING", "FLOAT", "INTEGER")
+    RETURN_NAMES = ("tempo_string", "tempo_float", "tempo_integer")
+    CATEGORY = "audio"
+
+    def main(
+        self,
+        audio: AUDIO,
+    ) -> Tuple[str, float, int]:
+        waveform = audio["waveform"].squeeze(0)
+        sample_rate = audio["sample_rate"]
+        tempo = estimate_tempo(waveform, sample_rate)
+
+        return (f"{int(round(tempo))}", tempo, int(tempo))
diff --git a/src/resample.py b/src/resample.py
index a71a4da..aefe0cc 100644
--- a/src/resample.py
+++ b/src/resample.py
@@ -38,7 +38,6 @@ def __init__(
             self.new_freq = self.orig_freq * self.LOWER_CLAMP
 
         diff = abs(1 - change_ratio)
-        print(f"Change Ratio: {change_ratio}, Diff: {diff}")
         if diff > 0.08:
             self.chunk_size_seconds = min(self.chunk_size_seconds, 1)
         elif diff > 0.002:
@@ -46,13 +45,9 @@ def __init__(
         else:
             self.chunk_size_seconds = min(self.chunk_size_seconds, 4)
 
-        print(f"Chunk Size: {self.chunk_size_seconds}")
-
         # If the frequencies are float, try to convert to int while
         # maintaining ratio (https://github.com/pytorch/audio/issues/1487).
         self.orig_freq, self.new_freq = ChunkResampler.reduce_ratio(orig_freq, new_freq)
-        print(f"Orig Freq: {self.orig_freq}, New Freq: {self.new_freq}")
-
         self.device = comfy.model_management.get_torch_device()
         self.resample = Resample(self.orig_freq, self.new_freq).to(self.device)
 
diff --git a/src/separation.py b/src/separation.py
index 86e464e..8fdae74 100644
--- a/src/separation.py
+++ b/src/separation.py
@@ -107,7 +107,7 @@ def separate_sources(
     chunk_fade_shape: str = "linear",
 ) -> torch.Tensor:
     """
-    https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
+    From: https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
 
     Apply model to a given mixture. Use fade, and add segments together in order to
     add model segment by segment.
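For context (not part of the patch): the new `GetTempo` node's three outputs all come from `estimate_tempo` in the new `src/utils.py` further down, which runs librosa's beat tracker per channel on a median-aggregated onset-strength envelope and averages the result. A minimal sketch of that behavior on a synthetic click track, assuming the pinned `librosa==0.10.2`; the click-track setup and the 120 BPM target are illustrative, not repository code:

```python
import numpy as np
import librosa

sample_rate = 22050

# Clicks every 0.5 s -> a 120 BPM pulse (illustrative input, not from the repo).
clicks = librosa.clicks(times=np.arange(0, 10, 0.5), sr=sample_rate)
waveform = np.stack([clicks, clicks])  # fake stereo, shape [channels, frames]

# Same calls estimate_tempo uses: median-aggregated onset strength, then beat_track.
onset_env = librosa.onset.onset_strength(y=waveform, sr=sample_rate, aggregate=np.median)
tempo, _ = librosa.beat.beat_track(
    onset_envelope=onset_env,
    sr=sample_rate,
    tightness=110,
    sparse=False,   # per-channel tempo array instead of a single scalar
    trim=True,
)
print(f"{int(round(float(np.mean(tempo))))} BPM")  # expected to land near 120
```

The string/float/integer triple the node exposes is just this one estimate rendered three ways, so any of the outputs can feed text widgets, float inputs, or the new `AudioSpeedShift` rate.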
diff --git a/src/tempo_match.py b/src/tempo_match.py
index 129b148..df838ed 100644
--- a/src/tempo_match.py
+++ b/src/tempo_match.py
@@ -1,9 +1,4 @@
-import math
-import numpy as np
-import torch
-import librosa
-
-import torchaudio.functional as F
+from .utils import estimate_tempo, time_shift
 
 from typing import Tuple
 from ._types import AUDIO
@@ -23,113 +18,26 @@ def INPUT_TYPES(cls):
     RETURN_TYPES = ("AUDIO", "AUDIO")
     CATEGORY = "audio"
 
-    def time_shift(
-        self,
-        waveform: torch.Tensor,
-        rate: float,
-        fft_size: int = 2048,
-        hop_size: int = None,
-        win_length: int = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            waveform (torch.Tensor): Time-domain input of shape [channels, frames]
-            rate (float): rate to shift the waveform by
-            fft_size (int): Size of the FFT to be used (power of 2)
-            hop_size (int): Hop length for overlap (e.g., fft_size // 4)
-            win_length (int): Window size (often equal to fft_size)
-
-        Returns:
-            torch.Tensor: Time-domain output of same shape/type as input [channels, frames]
-
-        """
-        if hop_size is None:
-            hop_size = fft_size // 4
-        if win_length is None:
-            win_length = fft_size
-
-        window = torch.hann_window(
-            win_length, device=waveform.device
-        )  # shape: [win_length]
-
-        with torch.no_grad():
-            complex_spectogram = torch.stft(
-                waveform,
-                n_fft=fft_size,
-                hop_length=hop_size,
-                win_length=win_length,
-                window=window,
-                return_complex=True,
-            )  # shape: [channels, freq, time]
-
-            if complex_spectogram.dtype != torch.cfloat:
-                raise TypeError(f"Expected complex-valued STFT for phase vocoder, got dtype {complex_spectogram.dtype}")
-
-            phase_advance = torch.linspace(0, math.pi * hop_size, complex_spectogram.shape[1])[
-                ..., None
-            ]  # shape: [freq, 1]
-
-            stretched_spectogram = F.phase_vocoder(
-                complex_spectogram, rate, phase_advance
-            )  # shape: [channels, freq, stretched_time]
-
-            expected_time = math.ceil(complex_spectogram.shape[2] / rate)
-            assert (
-                abs(stretched_spectogram.shape[2] - expected_time) < 3
-            ), f"Expected Time: {expected_time}, Stretched Time: {stretched_spectogram.shape[2]}"
-
-            # Convert back to time basis with inverse STFT
-            return torch.istft(
-                stretched_spectogram,
-                n_fft=fft_size,
-                hop_length=hop_size,
-                win_length=win_length,
-                window=window,
-            )  # shape: [channels, frames]
-
-    def estimate_tempo(self, waveform: torch.Tensor, sample_rate: int) -> float:
-        if waveform.dim() == 3:
-            waveform = waveform.squeeze(0)
-        if waveform.dim() != 2:
-            raise TypeError(f"Expected waveform to be [channels, frames], got {waveform.shape}")
-
-        onset_env = librosa.onset.onset_strength(
-            y=waveform.numpy(),
-            sr=sample_rate,
-            aggregate=np.median,
-        )
-
-        tempo, _= librosa.beat.beat_track(
-            onset_envelope=onset_env,
-            sr=sample_rate,
-            tightness=110,
-            sparse=False,
-            trim=True,
-        )  # [[channel 1 tempo], [channel 2 tempo], ...], _
-
-        mean_tempo = np.mean(tempo.flatten())
-        return max(mean_tempo, 1.0)
-
     def main(
         self,
         audio_1: AUDIO,
         audio_2: AUDIO,
     ) -> Tuple[AUDIO, AUDIO]:
-        waveform_1: torch.Tensor = audio_1["waveform"].squeeze(0)
-        input_sample_rate_1: int = audio_1["sample_rate"]
+        waveform_1 = audio_1["waveform"].squeeze(0)
+        input_sample_rate_1 = audio_1["sample_rate"]
 
-        waveform_2: torch.Tensor = audio_2["waveform"].squeeze(0)
-        input_sample_rate_2: int = audio_2["sample_rate"]
+        waveform_2 = audio_2["waveform"].squeeze(0)
+        input_sample_rate_2 = audio_2["sample_rate"]
 
-        tempo_1 = self.estimate_tempo(waveform_1, input_sample_rate_1)
-        tempo_2 = self.estimate_tempo(waveform_2, input_sample_rate_2)
+        tempo_1 = estimate_tempo(waveform_1, input_sample_rate_1)
+        tempo_2 = estimate_tempo(waveform_2, input_sample_rate_2)
 
         avg_tempo = (tempo_1 + tempo_2) / 2
         rate_1 = avg_tempo / tempo_1
         rate_2 = avg_tempo / tempo_2
 
-        waveform_1 = self.time_shift(waveform_1, rate_1)
-        waveform_2 = self.time_shift(waveform_2, rate_2)
+        waveform_1 = time_shift(waveform_1, rate_1)
+        waveform_2 = time_shift(waveform_2, rate_2)
 
         return (
             {
diff --git a/src/time_shift.py b/src/time_shift.py
new file mode 100644
index 0000000..34ed0a2
--- /dev/null
+++ b/src/time_shift.py
@@ -0,0 +1,36 @@
+from .utils import time_shift
+
+from typing import Tuple
+from ._types import AUDIO
+
+
+class TimeShift:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+                "rate": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}),
+            },
+        }
+
+    FUNCTION = "main"
+    RETURN_TYPES = ("AUDIO",)
+    CATEGORY = "audio"
+
+    def main(
+        self,
+        audio: AUDIO,
+        rate: float,
+    ) -> Tuple[AUDIO]:
+        waveform = audio["waveform"].squeeze(0)
+        sample_rate = audio["sample_rate"]
+        rate = min(max(rate, 0.1), 10.0)
+        shifted = time_shift(waveform, rate)
+
+        return (
+            {
+                "waveform": shifted.unsqueeze(0),
+                "sample_rate": sample_rate,
+            },
+        )
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..65af7cf
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,99 @@
+import librosa
+import torch
+import math
+import torchaudio.functional as F
+import numpy as np
+
+
+def time_shift(
+    waveform: torch.Tensor,
+    rate: float,
+    fft_size: int = 2048,
+    hop_size: int = None,
+    win_length: int = None,
+) -> torch.Tensor:
+    """
+    Args:
+        waveform (torch.Tensor): Time-domain input of shape [channels, frames]
+        rate (float): rate to shift the waveform by
+        fft_size (int): Size of the FFT to be used (power of 2)
+        hop_size (int): Hop length for overlap (e.g., fft_size // 4)
+        win_length (int): Window size (often equal to fft_size)
+
+    Returns:
+        torch.Tensor: Time-domain output of same shape/type as input [channels, frames]
+
+    """
+    if hop_size is None:
+        hop_size = fft_size // 4
+    if win_length is None:
+        win_length = fft_size
+
+    window = torch.hann_window(
+        win_length, device=waveform.device
+    )  # shape: [win_length]
+
+    with torch.no_grad():
+        complex_spectogram = torch.stft(
+            waveform,
+            n_fft=fft_size,
+            hop_length=hop_size,
+            win_length=win_length,
+            window=window,
+            return_complex=True,
+        )  # shape: [channels, freq, time]
+
+        if complex_spectogram.dtype != torch.cfloat:
+            raise TypeError(
+                f"Expected complex-valued STFT for phase vocoder, got dtype {complex_spectogram.dtype}"
+            )
+
+        phase_advance = torch.linspace(
+            0, math.pi * hop_size, complex_spectogram.shape[1]
+        )[
+            ..., None
+        ]  # shape: [freq, 1]
+
+        stretched_spectogram = F.phase_vocoder(
+            complex_spectogram, rate, phase_advance
+        )  # shape: [channels, freq, stretched_time]
+
+        expected_time = math.ceil(complex_spectogram.shape[2] / rate)
+        assert (
+            abs(stretched_spectogram.shape[2] - expected_time) < 3
+        ), f"Expected Time: {expected_time}, Stretched Time: {stretched_spectogram.shape[2]}"
+
+        # Convert back to time basis with inverse STFT
+        return torch.istft(
+            stretched_spectogram,
+            n_fft=fft_size,
+            hop_length=hop_size,
+            win_length=win_length,
+            window=window,
+        )  # shape: [channels, frames]
+
+
+def estimate_tempo(waveform: torch.Tensor, sample_rate: int) -> float:
+    if waveform.dim() == 3:
+        waveform = waveform.squeeze(0)
+    if waveform.dim() != 2:
+        raise TypeError(
+            f"Expected waveform to be [channels, frames], got {waveform.shape}"
+        )
+
+    onset_env = librosa.onset.onset_strength(
+        y=waveform.numpy(),
+        sr=sample_rate,
+        aggregate=np.median,
+    )
+
+    tempo, _ = librosa.beat.beat_track(
+        onset_envelope=onset_env,
+        sr=sample_rate,
+        tightness=110,
+        sparse=False,
+        trim=True,
+    )  # [[channel 1 tempo], [channel 2 tempo], ...], _
+
+    mean_tempo = np.mean(tempo.flatten())
+    return max(mean_tempo, 1.0)
diff --git a/wiki/examples/ComfyUI_temp_iaepj_00001_.flac b/wiki/examples/ComfyUI_temp_iaepj_00001_.flac
new file mode 100644
index 0000000..be435bb
Binary files /dev/null and b/wiki/examples/ComfyUI_temp_iaepj_00001_.flac differ
diff --git a/wiki/workflows/extract-instrumental.json b/wiki/workflows/extract-instrumental.json
new file mode 100644
index 0000000..bcc67a1
--- /dev/null
+++ b/wiki/workflows/extract-instrumental.json
@@ -0,0 +1,278 @@
+{
+  "last_node_id": 21,
+  "last_link_id": 26,
+  "nodes": [
+    {
+      "id": 11,
+      "type": "AudioSeparation",
+      "pos": [
+        45,
+        494
+      ],
+      "size": {
+        "0": 315,
+        "1": 166
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 25
+        }
+      ],
+      "outputs": [
+        {
+          "name": "Bass",
+          "type": "AUDIO",
+          "links": [
+            20
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "Drums",
+          "type": "AUDIO",
+          "links": [
+            21
+          ],
+          "shape": 3,
+          "slot_index": 1
+        },
+        {
+          "name": "Other",
+          "type": "AUDIO",
+          "links": [
+            23
+          ],
+          "shape": 3,
+          "slot_index": 2
+        },
+        {
+          "name": "Vocals",
+          "type": "AUDIO",
+          "links": [],
+          "shape": 3,
+          "slot_index": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "AudioSeparation"
+      },
+      "widgets_values": [
+        "half_sine",
+        16,
+        0.1
+      ]
+    },
+    {
+      "id": 19,
+      "type": "AudioCombine",
+      "pos": [
+        474,
+        503
+      ],
+      "size": {
+        "0": 315,
+        "1": 78
+      },
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio_1",
+          "type": "AUDIO",
+          "link": 20
+        },
+        {
+          "name": "audio_2",
+          "type": "AUDIO",
+          "link": 21
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            22
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "AudioCombine"
+      },
+      "widgets_values": [
+        "add"
+      ]
+    },
+    {
+      "id": 10,
+      "type": "LoadAudio",
+      "pos": [
+        -441,
+        496
+      ],
+      "size": {
+        "0": 381.5714111328125,
+        "1": 144
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            25
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadAudio"
+      },
+      "widgets_values": [
+        "06 Madonna.mp3",
+        null,
+        ""
+      ]
+    },
+    {
+      "id": 20,
+      "type": "AudioCombine",
+      "pos": [
+        859,
+        584
+      ],
+      "size": {
+        "0": 315,
+        "1": 78
+      },
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio_1",
+          "type": "AUDIO",
+          "link": 22
+        },
+        {
+          "name": "audio_2",
+          "type": "AUDIO",
+          "link": 23
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            26
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "AudioCombine"
+      },
+      "widgets_values": [
+        "add"
+      ]
+    },
+    {
+      "id": 21,
+      "type": "SaveAudio",
+      "pos": [
+        1255,
+        585
+      ],
+      "size": {
+        "0": 315,
+        "1": 100
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 26
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SaveAudio"
+      },
+      "widgets_values": [
+        "audio/instrumentals/track",
+        null
+      ]
+    }
+  ],
+  "links": [
+    [
+      20,
+      11,
+      0,
+      19,
+      0,
+      "AUDIO"
+    ],
+    [
+      21,
+      11,
+      1,
+      19,
+      1,
+      "AUDIO"
+    ],
+    [
+      22,
+      19,
+      0,
+      20,
+      0,
+      "AUDIO"
+    ],
+    [
+      23,
+      11,
+      2,
+      20,
+      1,
+      "AUDIO"
+    ],
+    [
+      25,
+      10,
+      0,
+      11,
+      0,
+      "AUDIO"
+    ],
+    [
+      26,
+      20,
+      0,
+      21,
+      0,
+      "AUDIO"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {},
+  "version": 0.4
+}
\ No newline at end of file
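To make the tempo-matching arithmetic and the phase-vocoder `time_shift` concrete, here is a minimal sketch (not part of the patch). It assumes the new `src/utils.py` is importable as a flat `utils` module, and the 90/120 BPM figures are illustrative; in the node graph the same numbers would come from `AudioGetTempo`, and `AudioSpeedShift` exposes `rate` directly with the 0.1–10.0 clamp seen in `time_shift.py`.

```python
import torch

from utils import time_shift  # hypothetical flat import of the new src/utils.py helpers

# TempoMatch arithmetic: both clips are pulled toward the average tempo.
tempo_1, tempo_2 = 90.0, 120.0
avg_tempo = (tempo_1 + tempo_2) / 2  # 105 BPM
rate_1 = avg_tempo / tempo_1         # ~1.167 -> speed clip 1 up
rate_2 = avg_tempo / tempo_2         # 0.875  -> slow clip 2 down

# time_shift keeps the sample rate and changes duration by a factor of ~1/rate.
sample_rate = 44100
t = torch.linspace(0, 2.0, 2 * sample_rate)
waveform = torch.sin(2 * torch.pi * 440.0 * t).repeat(2, 1)  # stereo, [channels, frames]

faster = time_shift(waveform, rate_1)
slower = time_shift(waveform, rate_2)
print(waveform.shape[-1] / sample_rate)  # ~2.0 s
print(faster.shape[-1] / sample_rate)    # ~1.71 s (2.0 / 1.167)
print(slower.shape[-1] / sample_rate)    # ~2.29 s (2.0 / 0.875)
```

Because the stretch happens on the STFT via `torchaudio.functional.phase_vocoder`, pitch is preserved while duration changes, which is why `TempoMatch.main` can nudge both inputs to the shared tempo without resampling either one.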