Added examples, TimeShift and GetTempo nodes
christian-byrne committed Jul 19, 2024
1 parent a5107dd commit b7d6600
Showing 12 changed files with 487 additions and 123 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,3 +19,4 @@ samples
lut
video-editing/
video-editing/**/**
testing-all-nodes-megaworkflow.json
44 changes: 29 additions & 15 deletions README.md
@@ -2,7 +2,6 @@
![workflow picture](./wiki/pics/Selection_016.png)



https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf


@@ -26,51 +25,66 @@ https://github.com/user-attachments/assets/c5af418e-7137-4c36-b86e-3352cf558ea8



#### *`Separating Song Vocals`*

<details>
#### *`Replacing BGM with StableAudio-Generated BGM`*

<details>

<summary> &nbsp; Show </summary>

[workflow.json](./wiki/workflows/isolate-vocals-song.json)
*For example, to replace copyrighted BGM with new music that has the same mood*.

https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf
*NOTE*: To load videos into the LoadAudio node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your ComfyUI install so that it also accepts the video's file extension (e.g., `.mp4`)
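
A minimal sketch of the kind of edit meant here, assuming the referenced line is the extension whitelist inside `LoadAudio.INPUT_TYPES` (the exact line and extension list vary between ComfyUI versions):

```python
# comfy_extras/nodes_audio.py -- assumed shape of the referenced line:
# a filter that only lists files with whitelisted audio extensions.
# Appending ".mp4" lets video files show up in the node's file picker.
files = [
    f
    for f in os.listdir(input_dir)
    if os.path.isfile(os.path.join(input_dir, f))
    and f.endswith((".wav", ".mp3", ".ogg", ".flac", ".aiff", ".aif", ".mp4"))
]
```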

[workflow json](./wiki/workflows/replace-bgm.json)

https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a

</details>


#### *`Remixing Songs with StableAudio`*

#### *`Replacing BGM with Generated BGM`*

<details>

<summary> &nbsp; Show </summary>

*For example, to replace copyrighted BGM with new music that has the same mood*.
- [workflow json](./wiki/workflows/remix-songs.json)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_iaepj_00001_.flac)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)

*NOTE*: To load videos into the LoadAudio node, change [this line](https://github.com/comfyanonymous/ComfyUI/blob/faa57430b0ff882275b1afcf6610e8e9f8a5929b/comfy_extras/nodes_audio.py#L185) in your ComfyUI install so that it also accepts the video's file extension (e.g., `.mp4`)
</details>

[workflow json](./wiki/workflows/replace-bgm.json)

https://github.com/user-attachments/assets/a7d5656b-5f8b-439a-936f-6ebb6a0d538a
#### *`Separating Song Vocals`*

<details>

<summary> &nbsp; Show </summary>

[workflow.json](./wiki/workflows/isolate-vocals-song.json)

https://github.com/user-attachments/assets/c5cf20de-a17f-438d-81ac-0c392af669cf

</details>


#### *`Remixing Songs with StableAudio`*


#### *`Extracting Instrumentals from Songs`*


<details>

<summary> &nbsp; Show </summary>

- [workflow json](./wiki/workflows/remix-songs.json)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_temp_ksudt_00002_.flac)
- [example output (audio file) with embedded workflow](./wiki/examples/ComfyUI_00002_.flac)
- [workflow json](./wiki/workflows/extract-instrumental.json)

</details>


&nbsp;

# Requirements

4 changes: 4 additions & 0 deletions __init__.py
@@ -3,6 +3,8 @@
from .src.crop import AudioCrop
from .src.combine import AudioCombine
from .src.combine_video_with_audio import AudioVideoCombine
from .src.time_shift import TimeShift
from .src.get_tempo import GetTempo


NODE_CLASS_MAPPINGS = {
@@ -11,4 +13,6 @@
    "AudioCombine": AudioCombine,
    "AudioTempoMatch": TempoMatch,
    "AudioVideoCombine": AudioVideoCombine,
    "AudioSpeedShift": TimeShift,
    "AudioGetTempo": GetTempo,
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "audio-separation-nodes-comfyui"
description = "Separate an audio track into stems (vocals, bass, drums, other), with tools to recombine, tempo-match, and slice/crop audio."
version = "1.2.1"
version = "1.3.0"
license = "LICENSE"
dependencies = ["librosa==0.10.2", "numpy", "torchaudio>=2.3.0", "moviepy"]

29 changes: 29 additions & 0 deletions src/get_tempo.py
@@ -0,0 +1,29 @@
from .utils import estimate_tempo

from typing import Tuple
from ._types import AUDIO


class GetTempo:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "audio": ("AUDIO",),
            },
        }

    FUNCTION = "main"
    RETURN_TYPES = ("STRING", "FLOAT", "INT")
    RETURN_NAMES = ("tempo_string", "tempo_float", "tempo_integer")
    CATEGORY = "audio"

    def main(
        self,
        audio: AUDIO,
    ) -> Tuple[str, float, int]:
        waveform = audio["waveform"].squeeze(0)
        sample_rate = audio["sample_rate"]
        tempo = estimate_tempo(waveform, sample_rate)

        return (f"{int(round(tempo))}", tempo, int(round(tempo)))
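
`estimate_tempo` is imported from `src/utils.py`, which this diff does not show. Judging from the identical method removed from `src/tempo_match.py` later in this commit, it presumably looks roughly like this (a sketch reconstructed from that removed code, not the actual `utils.py`):

```python
import librosa
import numpy as np
import torch


def estimate_tempo(waveform: torch.Tensor, sample_rate: int) -> float:
    """Estimate the tempo (BPM) of a [channels, frames] waveform."""
    if waveform.dim() == 3:
        waveform = waveform.squeeze(0)
    if waveform.dim() != 2:
        raise TypeError(f"Expected waveform to be [channels, frames], got {waveform.shape}")

    # Median-aggregated onset strength is more robust to percussive outliers.
    onset_env = librosa.onset.onset_strength(
        y=waveform.numpy(), sr=sample_rate, aggregate=np.median
    )
    # sparse=False returns one tempo estimate per channel; average them.
    tempo, _ = librosa.beat.beat_track(
        onset_envelope=onset_env, sr=sample_rate, tightness=110, sparse=False, trim=True
    )
    return max(np.mean(tempo.flatten()), 1.0)  # guard against a 0 BPM estimate
```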
5 changes: 0 additions & 5 deletions src/resample.py
@@ -38,21 +38,16 @@ def __init__(
        self.new_freq = self.orig_freq * self.LOWER_CLAMP

        diff = abs(1 - change_ratio)
        print(f"Change Ratio: {change_ratio}, Diff: {diff}")
        if diff > 0.08:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 1)
        elif diff > 0.002:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 2)
        else:
            self.chunk_size_seconds = min(self.chunk_size_seconds, 4)

        print(f"Chunk Size: {self.chunk_size_seconds}")

        # If the frequencies are float, try to convert to int while
        # maintaining ratio (https://github.com/pytorch/audio/issues/1487).
        self.orig_freq, self.new_freq = ChunkResampler.reduce_ratio(orig_freq, new_freq)
        print(f"Orig Freq: {self.orig_freq}, New Freq: {self.new_freq}")

        self.device = comfy.model_management.get_torch_device()
        self.resample = Resample(self.orig_freq, self.new_freq).to(self.device)

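`ChunkResampler.reduce_ratio` is not shown in this diff. Based on the comment and the linked pytorch/audio issue, a hypothetical implementation could reduce the two frequencies to the smallest integer pair with the same ratio, since `Resample(a, b)` depends only on `a/b` and small integers keep the resampling kernel cheap:

```python
from fractions import Fraction
from typing import Tuple

@staticmethod
def reduce_ratio(orig_freq: float, new_freq: float) -> Tuple[int, int]:
    # e.g. (44100.0, 22050.0) -> (2, 1); floats are rationalized first.
    ratio = Fraction(orig_freq).limit_denominator() / Fraction(new_freq).limit_denominator()
    return ratio.numerator, ratio.denominator
```
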
2 changes: 1 addition & 1 deletion src/separation.py
@@ -107,7 +107,7 @@ def separate_sources(
    chunk_fade_shape: str = "linear",
) -> torch.Tensor:
    """
    https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
    From: https://pytorch.org/audio/stable/tutorials/hybrid_demucs_tutorial.html
    Apply model to a given mixture. Use fades and add segments together in order to apply the model segment by segment.
110 changes: 9 additions & 101 deletions src/tempo_match.py
@@ -1,9 +1,4 @@
import math
import numpy as np
import torch
import librosa

import torchaudio.functional as F
from .utils import estimate_tempo, time_shift

from typing import Tuple
from ._types import AUDIO
@@ -23,113 +18,26 @@ def INPUT_TYPES(cls):
    RETURN_TYPES = ("AUDIO", "AUDIO")
    CATEGORY = "audio"

    def time_shift(
        self,
        waveform: torch.Tensor,
        rate: float,
        fft_size: int = 2048,
        hop_size: int = None,
        win_length: int = None,
    ) -> torch.Tensor:
        """
        Args:
            waveform (torch.Tensor): Time-domain input of shape [channels, frames]
            rate (float): rate to shift the waveform by
            fft_size (int): Size of the FFT to be used (power of 2)
            hop_size (int): Hop length for overlap (e.g., fft_size // 4)
            win_length (int): Window size (often equal to fft_size)
        Returns:
            torch.Tensor: Time-domain output of same shape/type as input [channels, frames]
        """
        if hop_size is None:
            hop_size = fft_size // 4
        if win_length is None:
            win_length = fft_size

        window = torch.hann_window(
            win_length, device=waveform.device
        )  # shape: [win_length]

        with torch.no_grad():
            complex_spectrogram = torch.stft(
                waveform,
                n_fft=fft_size,
                hop_length=hop_size,
                win_length=win_length,
                window=window,
                return_complex=True,
            )  # shape: [channels, freq, time]

        if complex_spectrogram.dtype != torch.cfloat:
            raise TypeError(f"Expected complex-valued STFT for phase vocoder, got dtype {complex_spectrogram.dtype}")

        phase_advance = torch.linspace(0, math.pi * hop_size, complex_spectrogram.shape[1])[
            ..., None
        ]  # shape: [freq, 1]

        stretched_spectrogram = F.phase_vocoder(
            complex_spectrogram, rate, phase_advance
        )  # shape: [channels, freq, stretched_time]

        expected_time = math.ceil(complex_spectrogram.shape[2] / rate)
        assert (
            abs(stretched_spectrogram.shape[2] - expected_time) < 3
        ), f"Expected Time: {expected_time}, Stretched Time: {stretched_spectrogram.shape[2]}"

        # Convert back to time basis with inverse STFT
        return torch.istft(
            stretched_spectrogram,
            n_fft=fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window,
        )  # shape: [channels, frames]

    def estimate_tempo(self, waveform: torch.Tensor, sample_rate: int) -> float:
        if waveform.dim() == 3:
            waveform = waveform.squeeze(0)
        if waveform.dim() != 2:
            raise TypeError(f"Expected waveform to be [channels, frames], got {waveform.shape}")

        onset_env = librosa.onset.onset_strength(
            y=waveform.numpy(),
            sr=sample_rate,
            aggregate=np.median,
        )

        tempo, _ = librosa.beat.beat_track(
            onset_envelope=onset_env,
            sr=sample_rate,
            tightness=110,
            sparse=False,
            trim=True,
        )  # [[channel 1 tempo], [channel 2 tempo], ...], _

        mean_tempo = np.mean(tempo.flatten())
        return max(mean_tempo, 1.0)

    def main(
        self,
        audio_1: AUDIO,
        audio_2: AUDIO,
    ) -> Tuple[AUDIO, AUDIO]:
        waveform_1: torch.Tensor = audio_1["waveform"].squeeze(0)
        input_sample_rate_1: int = audio_1["sample_rate"]
        waveform_1 = audio_1["waveform"].squeeze(0)
        input_sample_rate_1 = audio_1["sample_rate"]

        waveform_2: torch.Tensor = audio_2["waveform"].squeeze(0)
        input_sample_rate_2: int = audio_2["sample_rate"]
        waveform_2 = audio_2["waveform"].squeeze(0)
        input_sample_rate_2 = audio_2["sample_rate"]

        tempo_1 = self.estimate_tempo(waveform_1, input_sample_rate_1)
        tempo_2 = self.estimate_tempo(waveform_2, input_sample_rate_2)
        tempo_1 = estimate_tempo(waveform_1, input_sample_rate_1)
        tempo_2 = estimate_tempo(waveform_2, input_sample_rate_2)
        avg_tempo = (tempo_1 + tempo_2) / 2

        rate_1 = avg_tempo / tempo_1
        rate_2 = avg_tempo / tempo_2

        waveform_1 = self.time_shift(waveform_1, rate_1)
        waveform_2 = self.time_shift(waveform_2, rate_2)
        waveform_1 = time_shift(waveform_1, rate_1)
        waveform_2 = time_shift(waveform_2, rate_2)

        return (
            {
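
As a quick sanity check of the rate arithmetic above (made-up tempos, not data from the repo): a 90 BPM track and a 110 BPM track meet at their 100 BPM average, so the phase vocoder speeds one up and slows the other down:

```python
tempo_1, tempo_2 = 90.0, 110.0       # estimated BPM of each input
avg_tempo = (tempo_1 + tempo_2) / 2  # 100.0 BPM shared target
rate_1 = avg_tempo / tempo_1         # ~1.111 -> track 1 plays ~11% faster
rate_2 = avg_tempo / tempo_2         # ~0.909 -> track 2 plays ~9% slower
```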
36 changes: 36 additions & 0 deletions src/time_shift.py
@@ -0,0 +1,36 @@
from .utils import time_shift

from typing import Tuple
from ._types import AUDIO


class TimeShift:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "audio": ("AUDIO",),
                "rate": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}),
            },
        }

    FUNCTION = "main"
    RETURN_TYPES = ("AUDIO",)
    CATEGORY = "audio"

    def main(
        self,
        audio: AUDIO,
        rate: float,
    ) -> Tuple[AUDIO]:
        waveform = audio["waveform"].squeeze(0)
        sample_rate = audio["sample_rate"]
        rate = min(max(rate, 0.1), 10.0)
        shifted = time_shift(waveform, rate)

        return (
            {
                "waveform": shifted.unsqueeze(0),
                "sample_rate": sample_rate,
            },
        )
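
`rate` follows the phase-vocoder convention seen in the code removed from `tempo_match.py`: `rate > 1.0` plays the audio faster (fewer output frames), `rate < 1.0` slower, with pitch preserved. A hypothetical smoke test outside ComfyUI (the import path depends on how the pack is installed):

```python
import torch
from src.time_shift import TimeShift  # hypothetical import path

# 2 seconds of stereo noise at 44.1 kHz in ComfyUI's AUDIO dict format.
audio = {"waveform": torch.randn(1, 2, 88200), "sample_rate": 44100}
(out,) = TimeShift().main(audio, rate=2.0)
print(out["waveform"].shape)  # ~[1, 2, 44100]: twice as fast, half as long
```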
