v0.2.0
jstzwj committed Nov 21, 2022
1 parent 10ebd5d · commit 02df286
Showing 5 changed files with 90 additions and 17 deletions.
README.md (48 changes: 43 additions & 5 deletions)
````diff
@@ -5,17 +5,55 @@ An 48kHz implementation of HiFi-GAN for Voice Conversion.
 # Example
 
 ```Python
-import torch, torchaudio
-from hifigan.mel_processing import mel_spectrogram_torch
-hifigan = torch.hub.load("vtuber-plan/hifi-gan:main", "hifigan_48k")
+import torch
+import os
+import torchaudio
+import torchaudio.transforms as T
+
+class AudioPipeline(torch.nn.Module):
+    def __init__(
+        self,
+        freq=16000,
+        n_fft=1024,
+        n_mel=128,
+        win_length=1024,
+        hop_length=256,
+    ):
+        super().__init__()
+        self.freq=freq
+        pad = int((n_fft-hop_length)/2)
+        self.spec = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length,
+                                  pad=pad, power=None,center=False, pad_mode='reflect', normalized=False, onesided=True)
+
+        self.mel_scale = T.MelScale(n_mels=n_mel, sample_rate=freq, n_stft=n_fft // 2 + 1)
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        shift_waveform = waveform
+        # Convert to power spectrogram
+        spec = self.spec(shift_waveform)
+        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
+        # Convert to mel-scale
+        mel = self.mel_scale(spec)
+        return mel
+
+device = "cpu"
+
+hifigan = torch.hub.load("vtuber-plan/hifi-gan:v0.2.0", "hifigan_48k", force_reload=True).to(device)
+
 # Load audio
 wav, sr = torchaudio.load("test.wav")
 assert sr == 48000
 
-mel = mel_spectrogram_torch(wav, 2048, 256, 48000, 512, 2048, 0, None, False)
-mel = mel.cuda()
+audio_pipeline = AudioPipeline(freq=48000,
+                               n_fft=2048,
+                               n_mel=128,
+                               win_length=2048,
+                               hop_length=512)
+mel = audio_pipeline(wav)
 out = hifigan(mel)
+
 wav_out = out.squeeze(0).cpu()
+
 torchaudio.save("test_out.wav", wav_out, sr)
 ```
````
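The updated example builds the mel spectrogram with plain torchaudio transforms instead of `hifigan.mel_processing.mel_spectrogram_torch`, and it keeps everything on the CPU. For readers who want to run it on a GPU, here is a minimal sketch that inlines the same transform chain as `AudioPipeline` and keeps the mel on the same device as the generator. The `.eval()` call, the `torch.no_grad()` block, and the CUDA device selection are illustrative additions, not part of the repository's README; the only input assumption is a 48 kHz file named `test.wav` in the working directory.

```Python
import torch
import torchaudio
import torchaudio.transforms as T

device = "cuda" if torch.cuda.is_available() else "cpu"

# Released 48 kHz generator, pinned to the v0.2.0 tag as in the README above.
hifigan = torch.hub.load("vtuber-plan/hifi-gan:v0.2.0", "hifigan_48k").to(device)
hifigan.eval()

wav, sr = torchaudio.load("test.wav")
assert sr == 48000

# Same transform chain as the README's AudioPipeline, with the 48 kHz settings.
spec_fn = T.Spectrogram(n_fft=2048, win_length=2048, hop_length=512,
                        pad=(2048 - 512) // 2, power=None, center=False,
                        pad_mode="reflect", normalized=False, onesided=True)
mel_fn = T.MelScale(n_mels=128, sample_rate=48000, n_stft=2048 // 2 + 1)

spec = spec_fn(wav)
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
mel = mel_fn(spec).to(device)  # keep the mel on the same device as the model

with torch.no_grad():
    out = hifigan(mel)

torchaudio.save("test_out.wav", out.squeeze(0).cpu(), sr)
```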
hifigan/hub/__init__.py (2 changes: 1 addition & 1 deletion)
```diff
@@ -1,5 +1,5 @@
 CKPT_URLS = {
-    "hifigan-48k": "https://github.com/vtuber-plan/hifi-gan/releases/download/v0.1.3/hifigan-48k-59CB718B329ED0167F3BBD9DDC47F443.pt",
+    "hifigan-48k": "https://github.com/vtuber-plan/hifi-gan/releases/download/v0.2.0/hifigan-48k-C8FDBD55FE7700384955A6EC41AF1D84.pt",
 }
 import torch
 from ..model.generators.generator import Generator
```
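The single change here points `CKPT_URLS` at the v0.2.0 release asset, whose filename ends in a 32-character hex digest. The small sketch below checks a downloaded checkpoint against that digest; it assumes the digest is an MD5 of the file contents, which the repository does not state, so treat it as a plausible integrity check rather than documented behaviour.

```Python
import hashlib
import re
from pathlib import Path

def checksum_matches_filename(path: str) -> bool:
    """Compare a checkpoint's MD5 with the hex digest embedded in its filename.

    Assumption: the 32-character suffix in names like
    hifigan-48k-C8FDBD55FE7700384955A6EC41AF1D84.pt is an MD5 digest.
    """
    match = re.search(r"([0-9A-Fa-f]{32})\.pt$", Path(path).name)
    if match is None:
        raise ValueError("no 32-character hex digest in the filename")
    digest = hashlib.md5(Path(path).read_bytes()).hexdigest()
    return digest.lower() == match.group(1).lower()

# Example (local copy of the release asset):
# checksum_matches_filename("hifigan-48k-C8FDBD55FE7700384955A6EC41AF1D84.pt")
```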
hifigan/model/hifigan.py (2 changes: 1 addition & 1 deletion)
```diff
@@ -49,7 +49,7 @@ def __init__(self, **kwargs):
             n_mel=self.hparams.data.n_mel_channels,
             win_length=self.hparams.data.win_length,
             hop_length=self.hparams.data.hop_length,
-            device=self.device)
+            aug=True)
         for param in self.audio_pipeline.parameters():
             param.requires_grad = False

```
hifigan/model/pipeline.py (7 changes: 4 additions & 3 deletions)
```diff
@@ -18,12 +18,12 @@ def __init__(
         n_mel=128,
         win_length=1024,
         hop_length=256,
-        device="cpu",
+        aug=False
     ):
         super().__init__()

         self.freq=freq
-        self.device=device
+        self.aug=aug

         pad = int((n_fft-hop_length)/2)
         self.spec = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length,
@@ -43,7 +43,8 @@ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
         spec = self.spec(shift_waveform)
         spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
         # Apply SpecAugment
-        spec = self.spec_aug(spec)
+        if self.aug:
+            spec = self.spec_aug(spec)
         # Convert to mel-scale
         mel = self.mel_scale(spec)
         return mel
```
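Together with the hifigan.py change above, the pipeline now takes an `aug` flag instead of a `device` argument: SpecAugment runs only when `aug=True`, which is what the training model passes, while the default `False` leaves the mel untouched for inference. A minimal sketch of both modes follows; the class name `AudioPipeline` and the import path are assumptions inferred from how hifigan.py and test.py use this module, and the 48 kHz settings are the ones used elsewhere in this commit. Note that callers still passing `device=` will now fail at construction time, since the parameter was removed rather than deprecated.

```Python
import torch

# Assumed class name and import path; check hifigan/model/pipeline.py for the real ones.
from hifigan.model.pipeline import AudioPipeline

wav = torch.randn(1, 48000)  # one second of dummy 48 kHz audio

# Training-style pipeline: SpecAugment is applied inside forward().
train_pipe = AudioPipeline(freq=48000, n_fft=2048, n_mel=128,
                           win_length=2048, hop_length=512, aug=True)

# Inference-style pipeline: aug defaults to False, so the branch is skipped.
eval_pipe = AudioPipeline(freq=48000, n_fft=2048, n_mel=128,
                          win_length=2048, hop_length=512)

print(train_pipe(wav).shape, eval_pipe(wav).shape)
```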
test.py (48 changes: 41 additions & 7 deletions)
```diff
@@ -1,10 +1,40 @@
-import torch, torchaudio
+import torch
 import os
 import glob
-from hifigan.mel_processing import mel_spectrogram_torch, spectrogram_torch_audio
-from hifigan.model.hifigan import HifiGAN
+
 
+import torchaudio
+import torchaudio.transforms as T
+
+class AudioPipeline(torch.nn.Module):
+    def __init__(
+        self,
+        freq=16000,
+        n_fft=1024,
+        n_mel=128,
+        win_length=1024,
+        hop_length=256,
+    ):
+        super().__init__()
+        self.freq=freq
+        pad = int((n_fft-hop_length)/2)
+        self.spec = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length,
+                                  pad=pad, power=None,center=False, pad_mode='reflect', normalized=False, onesided=True)
+
+        self.mel_scale = T.MelScale(n_mels=n_mel, sample_rate=freq, n_stft=n_fft // 2 + 1)
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        shift_waveform = waveform
+        # Convert to power spectrogram
+        spec = self.spec(shift_waveform)
+        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
+        # Convert to mel-scale
+        mel = self.mel_scale(spec)
+        return mel
+
 def load_local():
+    from hifigan.model.hifigan import HifiGAN
+
     ckpt_path = None
     if os.path.exists("logs/lightning_logs"):
         versions = glob.glob("logs/lightning_logs/version_*")
@@ -21,7 +51,7 @@ def load_local():
     return model.net_g
 
 def load_remote():
-    return torch.hub.load("vtuber-plan/hifi-gan:main", "hifigan_48k", force_reload=True)
+    return torch.hub.load("vtuber-plan/hifi-gan:v0.2.0", "hifigan_48k", force_reload=True)
 
 device = "cpu"
 
@@ -35,9 +65,13 @@ def load_remote():
 wav, sr = torchaudio.load("zszy_48k.wav")
 assert sr == 48000
 
-mel = mel_spectrogram_torch(wav, 2048, 128, 48000, 512, 2048, 0, None, False)
-
-mel = mel.to(device)
+# mel = mel_spectrogram_torch(wav, 2048, 128, 48000, 512, 2048, 0, None, False)
+audio_pipeline = AudioPipeline(freq=48000,
+                               n_fft=2048,
+                               n_mel=128,
+                               win_length=2048,
+                               hop_length=512)
+mel = audio_pipeline(wav)
 out = hifigan(mel)
 
 wav_out = out.squeeze(0).cpu()
```
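`load_local()` scans `logs/lightning_logs/version_*`, but the lines that pick a specific checkpoint are not shown in this diff. As an illustration only, not the repository's actual logic, selecting the newest PyTorch Lightning checkpoint typically looks like the sketch below, assuming the default `version_N/checkpoints/*.ckpt` layout.

```Python
import glob
import os

# Illustrative sketch only: pick the newest Lightning checkpoint, if any exists.
ckpt_path = None
versions = sorted(glob.glob("logs/lightning_logs/version_*"),
                  key=lambda p: int(p.rsplit("_", 1)[-1]))
if versions:
    ckpts = glob.glob(os.path.join(versions[-1], "checkpoints", "*.ckpt"))
    if ckpts:
        ckpt_path = max(ckpts, key=os.path.getmtime)
print(ckpt_path)
```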
