diff --git a/PaddleAudio/examples/speaker_recognition/README.md b/PaddleAudio/examples/speaker_recognition/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PaddleAudio/examples/speaker_recognition/augment.py b/PaddleAudio/examples/speaker_recognition/augment.py new file mode 100644 index 0000000000..c39d117b28 --- /dev/null +++ b/PaddleAudio/examples/speaker_recognition/augment.py @@ -0,0 +1,849 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddleaudio.backends import load as load_audio +from paddleaudio.utils import download_and_decompress +from signal_processing import (compute_amplitude, convolve1d, dB_to_amplitude, + notch_filter, reverberate) + +# TODO: Complete type-hint and doc string. + + +class DropFreq(nn.Layer): + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, + ): + super(DropFreq, self).__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1]) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand([drop_count]) * drop_range + self.drop_freq_low) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter(frequency, filter_length, + self.drop_width) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + + +class DropChunk(nn.Layer): + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, + ): + super(DropChunk, self).__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + 
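+        # A noise_factor of 0 silences dropped chunks; larger values fill them
+        # with uniform noise scaled by the clean waveform's average amplitude.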
self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + # Reading input list + lengths = (lengths * waveforms.shape[1]).astype('int64') + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=[batch_size], + ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 1, + shape=[drop_times[i]], + ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, + high=start_max + 1, + shape=[drop_times[i]], + ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j]:end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand([length[j]], dtype='float32') + + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec + + return dropped_waveform + + +class Resample(nn.Layer): + def __init__( + self, + orig_freq=16000, + new_freq=16000, + lowpass_filter_width=6, + ): + super(Resample, self).__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + + unsqueezed = 
False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros((batch_size, num_channels, + tot_output_samp)) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[:, :, first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, [left_padding, right_padding], data_format='NCL') + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + # weight=self.weights[i].repeat(num_channels, 1, 1), + weight=self.weights[i].expand((num_channels, 1, -1)), + stride=self.conv_stride, + groups=num_channels, + ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, [left_padding, right_padding], + data_format='NCL') + dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. + # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. 
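+        # e.g. 16 kHz -> 8 kHz with 100 input samples: tick_freq = 16000,
+        # ticks_per_output_period = 2, interval_length = 100, so
+        # last_output_samp = 49 and 50 output samples are produced.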
+ if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. + num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange(start=0.0, end=self.output_samples) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width, dtype='float32') + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + inside_window_indices = delta_t.abs().less_than( + paddle.to_tensor(window_width)) + + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * (1 + paddle.cos( + 2 * math.pi * lowpass_cutoff / self.lowpass_filter_width * + delta_t.masked_select(inside_window_indices))) + + t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t)) + t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t)) + + # sinc filter function + weights = paddle.where( + t_not_eq_zero_indices, + weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) / + (math.pi * delta_t), weights) + + # limit of the function at t = 0 + weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff, + weights) + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class SpeedPerturb(nn.Layer): + def __init__( + self, + orig_freq, + speeds=[90, 100, 110], + perturb_prob=1.0, + ): + super(SpeedPerturb, self).__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + return waveform.clone() + + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item() + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + + +class AddNoise(nn.Layer): + def __init__( + self, + noise_dataset=None, # None for white noise + num_workers=0, + snr_low=0, + snr_high=0, + mix_prob=1.0, + start_index=None, + normalize=False, + ): + super(AddNoise, self).__init__() + + self.num_workers = num_workers + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + self.start_index = start_index + self.normalize = normalize + self.noise_dataset = noise_dataset + self.noise_dataloader = None + + def forward(self, 
waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Copy clean waveform to initialize noisy waveform + noisy_waveform = waveforms.clone() + lengths = (lengths * waveforms.shape[1]).astype('int64').unsqueeze(1) + + # Don't add noise (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return noisy_waveform + + # Compute the average amplitude of the clean waveforms + clean_amplitude = compute_amplitude(waveforms, lengths) + + # Pick an SNR and use it to compute the mixture amplitude factors + SNR = paddle.rand((len(waveforms), 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + noisy_waveform *= 1 - noise_amplitude_factor + + # Loop through clean samples and create mixture + if self.noise_dataset is None: + white_noise = paddle.normal(shape=waveforms.shape) + noisy_waveform += new_noise_amplitude * white_noise + else: + tensor_length = waveforms.shape[1] + noise_waveform, noise_length = self._load_noise( + lengths, + tensor_length, + ) + + # Rescale and add + noise_amplitude = compute_amplitude(noise_waveform, noise_length) + noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14) + noisy_waveform += noise_waveform + + # Normalizing to prevent clipping + if self.normalize: + abs_max, _ = paddle.max( + paddle.abs(noisy_waveform), axis=1, keepdim=True) + noisy_waveform = noisy_waveform / abs_max.clip(min=1.0) + + return noisy_waveform + + def _load_noise(self, lengths, max_length): + """ + Load a batch of noises + + args + lengths(Paddle.Tensor): Num samples of waveforms with shape (N, 1). + max_length(int): Width of a batch. + """ + lengths = lengths.squeeze(1) + batch_size = len(lengths) + + # Load a noise batch + if self.noise_dataloader is None: + + def noise_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, max(max_length, + lengths.max().item())), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + # Create noise data loader. 
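+            # Built lazily on the first call so the loader's batch size can
+            # match the incoming waveform batch.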
+ self.noise_dataloader = paddle.io.DataLoader( + self.noise_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=noise_collate_fn, + return_list=True, + ) + self.noise_data = iter(self.noise_dataloader) + + noise_batch, noise_len = self._load_noise_batch_of_size(batch_size) + + # Select a random starting location in the waveform + start_index = self.start_index + if self.start_index is None: + start_index = 0 + max_chop = (noise_len - lengths).min().clip(min=1) + start_index = paddle.randint(high=max_chop, shape=[1]) + + # Truncate noise_batch to max_length + noise_batch = noise_batch[:, start_index:start_index + max_length] + noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1) + return noise_batch, noise_len + + def _load_noise_batch_of_size(self, batch_size): + """Concatenate noise batches, then chop to correct size""" + noise_batch, noise_lens = self._load_noise_batch() + + # Expand + while len(noise_batch) < batch_size: + noise_batch = paddle.concat((noise_batch, noise_batch)) + noise_lens = paddle.concat((noise_lens, noise_lens)) + + # Contract + if len(noise_batch) > batch_size: + noise_batch = noise_batch[:batch_size] + noise_lens = noise_lens[:batch_size] + + return noise_batch, noise_lens + + def _load_noise_batch(self): + """Load a batch of noises, restarting iteration if necessary.""" + try: + batch = next(self.noise_data) + except StopIteration: + self.noise_data = iter(self.noise_dataloader) + batch = next(self.noise_data) + + noises, lens = batch['feats'], batch['lengths'] + return noises, lens + + +class AddReverb(nn.Layer): + def __init__( + self, + rir_dataset, + reverb_prob=1.0, + rir_scale_factor=1.0, + num_workers=0, + ): + super(AddReverb, self).__init__() + self.rir_dataset = rir_dataset + self.reverb_prob = reverb_prob + self.rir_scale_factor = rir_scale_factor + + # Create rir data loader. + def rir_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, + lengths.max().item()), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + self.rir_dataloader = paddle.io.DataLoader( + self.rir_dataset, + collate_fn=rir_collate_fn, + num_workers=num_workers, + shuffle=True, + return_list=True, + ) + + self.rir_data = iter(self.rir_dataloader) + + def forward(self, waveforms, lengths=None): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. 
+ """ + + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Don't add reverb (return early) 1-`reverb_prob` portion of the time + if paddle.rand([1]) > self.reverb_prob: + return waveforms.clone() + + # Add channels dimension if necessary + channel_added = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + channel_added = True + + # Load and prepare RIR + rir_waveform = self._load_rir() + + # Compress or dilate RIR + if self.rir_scale_factor != 1: + rir_waveform = F.interpolate( + rir_waveform.transpose([0, 2, 1]), + scale_factor=self.rir_scale_factor, + mode="linear", + align_corners=False, + data_format='NCW', + ) + # (N, C, L) -> (N, L, C) + rir_waveform = rir_waveform.transpose([0, 2, 1]) + + rev_waveform = reverberate( + waveforms, + rir_waveform, + self.rir_dataset.sample_rate, + rescale_amp="avg") + + # Remove channels dimension if added + if channel_added: + return rev_waveform.squeeze(-1) + + return rev_waveform + + def _load_rir(self): + try: + batch = next(self.rir_data) + except StopIteration: + self.rir_data = iter(self.rir_dataloader) + batch = next(self.rir_data) + + rir_waveform = batch['feats'] + + # Make sure RIR has correct channels + if len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + return rir_waveform + + +class AddBabble(nn.Layer): + def __init__( + self, + speaker_count=3, + snr_low=0, + snr_high=0, + mix_prob=1, + ): + super(AddBabble, self).__init__() + self.speaker_count = speaker_count + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + babbled_waveform = waveforms.clone() + lengths = (lengths * waveforms.shape[1]).unsqueeze(1) + batch_size = len(waveforms) + + # Don't mix (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return babbled_waveform + + # Pick an SNR and use it to compute the mixture amplitude factors + clean_amplitude = compute_amplitude(waveforms, lengths) + SNR = paddle.rand((batch_size, 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + babbled_waveform *= 1 - noise_amplitude_factor + + # For each speaker in the mixture, roll and add + babble_waveform = waveforms.roll((1, ), axis=0) + babble_len = lengths.roll((1, ), axis=0) + for i in range(1, self.speaker_count): + babble_waveform += waveforms.roll((1 + i, ), axis=0) + babble_len = paddle.concat( + [babble_len, babble_len.roll((1, ), axis=0)], axis=-1).max( + axis=-1, keepdim=True) + + # Rescale and add to mixture + babble_amplitude = compute_amplitude(babble_waveform, babble_len) + babble_waveform *= new_noise_amplitude / (babble_amplitude + 1e-14) + babbled_waveform += babble_waveform + + return babbled_waveform + + +class TimeDomainSpecAugment(nn.Layer): + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, + ): + super(TimeDomainSpecAugment, self).__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, + orig_freq=sample_rate, + speeds=speeds, + ) + self.drop_freq = 
DropFreq( + drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, + ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, + ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + with paddle.no_grad(): + # Augmentation + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return waveforms + + +class EnvCorrupt(nn.Layer): + def __init__( + self, + reverb_prob=1.0, + babble_prob=1.0, + noise_prob=1.0, + rir_dataset=None, + noise_dataset=None, + num_workers=0, + babble_speaker_count=0, + babble_snr_low=0, + babble_snr_high=0, + noise_snr_low=0, + noise_snr_high=0, + rir_scale_factor=1.0, + ): + super(EnvCorrupt, self).__init__() + + # Initialize corrupters + if rir_dataset is not None and reverb_prob > 0.0: + self.add_reverb = AddReverb( + rir_dataset=rir_dataset, + num_workers=num_workers, + reverb_prob=reverb_prob, + rir_scale_factor=rir_scale_factor, + ) + + if babble_speaker_count > 0 and babble_prob > 0.0: + self.add_babble = AddBabble( + speaker_count=babble_speaker_count, + snr_low=babble_snr_low, + snr_high=babble_snr_high, + mix_prob=babble_prob, + ) + + if noise_dataset is not None and noise_prob > 0.0: + self.add_noise = AddNoise( + noise_dataset=noise_dataset, + num_workers=num_workers, + snr_low=noise_snr_low, + snr_high=noise_snr_high, + mix_prob=noise_prob, + ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Augmentation + with paddle.no_grad(): + if hasattr(self, "add_reverb"): + try: + waveforms = self.add_reverb(waveforms, lengths) + except Exception: + pass + if hasattr(self, "add_babble"): + waveforms = self.add_babble(waveforms, lengths) + if hasattr(self, "add_noise"): + waveforms = self.add_noise(waveforms, lengths) + + return waveforms diff --git a/PaddleAudio/examples/speaker_recognition/loss.py b/PaddleAudio/examples/speaker_recognition/loss.py new file mode 100644 index 0000000000..29af1b24d7 --- /dev/null +++ b/PaddleAudio/examples/speaker_recognition/loss.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
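+
+# Margin-based softmax losses for speaker classification. The model emits
+# cosine-similarity logits (see SpeakerClassifier), AdditiveAngularMargin
+# sharpens them to s * cos(theta + m) for the target class (as in ArcFace),
+# and LogSoftmaxWrapper turns the result into a cross-entropy-style loss.
+#
+# Typical use (mirrors train.py):
+#   criterion = LogSoftmaxWrapper(loss_fn=AdditiveAngularMargin(margin=0.2, scale=30))
+#   loss = criterion(logits, labels)  # logits: (N, num_class), labels: (N,)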
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class AngularMargin(nn.Layer):
+    def __init__(self, margin=0.0, scale=1.0):
+        super(AngularMargin, self).__init__()
+        self.margin = margin
+        self.scale = scale
+
+    def forward(self, outputs, targets):
+        outputs = outputs - self.margin * targets
+        return self.scale * outputs
+
+
+class AdditiveAngularMargin(AngularMargin):
+    def __init__(self, margin=0.0, scale=1.0, easy_margin=False):
+        super(AdditiveAngularMargin, self).__init__(margin, scale)
+        self.easy_margin = easy_margin
+
+        self.cos_m = math.cos(self.margin)
+        self.sin_m = math.sin(self.margin)
+        self.th = math.cos(math.pi - self.margin)
+        self.mm = math.sin(math.pi - self.margin) * self.margin
+
+    def forward(self, outputs, targets):
+        cosine = outputs.astype('float32')
+        sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2))
+        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
+        if self.easy_margin:
+            phi = paddle.where(cosine > 0, phi, cosine)
+        else:
+            phi = paddle.where(cosine > self.th, phi, cosine - self.mm)
+        outputs = (targets * phi) + ((1.0 - targets) * cosine)
+        return self.scale * outputs
+
+
+class LogSoftmaxWrapper(nn.Layer):
+    def __init__(self, loss_fn):
+        super(LogSoftmaxWrapper, self).__init__()
+        self.loss_fn = loss_fn
+        self.criterion = paddle.nn.KLDivLoss(reduction="sum")
+
+    def forward(self, outputs, targets, length=None):
+        targets = F.one_hot(targets, outputs.shape[1])
+        try:
+            predictions = self.loss_fn(outputs, targets)
+        except TypeError:
+            predictions = self.loss_fn(outputs)
+
+        predictions = F.log_softmax(predictions, axis=1)
+        loss = self.criterion(predictions, targets) / targets.sum()
+        return loss
diff --git a/PaddleAudio/examples/speaker_recognition/metrics.py b/PaddleAudio/examples/speaker_recognition/metrics.py
new file mode 100644
index 0000000000..cb58eb5e0a
--- /dev/null
+++ b/PaddleAudio/examples/speaker_recognition/metrics.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import numpy as np
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> Tuple[float, float]:
+    '''
+    Compute the equal error rate (EER) and the score threshold at which it occurs.
+    '''
+    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+    fnr = 1 - tpr
+    eer_threshold = threshold[np.nanargmin(np.absolute(fnr - fpr))]
+    eer = fpr[np.nanargmin(np.absolute(fnr - fpr))]
+    return eer, eer_threshold
diff --git a/PaddleAudio/examples/speaker_recognition/model.py b/PaddleAudio/examples/speaker_recognition/model.py
new file mode 100644
index 0000000000..ef129b1760
--- /dev/null
+++ b/PaddleAudio/examples/speaker_recognition/model.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class SpeakerClassifier(nn.Layer):
+    def __init__(
+            self,
+            backbone,
+            num_class,
+            lin_blocks=0,
+            lin_neurons=192,
+            dropout=0.1,
+    ):
+
+        super(SpeakerClassifier, self).__init__()
+        self.backbone = backbone
+        self.dropout = nn.Dropout(dropout)
+
+        input_size = self.backbone.emb_size
+        self.blocks = nn.LayerList()
+        for i in range(lin_blocks):
+            self.blocks.extend([
+                nn.BatchNorm1D(input_size),
+                nn.Linear(in_features=input_size, out_features=lin_neurons),
+            ])
+            input_size = lin_neurons
+
+        self.weight = paddle.create_parameter(
+            shape=(input_size, num_class),
+            dtype='float32',
+            attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()),
+        )
+
+    def forward(self, x, lengths=None):
+        # x.shape: (N, C, L)
+        x = self.backbone(x, lengths).squeeze(
+            -1)  # (N, emb_size, 1) -> (N, emb_size)
+        x = self.dropout(x)
+
+        for fc in self.blocks:
+            x = fc(x)
+
+        # KP: Normalize both W and x so the logits are cosine similarities, which Additive Angular Margin uses to compute the loss.
+        logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0))
+
+        return logits
diff --git a/PaddleAudio/examples/speaker_recognition/signal_processing.py b/PaddleAudio/examples/speaker_recognition/signal_processing.py
new file mode 100644
index 0000000000..6dd31f0fa8
--- /dev/null
+++ b/PaddleAudio/examples/speaker_recognition/signal_processing.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle
+
+# TODO: Complete type-hint and doc string.
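+
+# The helpers below implement the waveform-domain DSP used by augment.py:
+# blackman_window tapers the sinc kernels built in notch_filter, convolve1d
+# is the shared (batch, time, channels) convolution wrapper, and reverberate
+# applies a room impulse response while preserving alignment and amplitude.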
+ + +def blackman_window(win_len, dtype=np.float32): + arcs = np.pi * np.arange(win_len) / float(win_len) + win = np.asarray( + [0.42 - 0.5 * np.cos(2 * arc) + 0.08 * np.cos(4 * arc) for arc in arcs], + dtype=dtype) + return paddle.to_tensor(win) + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True) + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) + else: + raise NotImplementedError + + +def dB_to_amplitude(SNR): + return 10**(SNR / 20) + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, +): + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, list): + waveform = paddle.nn.functional.pad( + x=waveform, + pad=padding, + mode=pad_type, + data_format='NLC', + ) + + # Move time dimension last, which pad and fft and conv expect. + # (N, L, C) -> (N, C, L) + waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, list) else 0, + ) + + # Return time dimension to the second dimension. + return convolved.transpose([0, 2, 1]) + + +def notch_filter(notch_freq, filter_width=101, notch_width=0.05): + # Check inputs + assert 0 < notch_freq <= 1 + assert filter_width % 2 != 0 + pad = filter_width // 2 + inputs = paddle.arange(filter_width, dtype='float32') - pad + + # Avoid frequencies that are too low + notch_freq += notch_width + + # Define sinc function, avoiding division by zero + def sinc(x): + def _sinc(x): + return paddle.sin(x) / x + + # The zero is at the middle index + res = paddle.concat( + [_sinc(x[:pad]), + paddle.ones([1]), + _sinc(x[pad + 1:])]) + return res + + # Compute a low-pass filter with cutoff frequency notch_freq. + hlpf = sinc(3 * (notch_freq - notch_width) * inputs) + hlpf *= blackman_window(filter_width) + hlpf /= paddle.sum(hlpf) + + # Compute a high-pass filter with cutoff frequency notch_freq. + hhpf = sinc(3 * (notch_freq + notch_width) * inputs) + hhpf *= blackman_window(filter_width) + hhpf /= -paddle.sum(hhpf) + hhpf[pad] += 1 + + # Adding filters creates notch filter + return (hlpf + hhpf).reshape([1, -1, 1]) + + +def reverberate(waveforms, + rir_waveform, + sample_rate, + impulse_duration=0.3, + rescale_amp="avg"): + orig_shape = waveforms.shape + + if len(waveforms.shape) > 3 or len(rir_waveform.shape) > 3: + raise NotImplementedError + + # if inputs are mono tensors we reshape to 1, samples + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0).unsqueeze(-1) + elif len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + + if len(rir_waveform.shape) == 1: # convolve1d expects a 3d tensor ! 
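+        # (L,) -> (1, L, 1)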
+ rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1) + elif len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + # Compute the average amplitude of the clean + orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1], + rescale_amp) + + # Compute index of the direct signal, so we can preserve alignment + impulse_index_start = rir_waveform.abs().argmax(axis=1).item() + impulse_index_end = min( + impulse_index_start + int(sample_rate * impulse_duration), + rir_waveform.shape[1]) + rir_waveform = rir_waveform[:, impulse_index_start:impulse_index_end, :] + rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2) + rir_waveform = paddle.flip(rir_waveform, [1]) + + waveforms = convolve1d( + waveform=waveforms, + kernel=rir_waveform, + padding=[rir_waveform.shape[1] - 1, 0], + ) + + # Rescale to the peak amplitude of the clean waveform + waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude, + rescale_amp) + + if len(orig_shape) == 1: + waveforms = waveforms.squeeze(0).squeeze(-1) + if len(orig_shape) == 2: + waveforms = waveforms.squeeze(-1) + + return waveforms + + +def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"): + assert amp_type in ["peak", "avg"] + assert scale in ["linear", "dB"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + waveforms = normalize(waveforms, lengths, amp_type) + + if scale == "linear": + out = target_lvl * waveforms + elif scale == "dB": + out = dB_to_amplitude(target_lvl) * waveforms + + else: + raise NotImplementedError("Invalid scale, choose between dB and linear") + + if batch_added: + out = out.squeeze(0) + + return out + + +def normalize(waveforms, lengths=None, amp_type="avg", eps=1e-14): + assert amp_type in ["avg", "peak"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + den = compute_amplitude(waveforms, lengths, amp_type) + eps + if batch_added: + waveforms = waveforms.squeeze(0) + return waveforms / den diff --git a/PaddleAudio/examples/speaker_recognition/speaker_verification.py b/PaddleAudio/examples/speaker_recognition/speaker_verification.py new file mode 100644 index 0000000000..fafcd68a06 --- /dev/null +++ b/PaddleAudio/examples/speaker_recognition/speaker_verification.py @@ -0,0 +1,263 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
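+
+# Verification recipe: extract speaker embeddings for the VoxCeleb1 enrol and
+# test utterances with a trained backbone, optionally normalize them with
+# running global statistics, score each trial pair with cosine similarity,
+# optionally apply a symmetric top-k cohort score normalization against
+# train-set embeddings, and report the equal error rate (EER).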
+
+import argparse
+import ast
+import os
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from metrics import compute_eer
+from model import SpeakerClassifier
+from paddleaudio.datasets import VoxCeleb1
+from paddleaudio.models.ecapa_tdnn import ECAPA_TDNN
+from paddleaudio.transforms import LogMelSpectrogram
+from paddleaudio.utils import Timer, get_logger
+from tqdm import tqdm
+
+logger = get_logger()
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run the model, defaults to gpu.")
+parser.add_argument("--batch_size", type=int, default=16, help="Number of examples per batch.")
+parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
+parser.add_argument("--load_checkpoint", type=str, default='', help="Directory to load the model checkpoint from.")
+parser.add_argument("--global_embedding_norm", type=ast.literal_eval, default=True, help="Apply global normalization on speaker embeddings.")
+parser.add_argument("--embedding_mean_norm", type=ast.literal_eval, default=True, help="Apply mean normalization on speaker embeddings.")
+parser.add_argument("--embedding_std_norm", type=ast.literal_eval, default=False, help="Apply std normalization on speaker embeddings.")
+parser.add_argument("--score_norm", type=ast.literal_eval, default=True, help="Apply score normalization.")
+parser.add_argument("--norm_size", type=int, default=400000, help="Number of samples in train data used for score normalization.")
+parser.add_argument("--norm_top_k", type=int, default=20000, help="Top k scores for score normalization.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def pad_right(x, target_length, mode='constant', **kwargs):
+    x = np.asarray(x)
+    w = target_length - len(x)
+    assert w >= 0, f'Target length {target_length} is less than the original length {len(x)}'
+
+    pad_width = [0, w]
+    return np.pad(x, pad_width, mode=mode, **kwargs)
+
+
+def waveform_collate_fn(batch):
+    ids = [item['id'] for item in batch]
+    lengths = np.asarray([item['feat'].shape[0] for item in batch])
+    waveforms = list(
+        map(lambda x: pad_right(x, lengths.max()),
+            [item['feat'] for item in batch]))
+    waveforms = np.stack(waveforms)
+
+    # Convert lengths into ratios of the longest waveform.
+    lengths = (lengths / lengths.max()).astype(np.float32)
+
+    return {'ids': ids, 'waveforms': waveforms, 'lengths': lengths}
+
+
+def feature_normalize(feats: paddle.Tensor,
+                      lengths: paddle.Tensor,
+                      mean_norm: bool = True,
+                      std_norm: bool = True):
+
+    # Feature normalization, if needed
+    lengths = (lengths * feats.shape[-1]).astype('int64')
+    for i in range(len(feats)):
+        feat = feats[i, :, :lengths[i].item()]  # Excluding pad values.
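+        # Normalize each utterance over the time axis, using only its valid frames.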
+        mean = feat.mean(axis=-1, keepdim=True) if mean_norm else 0
+        std = feat.std(axis=-1, keepdim=True) if std_norm else 1
+        feats[i, :, :lengths[i].item()] = (feat - mean) / std
+
+    return feats
+
+
+if __name__ == "__main__":
+    paddle.set_device(args.device)
+
+    feature_extractor = LogMelSpectrogram(
+        sr=16000, n_fft=400, hop_length=160, n_mels=80, f_min=50)
+
+    model_conf = {
+        "input_size": 80,
+        "channels": [1024, 1024, 1024, 1024, 3072],
+        "kernel_sizes": [5, 3, 3, 3, 1],
+        "dilations": [1, 2, 3, 4, 1],
+        "attention_channels": 128,
+        "lin_neurons": 192,
+    }
+    ecapa_tdnn = ECAPA_TDNN(**model_conf)
+    model = SpeakerClassifier(
+        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)
+
+    args.load_checkpoint = os.path.abspath(
+        os.path.expanduser(args.load_checkpoint))
+
+    # Load the model checkpoint.
+    state_dict = paddle.load(
+        os.path.join(args.load_checkpoint, 'model.pdparams'))
+    model.set_state_dict(state_dict)
+    logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+    enrol_ds = VoxCeleb1(subset='enrol', random_chunk=False)
+    enrol_sampler = paddle.io.BatchSampler(
+        enrol_ds, batch_size=args.batch_size,
+        shuffle=True)  # Shuffle to make embedding normalization more robust.
+    enrol_loader = paddle.io.DataLoader(
+        enrol_ds,
+        batch_sampler=enrol_sampler,
+        collate_fn=waveform_collate_fn,
+        num_workers=args.num_workers,
+        return_list=True,
+    )
+
+    test_ds = VoxCeleb1(subset='test', random_chunk=False)
+    test_sampler = paddle.io.BatchSampler(
+        test_ds, batch_size=args.batch_size, shuffle=True)
+    test_loader = paddle.io.DataLoader(
+        test_ds,
+        batch_sampler=test_sampler,
+        collate_fn=waveform_collate_fn,
+        num_workers=args.num_workers,
+        return_list=True,
+    )
+
+    if args.score_norm:
+        norm_ds = VoxCeleb1(subset='train', random_chunk=False)
+        norm_sampler = paddle.io.BatchSampler(
+            norm_ds, batch_size=args.batch_size, shuffle=True)
+        norm_loader = paddle.io.DataLoader(
+            norm_ds,
+            batch_sampler=norm_sampler,
+            collate_fn=waveform_collate_fn,
+            num_workers=args.num_workers,
+            return_list=True,
+        )
+
+    # Compute embeddings for the audio in the enrol and test datasets.
+    model.eval()
+
+    if args.global_embedding_norm:
+        embedding_mean = None
+        embedding_std = None
+        mean_norm = args.embedding_mean_norm
+        std_norm = args.embedding_std_norm
+        batch_count = 0
+
+    id2embedding = {}
+    # Run multiple times to make the embedding normalization more stable.
+    for i in range(2):
+        for dl in [enrol_loader, test_loader]:
+            logger.info(
+                f'Loop {i+1}: Computing embeddings on {dl.dataset.subset} dataset'
+            )
+            with paddle.no_grad():
+                for batch_idx, batch in enumerate(tqdm(dl)):
+                    ids, waveforms, lengths = batch['ids'], batch[
+                        'waveforms'], batch['lengths']
+                    feats = feature_extractor(waveforms)  # Feature extraction
+                    feats = feature_normalize(
+                        feats, lengths, mean_norm=True,
+                        std_norm=False)  # Feature normalization
+                    embeddings = model.backbone(feats, lengths).squeeze(
+                        -1)  # (N, emb_size, 1) -> (N, emb_size)
+
+                    # Global embedding normalization.
+                    if args.global_embedding_norm:
+                        batch_count += 1
+                        mean = embeddings.mean(axis=0) if mean_norm else 0
+                        std = embeddings.std(axis=0) if std_norm else 1
+                        # Update global mean and std.
+                        if embedding_mean is None and embedding_std is None:
+                            embedding_mean, embedding_std = mean, std
+                        else:
+                            weight = 1 / batch_count  # Weight decay by batches.
+                            embedding_mean = (
+                                1 - weight) * embedding_mean + weight * mean
+                            embedding_std = (
+                                1 - weight) * embedding_std + weight * std
+                        # Apply global embedding normalization.
+                        embeddings = (
+                            embeddings - embedding_mean) / embedding_std
+
+                    # Update embedding dict.
+                    id2embedding.update(dict(zip(ids, embeddings)))
+
+    # Compute cosine scores.
+    labels = []
+    enrol_ids = []
+    test_ids = []
+    with open(VoxCeleb1.veri_test_file, 'r') as f:
+        for line in f.readlines():
+            label, enrol_id, test_id = line.strip().split(' ')
+            labels.append(int(label))
+            enrol_ids.append(enrol_id.split('.')[0].replace('/', '-'))
+            test_ids.append(test_id.split('.')[0].replace('/', '-'))
+
+    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
+    enrol_embeddings, test_embeddings = map(
+        lambda ids: paddle.stack([id2embedding[id] for id in ids]),
+        [enrol_ids, test_ids])  # (N, emb_size)
+    scores = cos_sim_func(enrol_embeddings, test_embeddings)
+
+    if args.score_norm:
+        n_step = args.norm_size // args.batch_size + 1  # Approximate size
+        norm_data = iter(norm_loader)
+        id2embedding_norm = {}
+        logger.info(
+            f'Computing {args.norm_size} train embeddings for score norm.')
+        with paddle.no_grad():
+            for i in tqdm(range(n_step)):
+                batch = next(norm_data)
+                ids, waveforms, lengths = batch['ids'], batch[
+                    'waveforms'], batch['lengths']
+                feats = feature_extractor(waveforms)
+                feats = feature_normalize(
+                    feats, lengths, mean_norm=True, std_norm=False)
+                embeddings = model.backbone(feats, lengths).squeeze(-1)
+
+                id2embedding_norm.update(dict(zip(ids, embeddings)))
+
+        # Score normalization based on training samples.
+        norm_embeddings = paddle.stack(list(id2embedding_norm.values()), axis=0)
+        logger.info('Applying score norm...')
+        for idx in tqdm(range(len(scores))):
+            enrol_id, test_id = enrol_ids[idx], test_ids[idx]
+
+            enrol_embedding, test_embedding = id2embedding[
+                enrol_id], id2embedding[test_id]
+            enrol_embeddings, test_embeddings = \
+                map(lambda e: paddle.tile(e, [norm_embeddings.shape[0], 1]), [enrol_embedding, test_embedding])
+            scores_e_norm = cos_sim_func(enrol_embeddings,
+                                         norm_embeddings).topk(
+                                             args.norm_top_k, axis=0)[0]
+            scores_t_norm = cos_sim_func(test_embeddings, norm_embeddings).topk(
+                args.norm_top_k, axis=0)[0]
+
+            # Enrol norm
+            score_e = (
+                scores[idx] - paddle.mean(scores_e_norm, axis=0)) / paddle.std(
+                    scores_e_norm, axis=0)
+            # Test norm
+            score_t = (
+                scores[idx] - paddle.mean(scores_t_norm, axis=0)) / paddle.std(
+                    scores_t_norm, axis=0)
+
+            scores[idx] = (score_e + score_t) / 2
+
+    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
+    logger.info(
+        f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
+    )
diff --git a/PaddleAudio/examples/speaker_recognition/train.py b/PaddleAudio/examples/speaker_recognition/train.py
new file mode 100644
index 0000000000..c8ed40e6ef
--- /dev/null
+++ b/PaddleAudio/examples/speaker_recognition/train.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
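+
+# Training recipe: an ECAPA-TDNN backbone with an additive-angular-margin
+# (AAM-softmax) head is trained on VoxCeleb1 log-mel features. With --augment,
+# each batch is expanded six-fold: the clean waveforms plus five augmented
+# copies (wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise) built
+# from the OpenRIRNoise corpus.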
+
+import argparse
+import ast
+import os
+from typing import List
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from augment import EnvCorrupt, TimeDomainSpecAugment
+from loss import AdditiveAngularMargin, LogSoftmaxWrapper
+from model import SpeakerClassifier
+from paddleaudio.datasets import OpenRIRNoise, VoxCeleb1
+from paddleaudio.models.ecapa_tdnn import ECAPA_TDNN
+from paddleaudio.transforms import LogMelSpectrogram
+from paddleaudio.utils import Timer, get_logger
+
+logger = get_logger()
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="cpu", help="Select which device to train the model, defaults to cpu.")
+parser.add_argument("--epochs", type=int, default=50, help="Number of epochs for fine-tuning.")
+parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--batch_size", type=int, default=64, help="Number of examples per batch for training.")
+parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
+parser.add_argument("--augment", type=ast.literal_eval, default=True, help="Apply audio augmentations.")
+parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
+parser.add_argument("--load_checkpoint", type=str, default='', help="Directory to load a model checkpoint from to continue training.")
+parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epochs.")
+parser.add_argument("--log_freq", type=int, default=10, help="Log the training information every n steps.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def build_augment_pipeline() -> List[paddle.nn.Layer]:
+    noise_dataset = OpenRIRNoise('noise')
+    rir_dataset = OpenRIRNoise('rir')
+
+    wavedrop = TimeDomainSpecAugment(
+        sample_rate=16000,
+        speeds=[100],
+    )
+    speed_perturb = TimeDomainSpecAugment(
+        sample_rate=16000,
+        speeds=[95, 100, 105],
+    )
+    add_noise = EnvCorrupt(
+        noise_dataset=noise_dataset,
+        reverb_prob=0.0,
+        noise_prob=1.0,
+        noise_snr_low=0,
+        noise_snr_high=15,
+        rir_scale_factor=1.0,
+    )
+    add_rev = EnvCorrupt(
+        rir_dataset=rir_dataset,
+        reverb_prob=1.0,
+        noise_prob=0.0,
+        rir_scale_factor=1.0,
+    )
+    add_rev_noise = EnvCorrupt(
+        noise_dataset=noise_dataset,
+        rir_dataset=rir_dataset,
+        reverb_prob=1.0,
+        noise_prob=1.0,
+        noise_snr_low=0,
+        noise_snr_high=15,
+        rir_scale_factor=1.0,
+    )
+    return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise]
+
+
+def waveform_augment(waveforms: paddle.Tensor,
+                     augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor:
+    waveforms_aug_list = [waveforms]
+    for aug in augment_pipeline:
+        waveforms_aug = aug(waveforms)  # (N, L)
+        if waveforms_aug.shape[1] >= waveforms.shape[1]:
+            # Truncate to the original length.
+            waveforms_aug = waveforms_aug[:, :waveforms.shape[1]]
+        else:
+            # Pad to the original length.
+            lengths_to_pad = waveforms.shape[1] - waveforms_aug.shape[1]
+            waveforms_aug = F.pad(
+                waveforms_aug.unsqueeze(-1), [0, lengths_to_pad],
+                data_format='NLC').squeeze(-1)
+        waveforms_aug_list.append(waveforms_aug)
+
+    return paddle.concat(waveforms_aug_list, axis=0)
+
+
+def waveform_collate_fn(batch):
+    waveforms = np.stack([item['feat'] for item in batch])
+    labels = np.stack([item['label'] for item in batch])
+
+    return {'waveforms': waveforms, 'labels': labels}
+
+
+def feature_normalize(feats: paddle.Tensor,
+                      mean_norm: bool = True,
+                      std_norm: bool = True):
+    # Features normalization if
needed + mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0 + std = feats.std(axis=-1, keepdim=True) if std_norm else 1 + feats = (feats - mean) / std + + return feats + + +if __name__ == "__main__": + paddle.set_device(args.device) + + paddle.distributed.init_parallel_env() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + + if args.augment: + augment_pipeline = build_augment_pipeline() + else: + augment_pipeline = [] + + feature_extractor = LogMelSpectrogram( + sr=16000, n_fft=400, hop_length=160, n_mels=80, f_min=50) + + model_conf = { + "input_size": 80, + "channels": [1024, 1024, 1024, 1024, 3072], + "kernel_sizes": [5, 3, 3, 3, 1], + "dilations": [1, 2, 3, 4, 1], + "attention_channels": 128, + "lin_neurons": 192, + } + ecapa_tdnn = ECAPA_TDNN(**model_conf) + model = SpeakerClassifier( + backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) + optimizer = paddle.optimizer.AdamW( + learning_rate=args.learning_rate, parameters=model.parameters()) + criterion = LogSoftmaxWrapper( + loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) + + start_epoch = 0 + if args.load_checkpoint: + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + try: + # load model checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + + # load optimizer checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdopt')) + optimizer.set_state_dict(state_dict) + if local_rank == 0: + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + except FileExistsError: + if local_rank == 0: + logger.warning('Train from scratch.') + + try: + start_epoch = int(args.load_checkpoint[-1]) + logger.info(f'Restore training from epoch {start_epoch}.') + except ValueError: + pass + + train_ds = VoxCeleb1('train') + dev_ds = VoxCeleb1('dev') + + train_sampler = paddle.io.DistributedBatchSampler( + train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) + train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + num_workers=args.num_workers, + collate_fn=waveform_collate_fn, + return_list=True, + use_buffer_reader=True, + ) + + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * args.epochs) + timer.start() + + for epoch in range(start_epoch + 1, args.epochs + 1): + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + for batch_idx, batch in enumerate(train_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + + if len(augment_pipeline) > 0: # Waveforms augment + waveforms = waveform_augment(waveforms, augment_pipeline) + labels = paddle.concat( + [labels for i in range(len(augment_pipeline) + 1)]) + + feats = feature_extractor(waveforms) # Features extraction + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) # Features normalization + logits = model(feats) + + loss = criterion(logits, labels) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss += loss.numpy()[0] + + # Calculate metrics + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + timer.count() + + if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= args.log_freq + avg_acc = num_corrects / num_samples + + 
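+                # Report loss/accuracy averaged over the last `log_freq` steps.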
+                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
+                    epoch, args.epochs, batch_idx + 1, steps_per_epoch)
+                print_msg += ' loss={:.4f}'.format(avg_loss)
+                print_msg += ' acc={:.4f}'.format(avg_acc)
+                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
+                    lr, timer.timing, timer.eta)
+                logger.info(print_msg)
+
+                avg_loss = 0
+                num_corrects = 0
+                num_samples = 0
+
+            if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch:
+                if local_rank != 0:
+                    paddle.distributed.barrier(
+                    )  # Wait for the main process to finish validation.
+                    continue  # Resume training on the other processes.
+
+                dev_sampler = paddle.io.BatchSampler(
+                    dev_ds,
+                    batch_size=args.batch_size // 4,
+                    shuffle=False,
+                    drop_last=False)
+                dev_loader = paddle.io.DataLoader(
+                    dev_ds,
+                    batch_sampler=dev_sampler,
+                    collate_fn=waveform_collate_fn,
+                    num_workers=args.num_workers,
+                    return_list=True,
+                )
+
+                model.eval()
+                num_corrects = 0
+                num_samples = 0
+                logger.info('Evaluate on validation dataset')
+                with paddle.no_grad():
+                    for batch_idx, batch in enumerate(dev_loader):
+                        waveforms, labels = batch['waveforms'], batch['labels']
+                        feats = feature_extractor(waveforms)
+                        feats = feature_normalize(
+                            feats, mean_norm=True, std_norm=False)
+                        logits = model(feats)
+
+                        preds = paddle.argmax(logits, axis=1)
+                        num_corrects += (preds == labels).numpy().sum()
+                        num_samples += feats.shape[0]
+
+                print_msg = '[Evaluation result]'
+                print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
+
+                logger.info(print_msg)
+
+                # Save model
+                save_dir = os.path.join(args.checkpoint_dir,
+                                        'epoch_{}'.format(epoch))
+                logger.info('Saving model checkpoint to {}'.format(save_dir))
+                paddle.save(model.state_dict(),
+                            os.path.join(save_dir, 'model.pdparams'))
+                paddle.save(optimizer.state_dict(),
+                            os.path.join(save_dir, 'model.pdopt'))
+
+                if paddle.distributed.get_world_size() > 1:
+                    paddle.distributed.barrier()  # Release the waiting processes.
diff --git a/PaddleAudio/paddleaudio/datasets/__init__.py b/PaddleAudio/paddleaudio/datasets/__init__.py
index b7110a256e..9626bbca19 100644
--- a/PaddleAudio/paddleaudio/datasets/__init__.py
+++ b/PaddleAudio/paddleaudio/datasets/__init__.py
@@ -17,9 +17,11 @@
 from .esc50 import ESC50
 from .gtzan import GTZAN
 from .librispeech import LIBRISPEECH
+from .open_rir_noise import OpenRIRNoise
 from .ravdess import RAVDESS
 from .tess import TESS
 from .urban_sound import UrbanSound8K
+from .voxceleb1 import VoxCeleb1
 
 __all__ = [
     'AISHELL1',
@@ -31,4 +33,6 @@
     'UrbanAudioVisualScenes',
     'RAVDESS',
     'TESS',
+    'VoxCeleb1',
+    'OpenRIRNoise',
 ]
diff --git a/PaddleAudio/paddleaudio/datasets/aishell.py b/PaddleAudio/paddleaudio/datasets/aishell.py
index 2d47dd2cdc..80328dcb27 100644
--- a/PaddleAudio/paddleaudio/datasets/aishell.py
+++ b/PaddleAudio/paddleaudio/datasets/aishell.py
@@ -22,11 +22,11 @@
 from tqdm import tqdm
 
 from ..backends import load as load_audio
-from ..utils.download import decompress, download_and_decompress
-from ..utils.env import DATA_HOME
-from ..utils.log import logger
+from ..utils import DATA_HOME, decompress, download_and_decompress, get_logger
 from .dataset import feat_funcs
 
+logger = get_logger()
+
 __all__ = ['AISHELL1']
 
@@ -55,14 +55,17 @@ class AISHELL1(Dataset):
             'md5': '2f494334227864a8a8fec932999db9d8',
         },
     ]
-    text_meta = os.path.join('data_aishell', 'transcript', 'aishell_transcript_v0.8.txt')
-    utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text'))
+    text_meta = os.path.join('data_aishell', 'transcript',
+                             'aishell_transcript_v0.8.txt')
+    utt_info = collections.namedtuple('META_INFO',
+
('file_path', 'utt_id', 'text')) audio_path = os.path.join('data_aishell', 'wav') manifest_path = os.path.join('data_aishell', 'manifest') subset = ['train', 'dev', 'test'] def __init__(self, subset: str = 'train', feat_type: str = 'raw', **kwargs): - assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(self.subset, subset) + assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format( + self.subset, subset) self.subset = subset self.feat_type = feat_type self.feat_config = kwargs @@ -73,7 +76,8 @@ def _get_text_info(self) -> Dict[str, str]: ret = {} with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf: for line in rf.readlines()[1:]: - utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text + utt_id, text = map(str.strip, line.split(' ', + 1)) # utt_id, text ret.update({utt_id: ''.join(text.split())}) return ret @@ -82,7 +86,8 @@ def _get_data(self): not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)): download_and_decompress(self.archieves, DATA_HOME) # Extract *wav from *.tar.gz. - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)): + for root, _, files in os.walk( + os.path.join(DATA_HOME, self.audio_path)): for file in files: if file.endswith('.tar.gz'): decompress(os.path.join(root, file)) @@ -91,7 +96,8 @@ def _get_data(self): text_info = self._get_text_info() data = [] - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)): + for root, _, files in os.walk( + os.path.join(DATA_HOME, self.audio_path, self.subset)): for file in files: if file.endswith('.wav'): utt_id = os.path.splitext(file)[0] @@ -111,9 +117,12 @@ def _convert_to_record(self, idx: int): for field in type(sample)._fields: record[field] = getattr(sample, field) - waveform, sr = load_audio(sample[0]) # The first element of sample is file path + waveform, sr = load_audio( + sample[0]) # The first element of sample is file path feat_func = feat_funcs[self.feat_type] - feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform + feat = feat_func( + waveform, sample_rate=sr, ** + self.feat_config) if feat_func else waveform record.update({'feat': feat, 'duration': len(waveform) / sr}) return record @@ -121,18 +130,18 @@ def create_manifest(self, prefix='manifest'): if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}') + manifest_file = os.path.join(DATA_HOME, self.manifest_path, + f'{prefix}.{self.subset}') with codecs.open(manifest_file, 'w', 'utf-8') as f: for idx in tqdm(range(len(self))): record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'] - }, - ensure_ascii=False) + record_line = json.dumps({ + 'utt': record['utt_id'], + 'feat': record['file_path'], + 'feat_shape': (record['duration'], ), + 'text': record['text'] + }, + ensure_ascii=False) f.write(record_line + '\n') logger.info(f'Manifest file {manifest_file} created.') diff --git a/PaddleAudio/paddleaudio/datasets/dataset.py b/PaddleAudio/paddleaudio/datasets/dataset.py index 70f83afc0b..18661b51cb 100644 --- a/PaddleAudio/paddleaudio/datasets/dataset.py +++ b/PaddleAudio/paddleaudio/datasets/dataset.py @@ -21,7 +21,7 @@ from tqdm import tqdm from ..backends import load as load_audio -from ..features import 
melspectrogram, mfcc +from ..utils._librosa import melspectrogram, mfcc feat_funcs = { 'raw': None, @@ -34,6 +34,7 @@ class AudioClassificationDataset(paddle.io.Dataset): """ Base class of audio classification dataset. """ + def __init__(self, files: List[str], labels: List[int], @@ -75,8 +76,8 @@ def _convert_to_record(self, idx): def __getitem__(self, idx): record = self._convert_to_record(idx) - return np.array(record['feat']).transpose(), np.array(record['label'], - dtype=np.int64) + return np.array(record['feat']).transpose(), np.array( + record['label'], dtype=np.int64) def __len__(self): return len(self.files) diff --git a/PaddleAudio/paddleaudio/datasets/dcase.py b/PaddleAudio/paddleaudio/datasets/dcase.py index a07d2efc22..16d09b09aa 100644 --- a/PaddleAudio/paddleaudio/datasets/dcase.py +++ b/PaddleAudio/paddleaudio/datasets/dcase.py @@ -16,8 +16,7 @@ import os from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes'] @@ -111,13 +110,16 @@ class UrbanAcousticScenes(AudioClassificationDataset): 'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'] meta = os.path.join(base_name, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ('filename', 'scene_label', 'identifier', 'source_label')) + meta_info = collections.namedtuple( + 'META_INFO', ('filename', 'scene_label', 'identifier', 'source_label')) subset_meta = { 'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'), - 'dev': os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'), + 'dev': os.path.join(base_name, 'evaluation_setup', + 'fold1_evaluate.csv'), 'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'), } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename', 'scene_label')) + subset_meta_info = collections.namedtuple('SUBSET_META_INFO', + ('filename', 'scene_label')) audio_path = os.path.join(base_name, 'audio') def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs): @@ -129,9 +131,11 @@ def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs): It identifies the feature type that user wants to extrace of an audio file. 
""" files, labels = self._get_data(mode) - super(UrbanAcousticScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(UrbanAcousticScenes, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) - def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]: + def _get_meta_info(self, subset: str = None, skip_header: bool = True + ) -> List[collections.namedtuple]: if subset is None: meta_file = self.meta meta_info = self.meta_info @@ -225,13 +229,19 @@ class UrbanAudioVisualScenes(AudioClassificationDataset): meta_base_path = os.path.join(base_name, base_name + '.meta') meta = os.path.join(meta_base_path, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ('filename_audio', 'filename_video', 'scene_label', 'identifier')) + meta_info = collections.namedtuple( + 'META_INFO', + ('filename_audio', 'filename_video', 'scene_label', 'identifier')) subset_meta = { - 'train': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'), - 'dev': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'), - 'test': os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'), + 'train': + os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'), + 'dev': + os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'), + 'test': + os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'), } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename_audio', 'filename_video', 'scene_label')) + subset_meta_info = collections.namedtuple( + 'SUBSET_META_INFO', ('filename_audio', 'filename_video', 'scene_label')) audio_path = os.path.join(base_name, 'audio') def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs): @@ -243,9 +253,11 @@ def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs): It identifies the feature type that user wants to extrace of an audio file. 
""" files, labels = self._get_data(mode) - super(UrbanAudioVisualScenes, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(UrbanAudioVisualScenes, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) - def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]: + def _get_meta_info(self, subset: str = None, skip_header: bool = True + ) -> List[collections.namedtuple]: if subset is None: meta_file = self.meta meta_info = self.meta_info @@ -264,7 +276,8 @@ def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[c def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, os.path.join(DATA_HOME, self.base_name)) + download_and_decompress(self.archieves, + os.path.join(DATA_HOME, self.base_name)) meta_info = self._get_meta_info(subset=mode, skip_header=True) diff --git a/PaddleAudio/paddleaudio/datasets/esc50.py b/PaddleAudio/paddleaudio/datasets/esc50.py index 44af7160ab..70055ee180 100644 --- a/PaddleAudio/paddleaudio/datasets/esc50.py +++ b/PaddleAudio/paddleaudio/datasets/esc50.py @@ -16,8 +16,7 @@ import os from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['ESC50'] @@ -99,11 +98,16 @@ class ESC50(AudioClassificationDataset): 'Hand saw', ] meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv') - meta_info = collections.namedtuple('META_INFO', - ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) + meta_info = collections.namedtuple( + 'META_INFO', + ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) audio_path = os.path.join('ESC-50-master', 'audio') - def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs): + def __init__(self, + mode: str = 'train', + split: int = 1, + feat_type: str = 'raw', + **kwargs): """ Ags: mode (:obj:`str`, `optional`, defaults to `train`): @@ -114,7 +118,8 @@ def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', It identifies the feature type that user wants to extrace of an audio file. 
""" files, labels = self._get_data(mode, split) - super(ESC50, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(ESC50, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) def _get_meta_info(self) -> List[collections.namedtuple]: ret = [] diff --git a/PaddleAudio/paddleaudio/datasets/gtzan.py b/PaddleAudio/paddleaudio/datasets/gtzan.py index 41ad8b94b2..b38f9ca433 100644 --- a/PaddleAudio/paddleaudio/datasets/gtzan.py +++ b/PaddleAudio/paddleaudio/datasets/gtzan.py @@ -17,8 +17,7 @@ import random from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['GTZAN'] @@ -41,12 +40,21 @@ class GTZAN(AudioClassificationDataset): 'md5': '5b3d6dddb579ab49814ab86dba69e7c7', }, ] - label_list = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock'] + label_list = [ + 'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', + 'pop', 'reggae', 'rock' + ] meta = os.path.join('genres', 'input.mf') meta_info = collections.namedtuple('META_INFO', ('file_path', 'label')) audio_path = 'genres' - def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs): + def __init__(self, + mode='train', + seed=0, + n_folds=5, + split=1, + feat_type='raw', + **kwargs): """ Ags: mode (:obj:`str`, `optional`, defaults to `train`): @@ -62,7 +70,8 @@ def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', ** """ assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' files, labels = self._get_data(mode, seed, n_folds, split) - super(GTZAN, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(GTZAN, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) def _get_meta_info(self) -> List[collections.namedtuple]: ret = [] @@ -71,14 +80,17 @@ def _get_meta_info(self) -> List[collections.namedtuple]: ret.append(self.meta_info(*line.strip().split('\t'))) return ret - def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]: + def _get_data(self, mode, seed, n_folds, + split) -> Tuple[List[str], List[int]]: if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ not os.path.isfile(os.path.join(DATA_HOME, self.meta)): download_and_decompress(self.archieves, DATA_HOME) meta_info = self._get_meta_info() random.seed(seed) # shuffle samples to split data - random.shuffle(meta_info) # make sure using the same seed to create train and dev dataset + random.shuffle( + meta_info + ) # make sure using the same seed to create train and dev dataset files = [] labels = [] @@ -90,11 +102,13 @@ def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]: fold = idx // n_samples_per_fold + 1 if mode == 'train' and int(fold) != split: - files.append(os.path.join(DATA_HOME, self.audio_path, label, filename)) + files.append( + os.path.join(DATA_HOME, self.audio_path, label, filename)) labels.append(target) if mode != 'train' and int(fold) == split: - files.append(os.path.join(DATA_HOME, self.audio_path, label, filename)) + files.append( + os.path.join(DATA_HOME, self.audio_path, label, filename)) labels.append(target) return files, labels diff --git a/PaddleAudio/paddleaudio/datasets/librispeech.py b/PaddleAudio/paddleaudio/datasets/librispeech.py index 
8d4bee0ab5..f61a90462c 100644 --- a/PaddleAudio/paddleaudio/datasets/librispeech.py +++ b/PaddleAudio/paddleaudio/datasets/librispeech.py @@ -22,11 +22,11 @@ from tqdm import tqdm from ..backends import load as load_audio -from ..utils.download import decompress, download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger +from ..utils import DATA_HOME, decompress, download_and_decompress, get_logger from .dataset import feat_funcs +logger = get_logger() + __all__ = ['LIBRISPEECH'] @@ -75,14 +75,19 @@ class LIBRISPEECH(Dataset): }, ] speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT') - utt_info = collections.namedtuple('META_INFO', ('file_path', 'utt_id', 'text', 'spk_id', 'spk_gender')) + utt_info = collections.namedtuple( + 'META_INFO', ('file_path', 'utt_id', 'text', 'spk_id', 'spk_gender')) audio_path = 'LibriSpeech' manifest_path = os.path.join('LibriSpeech', 'manifest') subset = ['train-clean-100', 'train-clean-360', 'train-clean-500', \ 'dev-clean', 'dev-other', 'test-clean', 'test-other'] - def __init__(self, subset: str = 'train-clean-100', feat_type: str = 'raw', **kwargs): - assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(self.subset, subset) + def __init__(self, + subset: str = 'train-clean-100', + feat_type: str = 'raw', + **kwargs): + assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format( + self.subset, subset) self.subset = subset self.feat_type = feat_type self.feat_config = kwargs @@ -95,7 +100,8 @@ def _get_speaker_info(self) -> Dict[str, str]: for line in rf.readlines(): if ';' in line: # Skip dataset abstract continue - spk_id, gender = map(str.strip, line.split('|')[:2]) # spk_id, gender + spk_id, gender = map(str.strip, + line.split('|')[:2]) # spk_id, gender ret.update({spk_id: gender}) return ret @@ -103,27 +109,32 @@ def _get_text_info(self, trans_file) -> Dict[str, str]: ret = {} with open(trans_file, 'r') as rf: for line in rf.readlines(): - utt_id, text = map(str.strip, line.split(' ', 1)) # utt_id, text + utt_id, text = map(str.strip, line.split(' ', + 1)) # utt_id, text ret.update({utt_id: text}) return ret def _get_data(self): if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)): - download_and_decompress(self.archieves, DATA_HOME, len(self.archieves)) + download_and_decompress(self.archieves, DATA_HOME, + len(self.archieves)) # Speaker info speaker_info = self._get_speaker_info() # Text info text_info = {} - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)): + for root, _, files in os.walk( + os.path.join(DATA_HOME, self.audio_path, self.subset)): for file in files: if file.endswith('.trans.txt'): - text_info.update(self._get_text_info(os.path.join(root, file))) + text_info.update( + self._get_text_info(os.path.join(root, file))) data = [] - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path, self.subset)): + for root, _, files in os.walk( + os.path.join(DATA_HOME, self.audio_path, self.subset)): for file in files: if file.endswith('.flac'): utt_id = os.path.splitext(file)[0] @@ -134,7 +145,9 @@ def _get_data(self): file_path = os.path.join(root, file) text = text_info[utt_id] spk_gender = speaker_info[spk_id] - data.append(self.utt_info(file_path, utt_id, text, spk_id, spk_gender)) + data.append( + self.utt_info(file_path, utt_id, text, spk_id, + spk_gender)) return data @@ -146,9 +159,12 @@ def _convert_to_record(self, idx: 
int): for field in type(sample)._fields: record[field] = getattr(sample, field) - waveform, sr = load_audio(sample[0]) # The first element of sample is file path + waveform, sr = load_audio( + sample[0]) # The first element of sample is file path feat_func = feat_funcs[self.feat_type] - feat = feat_func(waveform, sample_rate=sr, **self.feat_config) if feat_func else waveform + feat = feat_func( + waveform, sample_rate=sr, ** + self.feat_config) if feat_func else waveform record.update({'feat': feat, 'duration': len(waveform) / sr}) return record @@ -156,20 +172,20 @@ def create_manifest(self, prefix='manifest'): if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - manifest_file = os.path.join(DATA_HOME, self.manifest_path, f'{prefix}.{self.subset}') + manifest_file = os.path.join(DATA_HOME, self.manifest_path, + f'{prefix}.{self.subset}') with codecs.open(manifest_file, 'w', 'utf-8') as f: for idx in tqdm(range(len(self))): record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'], - 'spk': record['spk_id'], - 'gender': record['spk_gender'], - }, - ensure_ascii=False) + record_line = json.dumps({ + 'utt': record['utt_id'], + 'feat': record['file_path'], + 'feat_shape': (record['duration'], ), + 'text': record['text'], + 'spk': record['spk_id'], + 'gender': record['spk_gender'], + }, + ensure_ascii=False) f.write(record_line + '\n') logger.info(f'Manifest file {manifest_file} created.') diff --git a/PaddleAudio/paddleaudio/datasets/open_rir_noise.py b/PaddleAudio/paddleaudio/datasets/open_rir_noise.py new file mode 100644 index 0000000000..ad11cc1e31 --- /dev/null +++ b/PaddleAudio/paddleaudio/datasets/open_rir_noise.py @@ -0,0 +1,192 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
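The create_manifest() hunks above write one JSON object per line, so downstream tools can rebuild the records with a plain line-by-line json.loads. A minimal reader sketch (the helper name read_manifest is illustrative, not part of the patch):

import json
from typing import Dict, List


def read_manifest(manifest_file: str) -> List[Dict]:
    # Each line holds one utterance record; keys mirror the json.dumps()
    # calls above: 'utt', 'feat', 'feat_shape', 'text' (LIBRISPEECH adds
    # 'spk' and 'gender').
    records = []
    with open(manifest_file, 'r', encoding='utf-8') as f:
        for line in f:
            records.append(json.loads(line))
    return records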
+ +import collections +import csv +import glob +import os +import random +from typing import Dict, List, Tuple + +from paddle.io import Dataset +from tqdm import tqdm + +from ..backends import load as load_audio +from ..backends import save_wav +from ..utils import DATA_HOME, decompress, download_and_decompress, get_logger +from .dataset import feat_funcs + +logger = get_logger() + +__all__ = ['OpenRIRNoise'] + + +class OpenRIRNoise(Dataset): + archieves = [ + { + 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', + 'md5': 'e6f48e257286e05de56413b4779d8ffb', + }, + ] + + sample_rate = 16000 + meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) + base_path = os.path.join(DATA_HOME, 'open_rir_noise') + wav_path = os.path.join(base_path, 'RIRS_NOISES') + csv_path = os.path.join(base_path, 'csv') + subsets = ['rir', 'noise'] + + def __init__(self, + subset: str = 'rir', + feat_type: str = 'raw', + random_chunk: bool = True, + chunk_duration: float = 3.0, + seed: int = 0, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self._data = self._get_data() + super(OpenRIRNoise, self).__init__() + + # Set up a seed to reproduce training or predicting result. + random.seed(seed) + + def _get_data(self): + # Download audio files. + if not os.path.isdir(self.base_path): + download_and_decompress( + self.archieves, self.base_path, decompress=True) + + # Data preparation. + if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav = line.strip().split(',') + data.append(self.meta_info(audio_id, float(duration), wav)) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. 
+ uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for idx, chunk in enumerate(uniq_chunks_list): + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + new_wav_file = os.path.join(self.base_path, + audio_id + f'_chunk_{idx+1:02}.wav') + save_wav(waveform[start_sample:end_sample], sr, new_wav_file) + # id, duration, new_wav + ret.append([chunk, self.chunk_duration, new_wav_file]) + else: # Keep whole audio. + ret.append([audio_id, audio_duration, wav_file]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool = True): + logger.info(f'Generating csv: {output_file}') + header = ["id", "duration", "wav"] + + infos = list( + tqdm( + map(self._get_audio_info, wav_files, + [split_chunks] * len(wav_files)), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", + "rir_list") + rir_files = [] + with open(rir_list, 'r') as f: + for line in f.readlines(): + rir_file = line.strip().split(' ')[-1] + rir_files.append(os.path.join(self.base_path, rir_file)) + + noise_list = os.path.join(self.wav_path, "pointsource_noises", + "noise_list") + noise_files = [] + with open(noise_list, 'r') as f: + for line in f.readlines(): + noise_file = line.strip().split(' ')[-1] + noise_files.append(os.path.join(self.base_path, noise_file)) + + self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) + self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/PaddleAudio/paddleaudio/datasets/ravdess.py b/PaddleAudio/paddleaudio/datasets/ravdess.py index dc3b2b66c0..53b46f6ca2 100644 --- a/PaddleAudio/paddleaudio/datasets/ravdess.py +++ b/PaddleAudio/paddleaudio/datasets/ravdess.py @@ -17,8 +17,7 @@ import random from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['RAVDESS'] @@ -82,10 +81,8 @@ def __init__(self, """ assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' files, labels = self._get_data(mode, seed, n_folds, split) - super(RAVDESS, self).__init__(files=files, - labels=labels, - feat_type=feat_type, - **kwargs) + super(RAVDESS, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) def _get_meta_info(self, files) -> List[collections.namedtuple]: ret = [] diff --git a/PaddleAudio/paddleaudio/datasets/tess.py b/PaddleAudio/paddleaudio/datasets/tess.py index 3b8915237e..7404aa7c9b 100644 --- a/PaddleAudio/paddleaudio/datasets/tess.py +++ b/PaddleAudio/paddleaudio/datasets/tess.py @@ -17,9 +17,7 @@ import random from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['TESS'] @@ 
-40,7 +38,8 @@ class TESS(AudioClassificationDataset): archieves = [ { - 'url': 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', + 'url': + 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', 'md5': '1465311b24d1de704c4c63e4ccc470c7', }, ] @@ -53,10 +52,17 @@ class TESS(AudioClassificationDataset): 'ps', # pleasant surprise 'sad', ] - meta_info = collections.namedtuple('META_INFO', ('speaker', 'word', 'emotion')) + meta_info = collections.namedtuple('META_INFO', + ('speaker', 'word', 'emotion')) audio_path = 'TESS_Toronto_emotional_speech_set' - def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs): + def __init__(self, + mode='train', + seed=0, + n_folds=5, + split=1, + feat_type='raw', + **kwargs): """ Ags: mode (:obj:`str`, `optional`, defaults to `train`): @@ -72,7 +78,8 @@ def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', ** """ assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' files, labels = self._get_data(mode, seed, n_folds, split) - super(TESS, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(TESS, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) def _get_meta_info(self, files) -> List[collections.namedtuple]: ret = [] @@ -81,7 +88,8 @@ def _get_meta_info(self, files) -> List[collections.namedtuple]: ret.append(self.meta_info(*basename_without_extend.split('_'))) return ret - def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]: + def _get_data(self, mode, seed, n_folds, + split) -> Tuple[List[str], List[int]]: if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)): download_and_decompress(self.archieves, DATA_HOME) @@ -92,7 +100,9 @@ def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]: wav_files.append(os.path.join(root, file)) random.seed(seed) # shuffle samples to split data - random.shuffle(wav_files) # make sure using the same seed to create train and dev dataset + random.shuffle( + wav_files + ) # make sure using the same seed to create train and dev dataset meta_info = self._get_meta_info(wav_files) files = [] diff --git a/PaddleAudio/paddleaudio/datasets/urban_sound.py b/PaddleAudio/paddleaudio/datasets/urban_sound.py index 67555fd71b..9d9d2dc142 100644 --- a/PaddleAudio/paddleaudio/datasets/urban_sound.py +++ b/PaddleAudio/paddleaudio/datasets/urban_sound.py @@ -16,8 +16,7 @@ import os from typing import List, Tuple -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME +from ..utils import DATA_HOME, download_and_decompress from .dataset import AudioClassificationDataset __all__ = ['UrbanSound8K'] @@ -37,20 +36,27 @@ class UrbanSound8K(AudioClassificationDataset): archieves = [ { - 'url': 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz', + 'url': + 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz', 'md5': '9aa69802bbf37fb986f71ec1483a196e', }, ] label_list = ["air_conditioner", "car_horn", "children_playing", "dog_bark", "drilling", \ "engine_idling", "gun_shot", "jackhammer", "siren", "street_music"] meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv') - meta_info = collections.namedtuple('META_INFO', - ('filename', 'fsid', 'start', 'end', 'salience', 'fold', 'class_id', 'label')) + meta_info = collections.namedtuple( + 'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold', + 
'class_id', 'label')) audio_path = os.path.join('UrbanSound8K', 'audio') - def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs): + def __init__(self, + mode: str = 'train', + split: int = 1, + feat_type: str = 'raw', + **kwargs): files, labels = self._get_data(mode, split) - super(UrbanSound8K, self).__init__(files=files, labels=labels, feat_type=feat_type, **kwargs) + super(UrbanSound8K, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) """ Ags: mode (:obj:`str`, `optional`, defaults to `train`): @@ -80,11 +86,15 @@ def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: for sample in meta_info: filename, _, _, _, _, fold, target, _ = sample if mode == 'train' and int(fold) != split: - files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename)) + files.append( + os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', + filename)) labels.append(int(target)) if mode != 'train' and int(fold) == split: - files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename)) + files.append( + os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', + filename)) labels.append(int(target)) return files, labels diff --git a/PaddleAudio/paddleaudio/datasets/voxceleb1.py b/PaddleAudio/paddleaudio/datasets/voxceleb1.py new file mode 100644 index 0000000000..79ba956ff2 --- /dev/null +++ b/PaddleAudio/paddleaudio/datasets/voxceleb1.py @@ -0,0 +1,295 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
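The k-fold datasets above (ESC50, GTZAN, TESS, UrbanSound8K) share one split rule: the fold equal to `split` is held out for evaluation and every other fold goes to training. A small sketch of that rule (the function name is illustrative):

def fold_in_subset(mode: str, fold: int, split: int) -> bool:
    # Train on every fold except the held-out one; evaluate only on it.
    if mode == 'train':
        return fold != split
    return fold == split


# With split=1: fold 1 feeds dev/test, folds 2..n feed train.
assert fold_in_subset('train', 2, 1) and not fold_in_subset('train', 1, 1)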
+ +import collections +import csv +import glob +import os +import random +from typing import Dict, List, Tuple + +from paddle.io import Dataset +from tqdm import tqdm + +from ..backends import load as load_audio +from ..utils import DATA_HOME, decompress, download_and_decompress, get_logger +from .dataset import feat_funcs + +logger = get_logger() + +__all__ = ['VoxCeleb1'] + + +class VoxCeleb1(Dataset): + source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' + archieves_audio_dev = [ + { + 'url': source_url + 'vox1_dev_wav_partaa', + 'md5': 'e395d020928bc15670b570a21695ed96', + }, + { + 'url': source_url + 'vox1_dev_wav_partab', + 'md5': 'bbfaaccefab65d82b21903e81a8a8020', + }, + { + 'url': source_url + 'vox1_dev_wav_partac', + 'md5': '017d579a2a96a077f40042ec33e51512', + }, + { + 'url': source_url + 'vox1_dev_wav_partad', + 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', + }, + ] + archieves_audio_test = [ + { + 'url': source_url + 'vox1_test_wav.zip', + 'md5': '185fdc63c3c739954633d50379a3d102', + }, + ] + archieves_meta = [ + { + 'url': + 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': 'b73110731c9223c1461fe49cb48dddfc', + }, + ] + + num_speakers = 1211 # For speaker verification task + sample_rate = 16000 + meta_info = collections.namedtuple( + 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + base_path = os.path.join(DATA_HOME, 'vox1') + wav_path = os.path.join(base_path, 'wav') + meta_path = os.path.join(base_path, 'meta') + veri_test_file = os.path.join(meta_path, 'veri_test2.txt') + csv_path = os.path.join(base_path, 'csv') + subsets = ['train', 'dev', 'enrol', 'test'] + + def __init__(self, + subset: str = 'train', + feat_type: str = 'raw', + random_chunk: bool = True, + chunk_duration: float = 3.0, + split_ratio: float = 0.9, + seed: int = 0, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self._data = self._get_data() + super(VoxCeleb1, self).__init__() + + # Set up a seed to reproduce training or predicting result. + random.seed(seed) + + def _get_data(self): + # Download audio files. + if not os.path.isdir(self.base_path): + download_and_decompress( + self.archieves_audio_dev, self.base_path, decompress=False) + download_and_decompress( + self.archieves_audio_test, self.base_path, decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + logger.info(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. 
+ if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav, start, stop, spk_id = line.strip( + ).split(',') + data.append( + self.meta_info(audio_id, float(duration), wav, int(start), + int(stop), spk_id)) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: + for line in f.readlines(): + spk_id, label = line.strip().split(' ') + self.spk_id2label[spk_id] = int(label) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + if self.random_chunk: + num_wav_samples = waveform.shape[0] + num_chunk_samples = int(self.chunk_duration * sr) + start = random.randint(0, num_wav_samples - num_chunk_samples - 1) + stop = start + num_chunk_samples + else: + start = record['start'] + stop = record['stop'] + + waveform = waveform[start:stop] + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + if self.subset in ['train', + 'dev']: # Labels are available in train and dev. + record.update({'label': self.spk_id2label[record['spk_id']]}) + + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + spk_id, sess_id, utt_id = wav_file.split("/")[-3:] + audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for chunk in uniq_chunks_list: + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + spk_id + ]) + else: # Keep whole audio. + ret.append([ + audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id + ]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool = True): + logger.info(f'Generating csv: {output_file}') + header = ["id", "duration", "wav", "start", "stop", "spk_id"] + + infos = list( + tqdm( + map(self._get_audio_info, wav_files, + [split_chunks] * len(wav_files)), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + # Audio of speakers in veri_test_file should not be included in training set. 
+ enrol_files = set() + test_files = set() + with open(self.veri_test_file, 'r') as f: + for line in f.readlines(): + _, enrol_file, test_file = line.strip().split(' ') + enrol_files.add(os.path.join(self.wav_path, enrol_file)) + test_files.add(os.path.join(self.wav_path, test_file)) + enrol_files = sorted(enrol_files) + test_files = sorted(test_files) + + test_spks = set() + for file in (enrol_files + test_files): + spk = file.split('/wav/')[1].split('/')[0] + test_spks.add(spk) + + audio_files = [] + speakers = set() + for file in glob.glob( + os.path.join(self.wav_path, "**", "*.wav"), recursive=True): + spk = file.split('/wav/')[1].split('/')[0] + if spk in test_spks: + continue + speakers.add(spk) + audio_files.append(file) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: + for label, spk_id in enumerate(sorted(speakers)): # 1211 speakers + f.write(f'{spk_id} {label}\n') + + audio_files = sorted(audio_files) + random.shuffle(audio_files) + split_idx = int(self.split_ratio * len(audio_files)) + train_files, dev_files = audio_files[: + split_idx], audio_files[split_idx:] + + self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + self.generate_csv( + enrol_files, + os.path.join(self.csv_path, 'enrol.csv'), + split_chunks=False) + self.generate_csv( + test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/PaddleAudio/paddleaudio/models/ecapa_tdnn.py b/PaddleAudio/paddleaudio/models/ecapa_tdnn.py new file mode 100644 index 0000000000..bf2b1cbe8e --- /dev/null +++ b/PaddleAudio/paddleaudio/models/ecapa_tdnn.py @@ -0,0 +1,414 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
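To exercise the VoxCeleb1 class above: the first instantiation downloads the archives, concatenates and extracts the dev parts, and generates the csv metadata under DATA_HOME/vox1; later runs reuse the cached files. A minimal sketch of reading one training record (batching and feature extraction happen elsewhere, as in the training script):

from paddleaudio.datasets import VoxCeleb1

# Defaults follow the signature above: raw waveforms, random 3-second chunks.
train_ds = VoxCeleb1(subset='train', feat_type='raw', chunk_duration=3.0)
record = train_ds[0]
waveform = record['feat']  # raw waveform chunk, because feat_type='raw'
label = record['label']    # integer speaker label ('train'/'dev' subsets only)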
+ +import math +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def length_to_mask(length, max_len=None, dtype=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().astype( + 'int').item() # using arange to generate mask + mask = paddle.arange( + max_len, dtype=length.dtype).expand( + (len(length), max_len)) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + mask = paddle.to_tensor(mask, dtype=dtype) + return mask + + +class Conv1d(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding="same", + dilation=1, + groups=1, + bias=True, + padding_mode="reflect", + ): + super(Conv1d, self).__init__() + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1D( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + padding=0, + dilation=self.dilation, + groups=groups, + bias_attr=bias, + ) + + def forward(self, x): + if self.padding == "same": + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + else: + raise ValueError(f"Padding must be 'same'. Got {self.padding}") + + return self.conv(x) + + def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + L_in = x.shape[-1] # Detecting input shape + padding = self._get_padding_elem(L_in, stride, kernel_size, + dilation) # Time padding + x = F.pad( + x, padding, mode=self.padding_mode, + data_format="NCL") # Applying padding + return x + + def _get_padding_elem(self, L_in: int, stride: int, kernel_size: int, + dilation: int): + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + + return padding + + +class BatchNorm1d(nn.Layer): + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.9, + weight_attr=None, + bias_attr=None, + data_format='NCL', + use_global_stats=None, + ): + super(BatchNorm1d, self).__init__() + + self.norm = nn.BatchNorm1D( + input_size, + epsilon=eps, + momentum=momentum, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + use_global_stats=use_global_stats, + ) + + def forward(self, x): + x_n = self.norm(x) + return x_n + + +class TDNNBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, + ): + super(TDNNBlock, self).__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(nn.Layer): + def __init__(self, in_channels, out_channels, scale=8, dilation=1): + super(Res2NetBlock, self).__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.LayerList([ + TDNNBlock( + in_channel, hidden_channel, kernel_size=3, dilation=dilation) + for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(paddle.chunk(x, self.scale, 
axis=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = paddle.concat(y, axis=1) + return y + + +class SEBlock(nn.Layer): + def __init__(self, in_channels, se_channels, out_channels): + super(SEBlock, self).__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = paddle.nn.ReLU() + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = paddle.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + total = mask.sum(axis=2, keepdim=True) + s = (x * mask).sum(axis=2, keepdim=True) / total + else: + s = x.mean(axis=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Layer): + def __init__(self, channels, attention_channels=128, global_context=True): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + C, L = x.shape[1], x.shape[2] # KP: (N, C, L) + + def _compute_statistics(x, m, axis=2, eps=self.eps): + mean = (m * x).sum(axis) + std = paddle.sqrt( + (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) + return mean, std + + if lengths is None: + lengths = paddle.ones([x.shape[0]]) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. 
+ if self.global_context: + total = mask.sum(axis=2, keepdim=True).astype('float32') + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).tile((1, 1, L)) + std = std.unsqueeze(2).tile((1, 1, L)) + attn = paddle.concat([x, mean, std], axis=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = paddle.where( + mask.tile((1, C, 1)) == 0, + paddle.ones_like(attn) * float("-inf"), attn) + + attn = F.softmax(attn, axis=2) + mean, std = _compute_statistics(x, attn) + + # Append mean and std of the batch + pooled_stats = paddle.concat((mean, std), axis=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=nn.ReLU, + ): + super(SERes2NetBlock, self).__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class ECAPA_TDNN(nn.Layer): + def __init__( + self, + input_size, + lin_neurons=192, + activation=nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, + ): + + super(ECAPA_TDNN, self).__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.LayerList() + self.emb_size = lin_neurons + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, + )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, + )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, + ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, + ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=self.emb_size, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = paddle.concat(xl[1:], axis=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, 
lengths=lengths) + x = self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + return x diff --git a/PaddleAudio/paddleaudio/utils/utils.py b/PaddleAudio/paddleaudio/utils/utils.py index 56c745204a..f2b1983a8d 100644 --- a/PaddleAudio/paddleaudio/utils/utils.py +++ b/PaddleAudio/paddleaudio/utils/utils.py @@ -17,9 +17,14 @@ 'get_logger', 'Timer', 'seconds_to_hms', + 'decompress', 'download_and_decompress', 'load_state_dict_from_url', 'default_logger', + 'USER_HOME', + 'PPAUDIO_HOME', + 'MODEL_HOME', + 'DATA_HOME', ] import logging @@ -83,6 +88,7 @@ def list_handlers(logger): class Timer(object): '''Calculate runing speed and estimated time of arrival(ETA)''' + def __init__(self, total_step: int): self.total_step = total_step self.last_start_step = 0 @@ -132,7 +138,27 @@ def seconds_to_hms(seconds: int) -> str: return hms_str -def download_and_decompress(archives: List[Dict[str, str]], path: str): +def decompress(file: str, path: os.PathLike = None): + """ + Extracts all files from a compressed file to a specific path. + """ + assert os.path.isfile(file), "File: {} does not exist.".format(file) + + if path is None: + download._decompress(file) + else: + if not os.path.isdir(path): + os.makedirs(path) + + tmp_file = os.path.join(path, os.path.basename(file)) + os.rename(file, tmp_file) + download._decompress(tmp_file) + os.rename(tmp_file, file) + + +def download_and_decompress(archives: List[Dict[str, str]], + path: os.PathLike, + decompress: bool = True): """ Download archieves and decompress to specific path. """ @@ -143,7 +169,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str): assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - download.get_path_from_url(archive['url'], path, archive['md5']) + download.get_path_from_url( + archive['url'], path, archive['md5'], decompress=decompress) def load_state_dict_from_url(url: str, path: str, md5: str = None): @@ -159,3 +186,41 @@ def load_state_dict_from_url(url: str, path: str, md5: str = None): default_logger = get_logger(__file__) download.logger = default_logger + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_ppaudio_home(): + if 'PPAUDIO_HOME' in os.environ: + home_path = os.environ['PPAUDIO_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + 'The environment variable PPAUDIO_HOME {} is not a directory.' + .format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddleaudio') + + +def _get_sub_home(directory): + home = os.path.join(_get_ppaudio_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +''' +PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the +├ default value through the PPAUDIO_HOME environment variable. +├─ MODEL_HOME --> Store model files. +└─ DATA_HOME --> Store automatically downloaded datasets. +''' +USER_HOME = _get_user_home() +PPAUDIO_HOME = _get_ppaudio_home() +MODEL_HOME = _get_sub_home('models') +DATA_HOME = _get_sub_home('datasets')
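A short sketch of how the cache roots above resolve: PPAUDIO_HOME overrides the default ~/.paddleaudio, and MODEL_HOME/DATA_HOME are created beneath whichever root wins. Because the module-level constants are computed at import time, the variable must be set before paddleaudio.utils is imported (the path below is illustrative):

import os

os.environ['PPAUDIO_HOME'] = '/tmp/ppaudio_cache'  # hypothetical cache root

from paddleaudio.utils import DATA_HOME, MODEL_HOME, PPAUDIO_HOME

print(PPAUDIO_HOME)  # /tmp/ppaudio_cache
print(MODEL_HOME)    # /tmp/ppaudio_cache/models
print(DATA_HOME)     # /tmp/ppaudio_cache/datasets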