newest code
Lam Nguyen Tùng Lam committed Mar 18, 2023
1 parent abd567d commit e57871c
Showing 4 changed files with 45 additions and 72 deletions.
70 changes: 17 additions & 53 deletions skeleton/data/datapipe.py
@@ -85,28 +85,22 @@ def decode_wav(value: StreamWrapper) -> t.Tensor:
assert isinstance(value, StreamWrapper)

value, sample_rate = torchaudio.load(value)
choice = randomize_effect()
if choice == 'inject_noise':
value = inject_noise(value, 0.01)
elif choice == 'rd_speed_change':
value = random_speed_change(value, sample_rate)

# choice = randomize_effect()
# if choice == 'inject_noise':
# value = inject_noise(value, 0.01)
# # value = torch.cat((original_value, value))
# # value = value.resize_(original_value.shape)

# elif choice == 'rd_speed_change':
# value = random_speed_change(value, sample_rate)
# value = torch.cat((original_value, value))
# value = value.resize_(original_value.shape)
# elif choice == 'rand_gain':
# value= random_gain_aug(value, minimum=0.1, maximum=0.12)
# elif choice == 'reverb':
# value= reverb_aug(value,sample_rate)

assert sample_rate == 16_000

# make sure that audio has 1 dimension
value = torch.squeeze(value)

return value

def decode_wav_original(value: StreamWrapper) -> t.Tensor:
assert isinstance(value, StreamWrapper)

value, sample_rate = torchaudio.load(value)

assert sample_rate == 16_000

# make sure that audio has 1 dimension
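Editor's note: the augmentation helpers called in decode_wav above (randomize_effect, inject_noise, random_speed_change, and the commented-out random_gain_aug / reverb_aug) are defined elsewhere in datapipe.py and are not part of this diff. A minimal sketch of what the two active ones plausibly look like, assuming Gaussian noise injection and a sox-based speed perturbation; the effect choices and speed factors below are illustrative only:

    import random
    import torch
    import torchaudio

    def randomize_effect() -> str:
        # pick one augmentation (or none) uniformly at random
        return random.choice(["inject_noise", "rd_speed_change", "none"])

    def inject_noise(waveform: torch.Tensor, noise_factor: float) -> torch.Tensor:
        # add zero-mean Gaussian noise scaled by noise_factor
        return waveform + noise_factor * torch.randn_like(waveform)

    def random_speed_change(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # sox-based speed perturbation, resampled back to the original rate
        factor = random.choice(["0.9", "1.0", "1.1"])
        waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
            waveform, sample_rate, [["speed", factor], ["rate", str(sample_rate)]]
        )
        return waveform
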
@@ -136,19 +130,6 @@ def decode(element: Tuple[str, StreamWrapper]):

return key, value

def decode_original(element: Tuple[str, StreamWrapper]):
assert isinstance(element, tuple) and len(element) == 2
key, value = element
assert isinstance(key, str)
assert isinstance(value, StreamWrapper)

if key.endswith(".wav"):
value = decode_wav_original(value)

if key.endswith(".json"):
value = decode_json(value)

return key, value

########################################################################################
# default pipeline loading data from tar files into a tuple (sample_id, x, y)
@@ -157,7 +138,6 @@ def decode_original(element: Tuple[str, StreamWrapper]):


def construct_sample_datapipe(
is_augmented: bool,
shard_folder: pathlib.Path,
num_workers: int,
buffer_size: int = 0,
@@ -193,38 +173,21 @@ def construct_sample_datapipe(
dp = TarArchiveLoader(dp, mode="r")

# decode each file in the tar to the expected python dataformat
if is_augmented:
dp = Mapper(dp, decode)
else:
dp = Mapper(dp, decode_original)
dp = Mapper(dp, decode)

# each file in the tar is expected to have the format `{key}.{ext}`
# this groups all files with the same key into one dictionary
dp = WebDataset(dp)

# transform the dictionaries into tuple (sample_id, x, y)
if is_augmented:
dp = Mapper(dp, map_dict_to_tuple)
else:
dp = Mapper(dp, map_dict_to_tuple_original)
dp = Mapper(dp, map_dict_to_tuple)


# buffer tuples to increase variability
if buffer_size > 0:
dp = Shuffler(dp, buffer_size=buffer_size)
return dp
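
Editor's note: the WebDataset step groups per-extension entries that share one key (e.g. 0001.wav and 0001.json) into a single dictionary, which map_dict_to_tuple then flattens into a Sample tuple. A hedged illustration with made-up data, assuming map_dict_to_tuple mirrors the removed _original variant's handling of class_idx:

    import torch as t

    element = {
        ".wav": t.zeros(48_000),  # 3 seconds of 16 kHz audio (dummy values)
        ".json": {"sample_id": "id00012_xyz", "class_idx": 7},
    }
    sample_id, wav, gt = map_dict_to_tuple(element)
    print(sample_id, wav.shape, gt)  # id00012_xyz torch.Size([48000]) tensor(7)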

def map_dict_to_tuple_original(x: Dict) -> Sample:
sample_id = x[".json"]["sample_id"] + "_org"
wav = x[".wav"]

class_idx = x[".json"]["class_idx"]
if class_idx is None:
gt = None
else:
gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
return Sample(sample_id, wav, gt)

def map_dict_to_tuple(x: Dict) -> Sample:
sample_id = x[".json"]["sample_id"]
wav = x[".wav"]
@@ -305,7 +268,9 @@ def pipe_batch_samples(


def _print_sample(dp):
cnt = 0
for sample in dp:
cnt += 1
sample_id, x, y = sample
print(f"{sample_id=}\n")

@@ -315,6 +280,7 @@ def _print_sample(dp):
print(y)
print(f"{y.shape=}")
print(f"{y.dtype=}\n")
break

def _debug():
shard_path = pathlib.Path(
@@ -324,9 +290,7 @@ def _debug():
n_mfcc = 40

print("### construct_sample_datapipe ###")
dp = construct_sample_datapipe(True, shard_path, num_workers=0)
dp_org = construct_sample_datapipe(False, shard_path, num_workers=0)
dp = dp_org.concat(dp)
dp = construct_sample_datapipe(shard_path, num_workers=0)
_print_sample(dp)

print("### pipe_chunk_sample ###")
21 changes: 10 additions & 11 deletions skeleton/data/tiny_voxceleb.py
@@ -56,16 +56,16 @@ def __init__(

def setup(self, stage: Optional[str] = None) -> None:
# train dataloader (non-augmented)
train_dp_original = construct_sample_datapipe(not self.is_augmented,
self.shard_folder / "train", num_workers=self.num_workers_train
)
train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
self.train_dp_original = train_dp_original
# train_dp_original = construct_sample_datapipe(not self.is_augmented,
# self.shard_folder / "train", num_workers=self.num_workers_train
# )
# train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
# train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
# train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
# self.train_dp_original = train_dp_original

# train dataloader (augmented)
train_dp = construct_sample_datapipe(self.is_augmented,
train_dp = construct_sample_datapipe(
self.shard_folder / "train", num_workers=self.num_workers_train
)
# train_dp = train_dp_original.concat(train_dp)
@@ -77,7 +77,7 @@ def setup(self, stage: Optional[str] = None) -> None:
# self.train_dp = self.train_dp_original.concat(self.train_dp)

# val dataloader
val_dp = construct_sample_datapipe(not self.is_augmented,
val_dp = construct_sample_datapipe(
self.shard_folder / "val", num_workers=self.num_workers_eval
)
val_dp = pipe_chunk_sample(val_dp, self.chunk_length_num_frames)
@@ -87,15 +87,14 @@ def setup(self, stage: Optional[str] = None) -> None:

# dev dataloader
# we explicitly evaluate with a batch size of 1 and the whole utterance
dev_dp = construct_sample_datapipe(not self.is_augmented,
dev_dp = construct_sample_datapipe(
self.shard_folder / "dev", num_workers=self.num_workers_eval
)
dev_dp = pipe_mfcc(dev_dp, self.n_mfcc)
dev_dp = pipe_batch_samples(dev_dp, batch_size=1, drop_last=False)
self.dev_dp = dev_dp

def train_dataloader(self) -> TRAIN_DATALOADERS:
self.train_dp = torch.utils.data.ChainDataset([self.train_dp_original, self.train_dp])
return torch.utils.data.DataLoader(
self.train_dp, batch_size=None, num_workers=self.num_workers_train
)
2 changes: 0 additions & 2 deletions skeleton/layers/resnet.py
@@ -17,8 +17,6 @@ def __init__(self, triples):
modules.append(nn.Sequential(
nn.AdaptiveAvgPool1d(3),
nn.Flatten(),
nn.LazyLinear(256), nn.ReLU(),
nn.Dropout(p=0.5),
nn.LazyLinear(128)
))
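
Editor's note: with the intermediate LazyLinear(256)/ReLU/Dropout removed, the head now pools, flattens, and projects straight to 128 dimensions. A quick shape check of the simplified head in isolation; the channel and frame counts below are made up, since the real ones come from the preceding ResNet blocks:

    import torch
    import torch.nn as nn

    head = nn.Sequential(
        nn.AdaptiveAvgPool1d(3),  # (B, C, T) -> (B, C, 3)
        nn.Flatten(),             # -> (B, C * 3)
        nn.LazyLinear(128),       # in_features inferred on the first forward pass
    )
    print(head(torch.randn(8, 2048, 60)).shape)  # torch.Size([8, 128])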

24 changes: 18 additions & 6 deletions skeleton/models/prototype.py
@@ -14,6 +14,7 @@
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import torch.optim

from pytorch_lightning import LightningModule
from torchmetrics import Accuracy
@@ -24,6 +25,7 @@
evaluate_speaker_trials,
)
from skeleton.layers.resnet import ResNet
from skeleton.layers.LSTM import TorchLSTMNet

from skeleton.layers.statistical_pooling import MeanStatPool1D

@@ -49,6 +51,7 @@ def __init__(
self.num_embedding = num_embedding
self.num_speakers = num_speakers
self.learning_rate = learning_rate
self.original_lr = learning_rate

# evaluation data
self.val_trials = val_trials
@@ -67,6 +70,8 @@ def __init__(
nn.ReLU(),
)

self.lstm = TorchLSTMNet(1, num_embedding)

self.resnet = ResNet(((num_embedding, 2, num_embedding*2),(num_embedding*2, 2, num_embedding*4), (num_embedding*4, 2, num_embedding*8), (num_embedding*8, 2, num_embedding*16)))

# Pooling layer
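
Editor's note: TorchLSTMNet comes from skeleton/layers/LSTM.py, which is not included in this diff, and self.lstm is instantiated above without yet being used in compute_embedding. A hypothetical sketch of such a wrapper, assuming it maps a 1-channel sequence to a num_embedding-sized vector via the final hidden state:

    import torch
    import torch.nn as nn

    class TorchLSTMNet(nn.Module):
        # hypothetical: (batch, channels, time) -> (batch, hidden_size)
        def __init__(self, input_size: int, hidden_size: int):
            super().__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            x = x.transpose(1, 2)       # -> (batch, time, channels)
            _, (h_n, _) = self.lstm(x)  # h_n: (num_layers, batch, hidden_size)
            return h_n[-1]              # final layer's hidden state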
@@ -109,28 +114,34 @@ def forward(self, spectrogram: t.Tensor) -> Tuple[t.Tensor, t.Tensor]:
def compute_embedding(self, spectrogram: t.Tensor) -> t.Tensor:
# modify to your liking!
feature_representation = self.embedding_layer(spectrogram) # -> [128,128,239]

resnet_output = self.resnet(feature_representation)


resnet_output = resnet_output[:, :, None] # -> ([128, 128, 1])

embedding = self.pooling_layer(resnet_output) # -> [128, 128]

return embedding

def compute_prediction(self, embedding: t.Tensor) -> t.Tensor:
# modify to your liking!
# embedding = embedding[None, :, :]
prediction = self.prediction_layer(embedding)

# print(prediction.shape)
return prediction


# @property
# def automatic_optimization(self) -> bool:
# return False

def training_step(
self, batch: Tuple[List[str], t.Tensor, t.Tensor], *args, **kwargs
) -> t.Tensor:
# first unwrap the batch into the input tensor and ground truth labels
# opt = self.optimizers()
sample_id, network_input, speaker_labels = batch

# opt = self.optimizers()
# opt.zero_grad()
assert network_input.shape[0] == speaker_labels.shape[0]
assert network_input.shape[1] == self.num_inp_features
assert len(network_input.shape) == 3
@@ -140,7 +151,8 @@ def training_step(

# based on the output of the forward pass we compute the loss
loss = self.loss_fn(prediction, speaker_labels)

# self.manual_backward(loss)
# opt.step()
# based on the output of the forward pass we compute some metrics
self.train_acc(prediction, speaker_labels)

@@ -244,7 +256,7 @@ def configure_optimizers(self):
# Adapt schedule to your liking :).
schedule = {
# Required: the scheduler instance.
"scheduler": t.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1.0),
"scheduler": t.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8),
# The unit of the scheduler's step size, could also be 'step'.
# 'epoch' updates the scheduler on epoch end whereas 'step'
# updates it after an optimizer update.
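
Editor's note: the scheduler change is the functional part of this hunk. With step_size=10 and gamma=0.8 the learning rate now drops by 20% every 10 epochs instead of staying constant (the old gamma=1.0). A quick check of the resulting schedule; the base learning rate of 3e-4 is only an example:

    import torch

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.Adam(params, lr=3e-4)  # example base LR
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)
    for epoch in range(30):
        if epoch % 10 == 0:
            print(epoch, scheduler.get_last_lr())  # ~3e-4, then 2.4e-4, then 1.92e-4
        optimizer.step()
        scheduler.step()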
