newest code
Lam Nguyen Tùng Lam committed Mar 18, 2023
1 parent abd567d commit e57871c
Showing 4 changed files with 45 additions and 72 deletions.
70 changes: 17 additions & 53 deletions skeleton/data/datapipe.py
@@ -85,28 +85,22 @@ def decode_wav(value: StreamWrapper) -> t.Tensor:
assert isinstance(value, StreamWrapper)

value, sample_rate = torchaudio.load(value)
choice = randomize_effect()
if choice == 'inject_noise':
value = inject_noise(value, 0.01)
elif choice == 'rd_speed_change':
value = random_speed_change(value, sample_rate)

# choice = randomize_effect()
# if choice == 'inject_noise':
# value = inject_noise(value, 0.01)
# # value = torch.cat((original_value, value))
# # value = value.resize_(original_value.shape)

# elif choice == 'rd_speed_change':
# value = random_speed_change(value, sample_rate)
# value = torch.cat((original_value, value))
# value = value.resize_(original_value.shape)
# elif choice == 'rand_gain':
# value= random_gain_aug(value, minimum=0.1, maximum=0.12)
# elif choice == 'reverb':
# value= reverb_aug(value,sample_rate)

assert sample_rate == 16_000

# make sure that audio has 1 dimension
value = torch.squeeze(value)

return value

def decode_wav_original(value: StreamWrapper) -> t.Tensor:
assert isinstance(value, StreamWrapper)

value, sample_rate = torchaudio.load(value)

assert sample_rate == 16_000

# make sure that audio has 1 dimension
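Editor's note: the augmentation helpers called in decode_wav above (randomize_effect, inject_noise, random_speed_change, and the commented-out random_gain_aug / reverb_aug) are defined elsewhere in datapipe.py and are not part of this diff. A minimal sketch of what the two active ones plausibly look like, assuming Gaussian noise injection and a sox-based speed perturbation; the effect choices and speed factors below are illustrative only:

    import random
    import torch
    import torchaudio

    def randomize_effect() -> str:
        # pick one augmentation (or none) uniformly at random
        return random.choice(["inject_noise", "rd_speed_change", "none"])

    def inject_noise(waveform: torch.Tensor, noise_factor: float) -> torch.Tensor:
        # add zero-mean Gaussian noise scaled by noise_factor
        return waveform + noise_factor * torch.randn_like(waveform)

    def random_speed_change(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # sox-based speed perturbation, resampled back to the original rate
        factor = random.choice(["0.9", "1.0", "1.1"])
        waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
            waveform, sample_rate, [["speed", factor], ["rate", str(sample_rate)]]
        )
        return waveform
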
@@ -136,19 +130,6 @@ def decode(element: Tuple[str, StreamWrapper]):

return key, value

def decode_original(element: Tuple[str, StreamWrapper]):
assert isinstance(element, tuple) and len(element) == 2
key, value = element
assert isinstance(key, str)
assert isinstance(value, StreamWrapper)

if key.endswith(".wav"):
value = decode_wav_original(value)

if key.endswith(".json"):
value = decode_json(value)

return key, value

########################################################################################
# default pipeline loading data from tar files into a tuple (sample_id, x, y)
@@ -157,7 +138,6 @@ def decode_original(element: Tuple[str, StreamWrapper]):


def construct_sample_datapipe(
is_augmented: bool,
shard_folder: pathlib.Path,
num_workers: int,
buffer_size: int = 0,
@@ -193,38 +173,21 @@ def construct_sample_datapipe(
dp = TarArchiveLoader(dp, mode="r")

# decode each file in the tar to the expected python dataformat
if is_augmented:
dp = Mapper(dp, decode)
else:
dp = Mapper(dp, decode_original)
dp = Mapper(dp, decode)

# each file in the tar is expected to have the format `{key}.{ext}`
# this groups all files with the same key into one dictionary
dp = WebDataset(dp)

# transform the dictionaries into tuple (sample_id, x, y)
if is_augmented:
dp = Mapper(dp, map_dict_to_tuple)
else:
dp = Mapper(dp, map_dict_to_tuple_original)
dp = Mapper(dp, map_dict_to_tuple)


# buffer tuples to increase variability
if buffer_size > 0:
dp = Shuffler(dp, buffer_size=buffer_size)
return dp
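
Editor's note: the WebDataset step groups per-extension entries that share one key (e.g. 0001.wav and 0001.json) into a single dictionary, which map_dict_to_tuple then flattens into a Sample tuple. A hedged illustration with made-up data, assuming map_dict_to_tuple mirrors the removed _original variant's handling of class_idx:

    import torch as t

    element = {
        ".wav": t.zeros(48_000),  # 3 seconds of 16 kHz audio (dummy values)
        ".json": {"sample_id": "id00012_xyz", "class_idx": 7},
    }
    sample_id, wav, gt = map_dict_to_tuple(element)
    print(sample_id, wav.shape, gt)  # id00012_xyz torch.Size([48000]) tensor(7)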

def map_dict_to_tuple_original(x: Dict) -> Sample:
sample_id = x[".json"]["sample_id"] + "_org"
wav = x[".wav"]

class_idx = x[".json"]["class_idx"]
if class_idx is None:
gt = None
else:
gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
return Sample(sample_id, wav, gt)

def map_dict_to_tuple(x: Dict) -> Sample:
sample_id = x[".json"]["sample_id"]
wav = x[".wav"]
@@ -305,7 +268,9 @@ def pipe_batch_samples(


def _print_sample(dp):
cnt = 0
for sample in dp:
cnt += 1
sample_id, x, y = sample
print(f"{sample_id=}\n")

@@ -315,6 +280,7 @@ def _print_sample(dp):
print(y)
print(f"{y.shape=}")
print(f"{y.dtype=}\n")
break

def _debug():
shard_path = pathlib.Path(
@@ -324,9 +290,7 @@ def _debug():
n_mfcc = 40

print("### construct_sample_datapipe ###")
dp = construct_sample_datapipe(True, shard_path, num_workers=0)
dp_org = construct_sample_datapipe(False, shard_path, num_workers=0)
dp = dp_org.concat(dp)
dp = construct_sample_datapipe(shard_path, num_workers=0)
_print_sample(dp)

print("### pipe_chunk_sample ###")
21 changes: 10 additions & 11 deletions skeleton/data/tiny_voxceleb.py
@@ -56,16 +56,16 @@ def __init__(

def setup(self, stage: Optional[str] = None) -> None:
# train dataloader (non-augmented)
train_dp_original = construct_sample_datapipe(not self.is_augmented,
self.shard_folder / "train", num_workers=self.num_workers_train
)
train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
self.train_dp_original = train_dp_original
# train_dp_original = construct_sample_datapipe(not self.is_augmented,
# self.shard_folder / "train", num_workers=self.num_workers_train
# )
# train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
# train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
# train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
# self.train_dp_original = train_dp_original

# train dataloader (augmented)
train_dp = construct_sample_datapipe(self.is_augmented,
train_dp = construct_sample_datapipe(
self.shard_folder / "train", num_workers=self.num_workers_train
)
# train_dp = train_dp_original.concat(train_dp)
@@ -77,7 +77,7 @@ def setup(self, stage: Optional[str] = None) -> None:
# self.train_dp = self.train_dp_original.concat(self.train_dp)

# val dataloader
val_dp = construct_sample_datapipe(not self.is_augmented,
val_dp = construct_sample_datapipe(
self.shard_folder / "val", num_workers=self.num_workers_eval
)
val_dp = pipe_chunk_sample(val_dp, self.chunk_length_num_frames)
@@ -87,15 +87,14 @@ def setup(self, stage: Optional[str] = None) -> None:

# dev dataloader
# we explicitly evaluate with a batch size of 1 and the whole utterance
dev_dp = construct_sample_datapipe(not self.is_augmented,
dev_dp = construct_sample_datapipe(
self.shard_folder / "dev", num_workers=self.num_workers_eval
)
dev_dp = pipe_mfcc(dev_dp, self.n_mfcc)
dev_dp = pipe_batch_samples(dev_dp, batch_size=1, drop_last=False)
self.dev_dp = dev_dp

def train_dataloader(self) -> TRAIN_DATALOADERS:
self.train_dp = torch.utils.data.ChainDataset([self.train_dp_original, self.train_dp])
return torch.utils.data.DataLoader(
self.train_dp, batch_size=None, num_workers=self.num_workers_train
)
2 changes: 0 additions & 2 deletions skeleton/layers/resnet.py
@@ -17,8 +17,6 @@ def __init__(self, triples):
modules.append(nn.Sequential(
nn.AdaptiveAvgPool1d(3),
nn.Flatten(),
nn.LazyLinear(256), nn.ReLU(),
nn.Dropout(p=0.5),
nn.LazyLinear(128)
))
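
Editor's note: with the intermediate LazyLinear(256)/ReLU/Dropout removed, the head now pools, flattens, and projects straight to 128 dimensions. A quick shape check of the simplified head in isolation; the channel and frame counts below are made up, since the real ones come from the preceding ResNet blocks:

    import torch
    import torch.nn as nn

    head = nn.Sequential(
        nn.AdaptiveAvgPool1d(3),  # (B, C, T) -> (B, C, 3)
        nn.Flatten(),             # -> (B, C * 3)
        nn.LazyLinear(128),       # in_features inferred on the first forward pass
    )
    print(head(torch.randn(8, 2048, 60)).shape)  # torch.Size([8, 128])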

24 changes: 18 additions & 6 deletions skeleton/models/prototype.py
@@ -14,6 +14,7 @@
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import torch.optim

from pytorch_lightning import LightningModule
from torchmetrics import Accuracy
@@ -24,6 +25,7 @@
evaluate_speaker_trials,
)
from skeleton.layers.resnet import ResNet
from skeleton.layers.LSTM import TorchLSTMNet

from skeleton.layers.statistical_pooling import MeanStatPool1D

@@ -49,6 +51,7 @@ def __init__(
self.num_embedding = num_embedding
self.num_speakers = num_speakers
self.learning_rate = learning_rate
self.original_lr = learning_rate

# evaluation data
self.val_trials = val_trials
@@ -67,6 +70,8 @@ def __init__(
nn.ReLU(),
)

self.lstm = TorchLSTMNet(1, num_embedding)

self.resnet = ResNet(((num_embedding, 2, num_embedding*2),(num_embedding*2, 2, num_embedding*4), (num_embedding*4, 2, num_embedding*8), (num_embedding*8, 2, num_embedding*16)))

# Pooling layer
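
Editor's note: TorchLSTMNet comes from skeleton/layers/LSTM.py, which is not included in this diff, and self.lstm is instantiated above without yet being used in compute_embedding. A hypothetical sketch of such a wrapper, assuming it maps a 1-channel sequence to a num_embedding-sized vector via the final hidden state:

    import torch
    import torch.nn as nn

    class TorchLSTMNet(nn.Module):
        # hypothetical: (batch, channels, time) -> (batch, hidden_size)
        def __init__(self, input_size: int, hidden_size: int):
            super().__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            x = x.transpose(1, 2)       # -> (batch, time, channels)
            _, (h_n, _) = self.lstm(x)  # h_n: (num_layers, batch, hidden_size)
            return h_n[-1]              # final layer's hidden state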
@@ -109,28 +114,34 @@ def forward(self, spectrogram: t.Tensor) -> Tuple[t.Tensor, t.Tensor]:
def compute_embedding(self, spectrogram: t.Tensor) -> t.Tensor:
# modify to your liking!
feature_representation = self.embedding_layer(spectrogram) # -> [128,128,239]

resnet_output = self.resnet(feature_representation)


resnet_output = resnet_output[:, :, None] # -> ([128, 128, 1])

embedding = self.pooling_layer(resnet_output) # -> [128, 128]

return embedding

def compute_prediction(self, embedding: t.Tensor) -> t.Tensor:
# modify to your liking!
# embedding = embedding[None, :, :]
prediction = self.prediction_layer(embedding)

# print(prediction.shape)
return prediction


# @property
# def automatic_optimization(self) -> bool:
# return False

def training_step(
self, batch: Tuple[List[str], t.Tensor, t.Tensor], *args, **kwargs
) -> t.Tensor:
# first unwrap the batch into the input tensor and ground truth labels
# opt = self.optimizers()
sample_id, network_input, speaker_labels = batch

# opt = self.optimizers()
# opt.zero_grad()
assert network_input.shape[0] == speaker_labels.shape[0]
assert network_input.shape[1] == self.num_inp_features
assert len(network_input.shape) == 3
@@ -140,7 +151,8 @@ def training_step(

# based on the output of the forward pass we compute the loss
loss = self.loss_fn(prediction, speaker_labels)

# self.manual_backward(loss)
# opt.step()
# based on the output of the forward pass we compute some metrics
self.train_acc(prediction, speaker_labels)

@@ -244,7 +256,7 @@ def configure_optimizers(self):
# Adapt schedule to your liking :).
schedule = {
# Required: the scheduler instance.
"scheduler": t.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1.0),
"scheduler": t.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8),
# The unit of the scheduler's step size, could also be 'step'.
# 'epoch' updates the scheduler on epoch end whereas 'step'
# updates it after an optimizer update.
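
Editor's note: the scheduler change is the functional part of this hunk. With step_size=10 and gamma=0.8 the learning rate now drops by 20% every 10 epochs instead of staying constant (the old gamma=1.0). A quick check of the resulting schedule; the base learning rate of 3e-4 is only an example:

    import torch

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.Adam(params, lr=3e-4)  # example base LR
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)
    for epoch in range(30):
        if epoch % 10 == 0:
            print(epoch, scheduler.get_last_lr())  # ~3e-4, then 2.4e-4, then 1.92e-4
        optimizer.step()
        scheduler.step()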
