expands the training set

jonatelintelo · Mar 15, 2023 · ff28d6f · ff28d6f
1 parent 3a0a9e3
commit ff28d6f
Show file tree

Hide file tree

Showing 5 changed files with 85 additions and 45 deletions.
diff --git a/cli_train.py b/cli_train.py
@@ -153,6 +153,7 @@ def main(
         max_epochs=epochs,
         accelerator="gpu" if use_gpu else "cpu",
         devices=1,
+        auto_lr_find=True,
         callbacks=[checkpointer, LearningRateMonitor()],
         logger=[tensorboard_logger, csv_logger],
         default_root_dir="logs",

diff --git a/experiments/experiment_1_cluster.sh b/experiments/experiment_1_cluster.sh
@@ -21,9 +21,9 @@ val_trials_path=./data/tiny-voxceleb/val_trials.txt
 dev_trials_path=./data/tiny-voxceleb/dev_trials.txt
 
 # hyperparameters for optimization
-batch_size=128
-learning_rate=3e-3
-num_epochs=30
+batch_size=64
+learning_rate=2e-3
+num_epochs=40
 num_workers=5
 
 # hyperparameters related to data pre-processing and network architecture

diff --git a/skeleton/data/datapipe.py b/skeleton/data/datapipe.py
@@ -76,20 +76,36 @@ def random_gain_aug(data, minimum=0.1, maximum=0.12): #change the perceived loud
     return data * gain #scale but in amplitude 
 
 def randomize_effect():
-    effects = ['inject_noise', 'rd_speed_change','rand_gain', 'reverb', 'none']
-    choice = np.random.choice(effects, 1, p=[0.1,0.1,0.1,0.2,0.5]) # if aug on everything then not rep of test dataset
+    effects = ['inject_noise', 'rd_speed_change', 'none']
+    choice = np.random.choice(effects, 1, p=[0.05,0.05,0.9]) # augment only a small fraction: augmenting every sample would make the training data unrepresentative of the test dataset
     return choice
 
 
 def decode_wav(value: StreamWrapper) -> t.Tensor:
     assert isinstance(value, StreamWrapper)
 
     value, sample_rate = torchaudio.load(value)
-    # choice = randomize_effect()
-    # if choice == 'inject_noise':
-    #     value = inject_noise(value, 0.01)
-    # elif choice == 'rd_speed_change':
-    #     value = random_speed_change(value, sample_rate)
+    choice = randomize_effect()
+    if choice == 'inject_noise':
+        value = inject_noise(value, 0.01)
+    elif choice == 'rd_speed_change':
+        value = random_speed_change(value, sample_rate)
+    # elif choice == 'rand_gain':
+    #     value= random_gain_aug(value, minimum=0.1, maximum=0.12)
+    # elif choice == 'reverb':
+    #     value= reverb_aug(value,sample_rate)
+
+    assert sample_rate == 16_000
+
+    # make sure that audio has 1 dimension
+    value = torch.squeeze(value)
+
+    return value
+
+def decode_wav_original(value: StreamWrapper) -> t.Tensor:
+    assert isinstance(value, StreamWrapper)
+
+    value, sample_rate = torchaudio.load(value)
 
     assert sample_rate == 16_000
 
@@ -120,6 +136,19 @@ def decode(element: Tuple[str, StreamWrapper]):
 
     return key, value
 
+def decode_original(element: Tuple[str, StreamWrapper]):
+    assert isinstance(element, tuple) and len(element) == 2
+    key, value = element
+    assert isinstance(key, str)
+    assert isinstance(value, StreamWrapper)
+
+    if key.endswith(".wav"):
+        value = decode_wav_original(value)
+
+    if key.endswith(".json"):
+        value = decode_json(value)
+
+    return key, value
 
 ########################################################################################
 # default pipeline loading data from tar files into a tuple (sample_id, x, y)
@@ -128,6 +157,7 @@ def decode(element: Tuple[str, StreamWrapper]):
 
 
 def construct_sample_datapipe(
+    is_augmented: bool,
     shard_folder: pathlib.Path,
     num_workers: int,
     buffer_size: int = 0,
@@ -163,20 +193,37 @@ def construct_sample_datapipe(
     dp = TarArchiveLoader(dp, mode="r")
 
     # decode each file in the tar to the expected python dataformat
-    dp = Mapper(dp, decode)
+    if is_augmented:
+        dp = Mapper(dp, decode)
+    else:
+        dp = Mapper(dp, decode_original)
 
     # each file in the tar is expected to have the format `{key}.{ext}
     # this groups all files with the same key into one dictionary
     dp = WebDataset(dp)
 
     # transform the dictionaries into tuple (sample_id, x, y)
-    dp = Mapper(dp, map_dict_to_tuple)
+    if is_augmented:
+        dp = Mapper(dp, map_dict_to_tuple)
+    else:
+        dp = Mapper(dp, map_dict_to_tuple_original)
+
 
     # buffer tuples to increase variability
     if buffer_size > 0:
         dp = Shuffler(dp, buffer_size=buffer_size)
     return dp
 
+def map_dict_to_tuple_original(x: Dict) -> Sample:
+    sample_id = x[".json"]["sample_id"] + "_org"
+    wav = x[".wav"]
+
+    class_idx = x[".json"]["class_idx"]
+    if class_idx is None:
+        gt = None
+    else:
+        gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
+    return Sample(sample_id, wav, gt)
 
 def map_dict_to_tuple(x: Dict) -> Sample:
     sample_id = x[".json"]["sample_id"]
@@ -187,7 +234,7 @@ def map_dict_to_tuple(x: Dict) -> Sample:
         gt = None
     else:
         gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
-
+    # print("Augmented: ", Sample(sample_id, wav, gt))
     return Sample(sample_id, wav, gt)
 
 
@@ -268,25 +315,6 @@ def _print_sample(dp):
         print(y)
         print(f"{y.shape=}")
         print(f"{y.dtype=}\n")
-        break
-
-def debug_an():
-    shard_path = pathlib.Path(
-        "/home/anilsson/mlip/tiny-voxceleb-skeleton-2023/data/tiny-voxceleb-shards/train"
-    )
-
-    n_mfcc = 40
-
-    print("### construct_sample_datapipe ###")
-    dp = construct_sample_datapipe(shard_path, num_workers=0)
-    _print_sample(dp)
-
-    print("### pipe_chunk_sample ###")
-    dp = pipe_chunk_sample(dp, 16_000 * 3)  # 3 seconds
-
-    _print_sample(dp)
-
-
 
 def _debug():
     shard_path = pathlib.Path(
@@ -296,12 +324,13 @@ def _debug():
     n_mfcc = 40
 
     print("### construct_sample_datapipe ###")
-    dp = construct_sample_datapipe(shard_path, num_workers=0)
+    dp = construct_sample_datapipe(True, shard_path, num_workers=0)
+    dp_org = construct_sample_datapipe(False, shard_path, num_workers=0)
+    dp = dp_org.concat(dp)
     _print_sample(dp)
 
     print("### pipe_chunk_sample ###")
     dp = pipe_chunk_sample(dp, 16_000 * 3)  # 3 seconds
-
     _print_sample(dp)
 
 
@@ -315,10 +344,5 @@ def _debug():
     _print_sample(dp)
 
 
-
-
-
-
 if __name__ == "__main__":
-    #_debug()
-    debug_an()
+    _debug()
diff --git a/skeleton/data/tiny_voxceleb.py b/skeleton/data/tiny_voxceleb.py
@@ -46,24 +46,38 @@ def __init__(
         self.shard_folder = shard_folder
         self.val_trials_path = val_trials_path
         self.dev_trials_path = dev_trials_path
+        self.is_augmented = True
 
         # init in setup()
+        self.train_dp_original = None
         self.train_dp = None
         self.val_dp = None
         self.dev_dp = None
 
     def setup(self, stage: Optional[str] = None) -> None:
-        # train dataloader
-        train_dp = construct_sample_datapipe(
+        # train dataloader (non-augmented)
+        train_dp_original = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "train", num_workers=self.num_workers_train
         )
+        train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
+        train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
+        train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
+        self.train_dp_original = train_dp_original
+
+        # train dataloader (augmented)
+        train_dp = construct_sample_datapipe(self.is_augmented,
+            self.shard_folder / "train", num_workers=self.num_workers_train
+        )
+        # train_dp = train_dp_original.concat(train_dp)
         train_dp = pipe_chunk_sample(train_dp, self.chunk_length_num_frames)
         train_dp = pipe_mfcc(train_dp, self.n_mfcc)
         train_dp = pipe_batch_samples(train_dp, self.batch_size, drop_last=True)
         self.train_dp = train_dp
 
+        # self.train_dp = self.train_dp_original.concat(self.train_dp)
+
         # val dataloader
-        val_dp = construct_sample_datapipe(
+        val_dp = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "val", num_workers=self.num_workers_eval
         )
         val_dp = pipe_chunk_sample(val_dp, self.chunk_length_num_frames)
@@ -73,14 +87,15 @@ def setup(self, stage: Optional[str] = None) -> None:
 
         # dev dataloader
         # we explicitly evaluate with a batch size of 1 and the whole utterance
-        dev_dp = construct_sample_datapipe(
+        dev_dp = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "dev", num_workers=self.num_workers_eval
         )
         dev_dp = pipe_mfcc(dev_dp, self.n_mfcc)
         dev_dp = pipe_batch_samples(dev_dp, batch_size=1, drop_last=False)
         self.dev_dp = dev_dp
 
     def train_dataloader(self) -> TRAIN_DATALOADERS:
+        self.train_dp = torch.utils.data.ChainDataset([self.train_dp_original, self.train_dp])
         return torch.utils.data.DataLoader(
             self.train_dp, batch_size=None, num_workers=self.num_workers_train
         )

diff --git a/skeleton/layers/resnet.py b/skeleton/layers/resnet.py
@@ -38,7 +38,7 @@ def block(self, in_channels,num_residuals, out_channels):
             if i == 0:
                 blk.append(ResidualBlock(in_channels, out_channels, 16 , 1, use_1x1conv=True))
             else:
-                blk.append(ResidualBlock(in_channels * 2, out_channels, 16, 1, use_1x1conv=True))
+                blk.append(ResidualBlock(in_channels * 2, out_channels, 16, 1))
         return nn.Sequential(*blk)
 
     def forward(self, x):