diff --git a/cli_train.py b/cli_train.py
index a6428d7..47fcc19 100755
--- a/cli_train.py
+++ b/cli_train.py
@@ -153,6 +153,7 @@ def main(
         max_epochs=epochs,
         accelerator="gpu" if use_gpu else "cpu",
         devices=1,
+        auto_lr_find=True,
         callbacks=[checkpointer, LearningRateMonitor()],
         logger=[tensorboard_logger, csv_logger],
         default_root_dir="logs",
diff --git a/experiments/experiment_1_cluster.sh b/experiments/experiment_1_cluster.sh
index 7bc18a0..dea89dd 100755
--- a/experiments/experiment_1_cluster.sh
+++ b/experiments/experiment_1_cluster.sh
@@ -21,9 +21,9 @@ val_trials_path=./data/tiny-voxceleb/val_trials.txt
 dev_trials_path=./data/tiny-voxceleb/dev_trials.txt

 # hyperparameters for optimization
-batch_size=128
-learning_rate=3e-3
-num_epochs=30
+batch_size=64
+learning_rate=2e-3
+num_epochs=40
 num_workers=5

 # hyperparameters related to data pre-processing and network architecture
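A minimal sketch of what the new auto_lr_find flag implies, assuming this project is on PyTorch Lightning 1.x (where the flag exists; it was removed from the 2.x Trainer): the flag on its own does not change training, because the learning-rate finder only runs when trainer.tune() is called before trainer.fit(), and the LightningModule must expose a learning_rate (or lr) attribute for the finder to overwrite. The model and datamodule names below are placeholders for this repo's own module and datamodule, not definitions from the patch.

# Sketch only (PyTorch Lightning 1.x assumed).
import pytorch_lightning as pl

trainer = pl.Trainer(
    max_epochs=10,
    accelerator="cpu",
    devices=1,
    auto_lr_find=True,
)
# trainer.tune(model, datamodule=datamodule)  # runs the LR finder and overwrites model.learning_rate
# trainer.fit(model, datamodule=datamodule)   # fit() alone would not trigger the LR finder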
diff --git a/skeleton/data/datapipe.py b/skeleton/data/datapipe.py
index 5c143f4..f364e74 100644
--- a/skeleton/data/datapipe.py
+++ b/skeleton/data/datapipe.py
@@ -76,8 +76,8 @@ def random_gain_aug(data, minimum=0.1, maximum=0.12): #change the percieved loud
     return data * gain #scale but in amplitude


 def randomize_effect():
-    effects = ['inject_noise', 'rd_speed_change','rand_gain', 'reverb', 'none']
-    choice = np.random.choice(effects, 1, p=[0.1,0.1,0.1,0.2,0.5]) # if aug on everything then not rep of test dataset
+    effects = ['inject_noise', 'rd_speed_change', 'none']
+    choice = np.random.choice(effects, 1, p=[0.05,0.05,0.9]) # if aug on everything then not rep of test dataset
     return choice

@@ -85,11 +85,27 @@ def decode_wav(value: StreamWrapper) -> t.Tensor:
     assert isinstance(value, StreamWrapper)

     value, sample_rate = torchaudio.load(value)
-    # choice = randomize_effect()
-    # if choice == 'inject_noise':
-    #     value = inject_noise(value, 0.01)
-    # elif choice == 'rd_speed_change':
-    #     value = random_speed_change(value, sample_rate)
+    choice = randomize_effect()
+    if choice == 'inject_noise':
+        value = inject_noise(value, 0.01)
+    elif choice == 'rd_speed_change':
+        value = random_speed_change(value, sample_rate)
+    # elif choice == 'rand_gain':
+    #     value= random_gain_aug(value, minimum=0.1, maximum=0.12)
+    # elif choice == 'reverb':
+    #     value= reverb_aug(value,sample_rate)
+
+    assert sample_rate == 16_000
+
+    # make sure that audio has 1 dimension
+    value = torch.squeeze(value)
+
+    return value
+
+def decode_wav_original(value: StreamWrapper) -> t.Tensor:
+    assert isinstance(value, StreamWrapper)
+
+    value, sample_rate = torchaudio.load(value)

     assert sample_rate == 16_000

@@ -120,6 +136,19 @@ def decode(element: Tuple[str, StreamWrapper]):

     return key, value

+def decode_original(element: Tuple[str, StreamWrapper]):
+    assert isinstance(element, tuple) and len(element) == 2
+    key, value = element
+    assert isinstance(key, str)
+    assert isinstance(value, StreamWrapper)
+
+    if key.endswith(".wav"):
+        value = decode_wav_original(value)
+
+    if key.endswith(".json"):
+        value = decode_json(value)
+
+    return key, value

 ########################################################################################
 # default pipeline loading data from tar files into a tuple (sample_id, x, y)
@@ -128,6 +157,7 @@ def decode(element: Tuple[str, StreamWrapper]):


 def construct_sample_datapipe(
+    is_augmented: bool,
     shard_folder: pathlib.Path,
     num_workers: int,
     buffer_size: int = 0,
@@ -163,20 +193,37 @@ def construct_sample_datapipe(
     dp = TarArchiveLoader(dp, mode="r")

     # decode each file in the tar to the expected python dataformat
-    dp = Mapper(dp, decode)
+    if is_augmented:
+        dp = Mapper(dp, decode)
+    else:
+        dp = Mapper(dp, decode_original)

     # each file in the tar is expected to have the format `{key}.{ext}
     # this groups all files with the same key into one dictionary
     dp = WebDataset(dp)

     # transform the dictionaries into tuple (sample_id, x, y)
-    dp = Mapper(dp, map_dict_to_tuple)
+    if is_augmented:
+        dp = Mapper(dp, map_dict_to_tuple)
+    else:
+        dp = Mapper(dp, map_dict_to_tuple_original)
+
     # buffer tuples to increase variability
     if buffer_size > 0:
         dp = Shuffler(dp, buffer_size=buffer_size)

     return dp

+def map_dict_to_tuple_original(x: Dict) -> Sample:
+    sample_id = x[".json"]["sample_id"] + "_org"
+    wav = x[".wav"]
+
+    class_idx = x[".json"]["class_idx"]
+    if class_idx is None:
+        gt = None
+    else:
+        gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
+    return Sample(sample_id, wav, gt)

 def map_dict_to_tuple(x: Dict) -> Sample:
     sample_id = x[".json"]["sample_id"]
@@ -187,7 +234,7 @@ def map_dict_to_tuple(x: Dict) -> Sample:
         gt = None
     else:
         gt = t.tensor(x[".json"]["class_idx"], dtype=t.int64)
-
+    # print("Augmented: ", Sample(sample_id, wav, gt))
     return Sample(sample_id, wav, gt)

@@ -268,25 +315,6 @@ def _print_sample(dp):
         print(y)
         print(f"{y.shape=}")
         print(f"{y.dtype=}\n")
-        break
-
-def debug_an():
-    shard_path = pathlib.Path(
-        "/home/anilsson/mlip/tiny-voxceleb-skeleton-2023/data/tiny-voxceleb-shards/train"
-    )
-
-    n_mfcc = 40
-
-    print("### construct_sample_datapipe ###")
-    dp = construct_sample_datapipe(shard_path, num_workers=0)
-    _print_sample(dp)
-
-    print("### pipe_chunk_sample ###")
-    dp = pipe_chunk_sample(dp, 16_000 * 3)  # 3 seconds
-
-    _print_sample(dp)
-
-
 def _debug():
     shard_path = pathlib.Path(
@@ -296,12 +324,13 @@ def _debug():

     n_mfcc = 40

     print("### construct_sample_datapipe ###")
-    dp = construct_sample_datapipe(shard_path, num_workers=0)
+    dp = construct_sample_datapipe(True, shard_path, num_workers=0)
+    dp_org = construct_sample_datapipe(False, shard_path, num_workers=0)
+    dp = dp_org.concat(dp)
     _print_sample(dp)

     print("### pipe_chunk_sample ###")
     dp = pipe_chunk_sample(dp, 16_000 * 3)  # 3 seconds
-
     _print_sample(dp)

@@ -315,10 +344,5 @@ def _debug():

     _print_sample(dp)

-
-
-
-
 if __name__ == "__main__":
-    #_debug()
-    debug_an()
+    _debug()
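A standalone check of the numpy behaviour the augmentation branch relies on: np.random.choice(effects, 1, p=...) returns a length-1 ndarray, so the `if choice == 'inject_noise'` tests in decode_wav compare elementwise and depend on the truthiness of a one-element array. Dropping the size argument (or indexing with [0]) yields a plain string and keeps the branching unambiguous. The effect names and probabilities below just mirror the ones in the patch.

# Standalone illustration; requires only numpy.
import numpy as np

effects = ['inject_noise', 'rd_speed_change', 'none']

choice = np.random.choice(effects, 1, p=[0.05, 0.05, 0.9])  # as in randomize_effect()
print(type(choice))      # <class 'numpy.ndarray'>, length 1
print(choice == 'none')  # elementwise comparison -> boolean array of length 1

choice = np.random.choice(effects, p=[0.05, 0.05, 0.9])     # no size argument -> scalar
print(type(choice))      # <class 'numpy.str_'>
print(choice == 'none')  # plain bool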
diff --git a/skeleton/data/tiny_voxceleb.py b/skeleton/data/tiny_voxceleb.py
index 38bc9cc..bee059b 100644
--- a/skeleton/data/tiny_voxceleb.py
+++ b/skeleton/data/tiny_voxceleb.py
@@ -46,24 +46,38 @@ def __init__(
         self.shard_folder = shard_folder
         self.val_trials_path = val_trials_path
         self.dev_trials_path = dev_trials_path
+        self.is_augmented = True

         # init in setup()
+        self.train_dp_original = None
         self.train_dp = None
         self.val_dp = None
         self.dev_dp = None

     def setup(self, stage: Optional[str] = None) -> None:
-        # train dataloader
-        train_dp = construct_sample_datapipe(
+        # train dataloader (non-augmented)
+        train_dp_original = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "train", num_workers=self.num_workers_train
         )
+        train_dp_original = pipe_chunk_sample(train_dp_original, self.chunk_length_num_frames)
+        train_dp_original = pipe_mfcc(train_dp_original, self.n_mfcc)
+        train_dp_original = pipe_batch_samples(train_dp_original, self.batch_size, drop_last=True)
+        self.train_dp_original = train_dp_original
+
+        # train dataloader (augmented)
+        train_dp = construct_sample_datapipe(self.is_augmented,
+            self.shard_folder / "train", num_workers=self.num_workers_train
+        )
+        # train_dp = train_dp_original.concat(train_dp)
         train_dp = pipe_chunk_sample(train_dp, self.chunk_length_num_frames)
         train_dp = pipe_mfcc(train_dp, self.n_mfcc)
         train_dp = pipe_batch_samples(train_dp, self.batch_size, drop_last=True)
         self.train_dp = train_dp
+        # self.train_dp = self.train_dp_original.concat(self.train_dp)
+
         # val dataloader
-        val_dp = construct_sample_datapipe(
+        val_dp = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "val", num_workers=self.num_workers_eval
         )
         val_dp = pipe_chunk_sample(val_dp, self.chunk_length_num_frames)
@@ -73,7 +87,7 @@ def setup(self, stage: Optional[str] = None) -> None:

         # dev dataloader
         # we explicitly evaluate with a batch size of 1 and the whole utterance
-        dev_dp = construct_sample_datapipe(
+        dev_dp = construct_sample_datapipe(not self.is_augmented,
             self.shard_folder / "dev", num_workers=self.num_workers_eval
         )
         dev_dp = pipe_mfcc(dev_dp, self.n_mfcc)
@@ -81,6 +95,7 @@ def setup(self, stage: Optional[str] = None) -> None:
         self.dev_dp = dev_dp

     def train_dataloader(self) -> TRAIN_DATALOADERS:
+        self.train_dp = torch.utils.data.ChainDataset([self.train_dp_original, self.train_dp])
         return torch.utils.data.DataLoader(
             self.train_dp, batch_size=None, num_workers=self.num_workers_train
         )
diff --git a/skeleton/layers/resnet.py b/skeleton/layers/resnet.py
index 2feebfe..5da277c 100644
--- a/skeleton/layers/resnet.py
+++ b/skeleton/layers/resnet.py
@@ -38,7 +38,7 @@ def block(self, in_channels,num_residuals, out_channels):
             if i == 0:
                 blk.append(ResidualBlock(in_channels, out_channels, 16 , 1, use_1x1conv=True))
             else:
-                blk.append(ResidualBlock(in_channels * 2, out_channels, 16, 1, use_1x1conv=True))
+                blk.append(ResidualBlock(in_channels * 2, out_channels, 16, 1))
         return nn.Sequential(*blk)

     def forward(self, x):
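A toy sketch of what the ChainDataset call added to train_dataloader() does: the non-augmented pipeline is exhausted first, then the augmented one, so one pass over the chained loader sees both copies of the training data. The Toy class below is a stand-in for the batched IterDataPipes built in setup(), not code from the repo. One side note on the design: because train_dataloader() reassigns self.train_dp, calling it more than once in the same process would nest chains and make the non-augmented data appear twice per pass.

# Toy illustration; Toy stands in for the real datapipes.
from torch.utils.data import ChainDataset, DataLoader, IterableDataset


class Toy(IterableDataset):
    def __init__(self, tag, n):
        self.tag, self.n = tag, n

    def __iter__(self):
        return iter(f"{self.tag}_{i}" for i in range(self.n))


chained = ChainDataset([Toy("org", 2), Toy("aug", 2)])
loader = DataLoader(chained, batch_size=None, num_workers=0)
print(list(loader))  # ['org_0', 'org_1', 'aug_0', 'aug_1']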