diff --git a/skeleton/data/datapipe.py b/skeleton/data/datapipe.py
index 854ead5..4013326 100644
--- a/skeleton/data/datapipe.py
+++ b/skeleton/data/datapipe.py
@@ -59,7 +59,7 @@ def random_speed_change(data, sample_rate):
 
 def randomize_effect():
     effects = ['inject_noise', 'rd_speed_change', 'none']
-    choice = np.random.choice(effects, 1, p=[0.3, 0.2, 0.5])
+    choice = np.random.choice(effects, 1, p=[0.1, 0.1, 0.8])
     return choice
 
 
@@ -67,11 +67,11 @@ def decode_wav(value: StreamWrapper) -> t.Tensor:
     assert isinstance(value, StreamWrapper)
 
     value, sample_rate = torchaudio.load(value)
-    choice = randomize_effect()
-    if choice == 'inject_noise':
-        value = inject_noise(value, 0.01)
-    elif choice == 'rd_speed_change':
-        value = random_speed_change(value, sample_rate)
+    # choice = randomize_effect()
+    # if choice == 'inject_noise':
+    #     value = inject_noise(value, 0.01)
+    # elif choice == 'rd_speed_change':
+    #     value = random_speed_change(value, sample_rate)
 
     assert sample_rate == 16_000
 
diff --git a/skeleton/layers/residual_block.py b/skeleton/layers/residual_block.py
index a07cd62..0ae79b0 100644
--- a/skeleton/layers/residual_block.py
+++ b/skeleton/layers/residual_block.py
@@ -3,22 +3,30 @@ from torch.nn import functional as F
 
 
 class ResidualBlock(nn.Module):
-    def __init__(self,out_channels, use_1x1conv=False, strides=1, kernel_size=3, padding=1):
+    def __init__(self, in_channels, out_channels, groups, bot_mul, use_1x1conv=False, strides=1, kernel_size=3, padding=1):
         super().__init__()
-        self.conv1 = nn.LazyConv1d(out_channels, kernel_size=kernel_size, padding=padding,
-                                   stride=strides)
-        self.conv2 = nn.LazyConv1d(out_channels, kernel_size=kernel_size, padding=padding)
+        bot_channels = int(round(out_channels * bot_mul))
+
+        self.conv1 = nn.Conv1d(in_channels, bot_channels, kernel_size=kernel_size, padding=2,
+                               stride=1)
+        self.conv2 = nn.Conv1d(out_channels, bot_channels, kernel_size=kernel_size, stride=strides, padding=padding, groups=bot_channels//groups)
+        self.conv3 = nn.LazyConv1d(out_channels, kernel_size=kernel_size, stride=1)
+
         if use_1x1conv:
-            self.conv3 = nn.LazyConv1d(out_channels, kernel_size=1, stride=strides)
+            self.conv4 = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=strides)
+            self.bn4 = nn.BatchNorm1d(out_channels)
         else:
-            self.conv3 = None
+            self.conv4 = None
         self.bn1 = nn.BatchNorm1d(out_channels)
         self.bn2 = nn.BatchNorm1d(out_channels)
+        self.bn3 = nn.BatchNorm1d(out_channels)
 
     def forward(self, X):
         Y = F.relu(self.bn1(self.conv1(X)))
-        Y = self.bn2(self.conv2(Y))
-        if self.conv3:
-            X = self.conv3(X)
+        Y = F.relu(self.bn2(self.conv2(Y)))
+
+        Y = self.bn3(self.conv3(Y))
+        if self.conv4:
+            X = self.bn4(self.conv4(X))
         Y += X
         return F.relu(Y)
\ No newline at end of file
diff --git a/skeleton/layers/resnet.py b/skeleton/layers/resnet.py
index 8fd088e..2feebfe 100644
--- a/skeleton/layers/resnet.py
+++ b/skeleton/layers/resnet.py
@@ -10,12 +10,12 @@ def __init__(self, triples):
         modules = []
         modules.append(self.starting_block(128))
         for _, triple in enumerate(triples):
-            num_residuals, out_channels = triple[0], triple[1]
-            block = self.block(num_residuals,out_channels)
+            in_channels, num_residuals, out_channels = triple[0], triple[1], triple[2]
+            block = self.block(in_channels, num_residuals,out_channels)
             modules.append(block)
-        modules.append(nn.Sequential(nn.ReLU(), nn.AdaptiveAvgPool1d(3)))
         modules.append(nn.Sequential(
+            nn.AdaptiveAvgPool1d(3),
             nn.Flatten(), nn.LazyLinear(256), nn.ReLU(),
             nn.Dropout(p=0.5),
@@ -32,10 +32,13 @@ def starting_block(self, input_channels):
             nn.ReLU(),
             nn.MaxPool1d(kernel_size=3, stride=2, padding=1))
 
-    def block(self, num_residuals, out_channels):
+    def block(self, in_channels,num_residuals, out_channels):
         blk = []
-        for _ in range(num_residuals):
-            blk.append(ResidualBlock(out_channels, use_1x1conv=True))
+        for i in range(num_residuals):
+            if i == 0:
+                blk.append(ResidualBlock(in_channels, out_channels, 16 , 1, use_1x1conv=True))
+            else:
+                blk.append(ResidualBlock(in_channels * 2, out_channels, 16, 1, use_1x1conv=True))
         return nn.Sequential(*blk)
 
     def forward(self, x):
diff --git a/skeleton/models/prototype.py b/skeleton/models/prototype.py
index 80e5c02..9c994e5 100644
--- a/skeleton/models/prototype.py
+++ b/skeleton/models/prototype.py
@@ -67,7 +67,7 @@ def __init__(
             nn.ReLU(),
         )
 
-        self.resnet = ResNet(((2, num_embedding*2), (2,num_embedding*4),(2, num_embedding*8), (2, num_embedding*16)))
+        self.resnet = ResNet(((num_embedding, 2, num_embedding*2),(num_embedding*2, 2, num_embedding*4), (num_embedding*4, 2, num_embedding*8), (num_embedding*8, 2, num_embedding*16)))
 
         # Pooling layer
         # assuming input of shape [BATCH_SIZE, NUM_EMBEDDING, REDUCED_NUM_FRAMES]
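
For reviewers, below is a minimal, illustrative shape check of the refactored `ResidualBlock` (not part of the diff). It assumes the changes above are applied and that the package is importable as `skeleton.layers.residual_block`; `groups=16` and `bot_mul=1` mirror the values `ResNet.block()` passes in this branch, while the batch size, channel counts, and frame count are arbitrary placeholders.

```python
# Illustrative smoke test for the refactored ResidualBlock (assumes this diff is applied).
# groups=16 and bot_mul=1 match what ResNet.block() constructs above; the tensor sizes
# below are placeholders, not values taken from the training pipeline.
import torch
from skeleton.layers.residual_block import ResidualBlock

block = ResidualBlock(in_channels=128, out_channels=256, groups=16, bot_mul=1,
                      use_1x1conv=True)

x = torch.randn(4, 128, 100)   # [batch, channels, frames]
y = block(x)

# With bot_mul=1 and the default strides=1, conv1 (padding=2) adds two frames and
# conv3 (no padding) removes them again, so the main path and the 1x1 shortcut
# have matching shapes for the residual addition.
print(y.shape)                 # torch.Size([4, 256, 100])
```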