Wav2VecForLinguisticTonalForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2Vec2Encoder(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (padding): Wav2Vec2SamePadLayer()
        (activation): GELUActivation()
      )
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-11): 12 x Wav2Vec2EncoderLayer(
          (attention): Wav2Vec2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Wav2Vec2FeedForward(
            (intermediate_dropout): Dropout(p=0.1, inplace=False)
            (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
            (output_dense): Linear(in_features=3072, out_features=768, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (ling_emb): Embedding(123, 768, padding_idx=0)
  (pos_enc): PositionalEncoding()
  (mha): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (attn_norm): RMSNorm()
  (ffn_layer): Sequential(
    (0): RMSNorm()
    (1): Linear(in_features=768, out_features=1536, bias=True)
    (2): SwiGLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=768, out_features=768, bias=True)
  )
  (out_norm): RMSNorm()
  (lm_head): Linear(in_features=768, out_features=123, bias=True)
  (tonal_head): Linear(in_features=768, out_features=7, bias=True)
)
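RMSNorm, SwiGLU, and PositionalEncoding print without arguments because they are custom modules, so their internals are not visible in these dumps. The sketch below is an assumption about SwiGLU only: a gated-SiLU activation that halves the feature dimension, which is the one reading consistent with the ffn_layer shapes above (a Linear producing 1536 features feeding a Linear that expects 768). torch.nn.RMSNorm (PyTorch >= 2.4) stands in for the custom RMSNorm.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    # Gated SiLU: split the last dimension in half and gate one half with
    # the SiLU of the other, so 1536 features come in and 768 go out.
    # Assumed implementation; only the module name appears in the repr.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        value, gate = x.chunk(2, dim=-1)
        return value * F.silu(gate)

# Shape check against the printed ffn_layer Sequential:
ffn_layer = nn.Sequential(
    nn.RMSNorm(768),            # stand-in for the custom RMSNorm
    nn.Linear(768, 1536),
    SwiGLU(),
    nn.Dropout(0.1),
    nn.Linear(768, 768),
)
print(ffn_layer(torch.randn(2, 50, 768)).shape)  # torch.Size([2, 50, 768])
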
Wav2VecForLinguisticTonalForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2Vec2Encoder(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (padding): Wav2Vec2SamePadLayer()
        (activation): GELUActivation()
      )
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-11): 12 x Wav2Vec2EncoderLayer(
          (attention): Wav2Vec2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Wav2Vec2FeedForward(
            (intermediate_dropout): Dropout(p=0.1, inplace=False)
            (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
            (output_dense): Linear(in_features=3072, out_features=768, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (ling_emb): Embedding(123, 768, padding_idx=0)
  (pos_enc): PositionalEncoding()
  (mha): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (attn_norm): RMSNorm()
  (ffn_layer): Sequential(
    (0): RMSNorm()
    (1): Linear(in_features=768, out_features=1536, bias=True)
    (2): SwiGLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=768, out_features=768, bias=True)
  )
  (out_norm): RMSNorm()
  (lm_head): Linear(in_features=768, out_features=123, bias=True)
)
Wav2VecForLinguisticTonalForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2Vec2Encoder(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (padding): Wav2Vec2SamePadLayer()
        (activation): GELUActivation()
      )
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-11): 12 x Wav2Vec2EncoderLayer(
          (attention): Wav2Vec2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Wav2Vec2FeedForward(
            (intermediate_dropout): Dropout(p=0.1, inplace=False)
            (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
            (output_dense): Linear(in_features=3072, out_features=768, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=123, bias=True)
  (linguistic_head): LinguisticHead(
    (ling_emb): Embedding(123, 768, padding_idx=0)
    (pos_enc): PositionalEncoding()
    (ling_blocks): ModuleList(
      (0-3): 4 x LinguisticBlock(
        (attn_norm): RMSNorm()
        (cross_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (ffn_norm): RMSNorm()
        (ffn_layer): Sequential(
          (0): Linear(in_features=768, out_features=1536, bias=True)
          (1): SwiGLU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=768, out_features=768, bias=True)
        )
      )
    )
    (out_norm): RMSNorm()
  )
)
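The repr of LinguisticHead lists only the child modules of each LinguisticBlock; the residual wiring, the number of attention heads, and which sequence provides queries versus keys/values are not recoverable from the printout. Below is a minimal pre-norm sketch under the assumption that the linguistic embeddings act as queries over the wav2vec2 hidden states; the head count of 8 and the dropout placement are hypothetical.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    # Assumed gated-SiLU activation, as sketched after the first dump.
    def forward(self, x):
        value, gate = x.chunk(2, dim=-1)
        return value * F.silu(gate)

class LinguisticBlock(nn.Module):
    # Child modules match the repr above; the pre-norm residual wiring and
    # the head count are assumptions, not something the printout shows.
    def __init__(self, d_model=768, n_heads=8, p_drop=0.1):
        super().__init__()
        self.attn_norm = nn.RMSNorm(d_model)   # stand-in for the custom RMSNorm
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads,
                                                dropout=p_drop, batch_first=True)
        self.attn_dropout = nn.Dropout(p_drop)
        self.ffn_norm = nn.RMSNorm(d_model)
        self.ffn_layer = nn.Sequential(
            nn.Linear(d_model, 2 * d_model),
            SwiGLU(),
            nn.Dropout(p_drop),
            nn.Linear(d_model, d_model),
        )

    def forward(self, ling, acoustic, key_padding_mask=None):
        # ling:     linguistic token embeddings, shape (B, U, 768)
        # acoustic: wav2vec2 hidden states,      shape (B, T, 768)
        attn_out, _ = self.cross_attn(self.attn_norm(ling), acoustic, acoustic,
                                      key_padding_mask=key_padding_mask)
        ling = ling + self.attn_dropout(attn_out)
        ling = ling + self.ffn_layer(self.ffn_norm(ling))
        return ling

# Example: 4 stacked blocks, as in the ling_blocks ModuleList above.
blocks = nn.ModuleList([LinguisticBlock() for _ in range(4)])
ling, acoustic = torch.randn(2, 30, 768), torch.randn(2, 100, 768)
for blk in blocks:
    ling = blk(ling, acoustic)
print(ling.shape)  # torch.Size([2, 30, 768])
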
Wav2VecForLinguisticTonalForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2Vec2Encoder(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (padding): Wav2Vec2SamePadLayer()
        (activation): GELUActivation()
      )
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-11): 12 x Wav2Vec2EncoderLayer(
          (attention): Wav2Vec2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Wav2Vec2FeedForward(
            (intermediate_dropout): Dropout(p=0.1, inplace=False)
            (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
            (output_dense): Linear(in_features=3072, out_features=768, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (ling_emb): Embedding(123, 768, padding_idx=0)
  (pos_enc): PositionalEncoding()
  (mha): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (attn_norm): RMSNorm()
  (ffn_layer): Sequential(
    (0): RMSNorm()
    (1): Linear(in_features=768, out_features=1536, bias=True)
    (2): SwiGLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=768, out_features=768, bias=True)
  )
  (out_norm): RMSNorm()
  (lm_head): Linear(in_features=768, out_features=123, bias=True)
  (focal_ctc_loss): FocalCTCLoss(
    (ctc_crit): CTCLoss()
  )
)
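In this last variant the loss is registered as a module, but FocalCTCLoss prints only the wrapped CTCLoss, so the weighting scheme is not visible in the dump. The sketch below uses one common focal weighting of the CTC loss; the alpha, gamma, and blank-index values are hypothetical defaults, not taken from the model.

import torch
import torch.nn as nn

class FocalCTCLoss(nn.Module):
    # Sketch of a common focal weighting of CTC:
    #   loss = alpha * (1 - exp(-ctc))**gamma * ctc
    # Only the wrapped nn.CTCLoss appears in the repr; alpha, gamma, and
    # blank are assumptions for illustration.
    def __init__(self, blank=0, alpha=1.0, gamma=2.0):
        super().__init__()
        self.ctc_crit = nn.CTCLoss(blank=blank, reduction="none", zero_infinity=True)
        self.alpha, self.gamma = alpha, gamma

    def forward(self, log_probs, targets, input_lengths, target_lengths):
        # log_probs: (T, B, 123) log-softmax scores from lm_head
        ctc = self.ctc_crit(log_probs, targets, input_lengths, target_lengths)
        focal = self.alpha * (1.0 - torch.exp(-ctc)) ** self.gamma * ctc
        return focal.mean()

# Example call with random inputs (B=2, T=100, vocab=123):
log_probs = torch.randn(100, 2, 123).log_softmax(-1)
targets = torch.randint(1, 123, (2, 20))
loss = FocalCTCLoss()(log_probs, targets,
                      input_lengths=torch.full((2,), 100),
                      target_lengths=torch.full((2,), 20))
print(loss)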