# train.py

# -*- coding: utf-8 -*-
"""HF_wav2vec2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ZCBHaxBaFxXfxGG_LI00-DHE-IfcoumW

## Installation and Imports
"""

from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import torch
from typing import List, Tuple

"""## Basic Run"""

# Load the stock tokenizer and the CTC model from the same pretrained
# checkpoint. NOTE: `tokenizer` is rebound further down to an instance of the
# Wav2Vec2Tok subclass, so this stock instance is only a placeholder.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")


# Validation split of the "clean" configuration of the dummy LibriSpeech set;
# its "file" and "text" columns are read later in this script.
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# define function to read in sound file

def map_to_array(batch):
    """
    Decode the audio file referenced by ``batch["file"]``.

    The decoded samples are stored under ``batch["speech"]``; the second
    value returned by ``sf.read`` is discarded. The same (mutated) ``batch``
    mapping is returned so the function can be passed to ``Dataset.map``.
    """
    decoded = sf.read(batch["file"])
    # sf.read returns a pair; only the first element (the samples) is kept.
    batch["speech"] = decoded[0]
    return batch

# Read every sound file of the dummy dataset once and keep the decoded
# samples in a "speech" column next to the original columns.

ds = ds.map(map_to_array)

"""Tokenizer"""

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# tokenizer helpers below (which create tensors on `device`) remain usable on
# machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class Wav2Vec2Tok(Wav2Vec2Tokenizer):
    """
    Extending the base tokenizer of Wav2Vec2 for the purpose of encoding
    text sequences.

    The added helpers turn transcripts into character-level id sequences
    (``tokenize``), pad a batch of such sequences into tensors
    (``pad_batch_sentences``) and chain both steps (``batch_tokenize``).
    The tensors are created on the module-level ``device``.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def tokenize(self, text: str, **kwargs) -> List[int]:
        """
        Converts a single str into a sequence of token ids.

        Runs of whitespace are collapsed into one word delimiter token
        (leading/trailing whitespace is dropped), every remaining character
        becomes one token, and the result is wrapped in the BOS and EOS ids.

        Args:
            text: Transcript to encode.
            **kwargs: Accepted for signature compatibility; ignored.

        Returns:
            ``[bos_id, <one id per character>, eos_id]``.
        """
        # str.split() without an argument is what actually normalises
        # whitespace; split(' ') followed by ' '.join() returns the input
        # unchanged.
        text = self.word_delimiter_token.join(text.split())
        # Use the public lookup rather than the private per-token hook:
        # it is expected to also resolve tokens registered after loading
        # (e.g. via add_tokens) - TODO confirm against the installed
        # transformers version.
        char_ids = self.convert_tokens_to_ids(list(text))
        return [self.bos_token_id, *char_ids, self.eos_token_id]

    def pad_batch_sentences(self, sentences: List[List[int]], max_length: int=-1) -> Tuple[torch.FloatTensor, torch.IntTensor]:
        """
        Pads all list of token ids, in a batch to the maximum length.

        Args:
            sentences: One list of token ids per sequence. Not modified.
            max_length: If non-negative, every sequence is first truncated to
                at most this many ids. A negative value (the default) or
                ``None`` disables truncation.

        Returns:
            A pair ``(padded, lengths)`` of tensors on ``device``:
            ``padded`` is float32 with one row per sequence, right-padded
            with ``pad_token_id`` to the longest (post-truncation) length;
            ``lengths`` holds the unpadded length of each sequence as int64.
            For an empty batch both tensors are empty.
        """
        # A plain slice with the default -1 would silently cut the final id
        # (the EOS) off every sequence, so only slice for real limits.
        if max_length is not None and max_length >= 0:
            sentences = [list(sentence[:max_length]) for sentence in sentences]
        else:
            sentences = [list(sentence) for sentence in sentences]

        lengths = [len(sentence) for sentence in sentences]
        # default=0 keeps an empty batch from raising inside max().
        max_len = max(lengths, default=0)
        padded = [
            sentence + [self.pad_token_id] * (max_len - len(sentence))
            for sentence in sentences
        ]
        # float32 ids are kept because that is what existing callers receive.
        return (
            torch.tensor(padded, dtype=torch.float32, device=device),
            torch.tensor(lengths, dtype=torch.long, device=device),
        )

    def batch_tokenize(self, texts: List[str], **kwargs) -> Tuple[torch.FloatTensor, torch.IntTensor]:
        """
        Tokenizes and batches together a list of texts

        Each text goes through ``tokenize`` and the resulting id lists are
        padded with ``pad_batch_sentences`` using its default arguments.
        ``kwargs`` are accepted for signature compatibility and ignored.
        """
        tokenized_sentences = [self.tokenize(sentence) for sentence in texts]
        return self.pad_batch_sentences(tokenized_sentences)

# Rebind `tokenizer` to the extended subclass, loaded from the same checkpoint.
tokenizer = Wav2Vec2Tok.from_pretrained("facebook/wav2vec2-base-960h")

"""## Adding Tokens to Vocabulary"""

# Add every code point of the Devanagari Unicode block (U+0900..U+097F, i.e.
# 2304..2431) as a one-character token. The public `add_tokens` API is given a
# list in a single call instead of calling the private `_add_tokens` once per
# bare string.
tokenizer.add_tokens([chr(code_point) for code_point in range(2304, 2432)])

"""## Extending LM Head for additional Tokens"""

import torch.nn as nn

# Parameters of the pretrained output layer; their leading dimension is the
# size of the original vocabulary.
pt_wts = model.lm_head.weight
pt_bias = model.lm_head.bias

# Same input width as the old head, one output per token the tokenizer now
# knows (original vocabulary + the added tokens).
new_lm_head = nn.Linear(pt_wts.shape[1], len(tokenizer))

with torch.no_grad():
    # Rows/entries belonging to the original vocabulary take the pretrained
    # values; the rest keep nn.Linear's fresh initialisation.
    new_lm_head.weight[:pt_wts.shape[0]].copy_(pt_wts)
    new_lm_head.bias[:pt_bias.shape[0]].copy_(pt_bias)

model.lm_head = new_lm_head

# Keep the config in step with the resized head so that anything reading the
# vocabulary size from it (presumably including a later save/reload of the
# model) sees the enlarged value - TODO confirm for the installed version.
model.config.vocab_size = len(tokenizer)

## [Check; Confirmed] Transcription remains same with the layer changed :)

"""## Training Loop"""

# Move the model (including the replaced lm_head) to the training device
# before the optimizer collects its parameters.
model =  model.to(device)

"""### Defining Optimizer"""

# Adam over every model parameter with a single, fixed learning rate.
lr = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Number of dataset rows consumed per training step.
batch_size = 4

"""### Training"""

def find_lengths(logits, pad_id: int) -> torch.Tensor:
    """
    Count, per sequence, the positions whose arg-max prediction is not ``pad_id``.

    Args:
        logits: Score tensor. The arg-max is taken over its last dimension
            and the count runs over the last dimension of that result
            (callers in this file pass the model's ``.logits``).
        pad_id: Token id that is excluded from the count.

    Returns:
        An integer tensor of counts (one per leading index of ``logits``).

    NOTE(review): this is the number of non-pad predictions, which is not
    necessarily the number of valid input frames - confirm that is what the
    caller wants before using it as a sequence length.
    """
    preds = torch.argmax(logits, dim=-1)
    # Summing the boolean mask counts the True entries directly.
    return (preds != pad_id).sum(dim=-1)

ctc_loss = nn.CTCLoss()

# Training is capped at 9 steps: the original run went out of memory beyond
# that. The "- 1" (which leaves out the last full batch) is kept from the
# original loop bound.
max_steps = 9
num_steps = min(len(ds) // batch_size - 1, max_steps)

for i in range(num_steps):
    optimizer.zero_grad()

    # Slice the rows first so only this batch is read, instead of pulling the
    # whole "speech"/"text" columns on every step.
    batch = ds[i * batch_size:(i + 1) * batch_size]

    input_values = tokenizer(batch["speech"], return_tensors="pt", padding='longest').input_values.to(device)

    logits = model(input_values).logits

    # nn.CTCLoss expects log-probabilities laid out as (frames, batch,
    # classes); raw logits are not normalised, so apply log_softmax first.
    log_probs = nn.functional.log_softmax(logits, dim=-1).transpose(0, 1)

    labels, label_lengths = tokenizer.batch_tokenize(batch['text'])

    # CTC's input lengths are the number of frames available for alignment,
    # not the number of non-blank predictions. No attention mask is used
    # here, so every (zero-padded) sequence spans all output frames.
    input_lengths = torch.full(
        (logits.shape[0],), logits.shape[1], dtype=torch.long, device=logits.device
    )

    loss = ctc_loss(log_probs, labels, input_lengths, label_lengths)

    print("Training loss : ", loss)

    loss.backward()
    optimizer.step()


"""## Checking our Trained Model"""

# The sanity check runs on the CPU, so the inputs below stay on the CPU too.
model = model.cpu()

# Batch of the first two utterances, zero-padded to the longer one. Slicing
# the rows first avoids loading the entire "speech" column.
input_values = tokenizer(ds[:2]["speech"], return_tensors="pt", padding="longest").input_values

# retrieve logits; no gradients are needed for inference, so skip building
# the autograd graph
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)
print(transcription)