jbdiff-v1.yaml

# Model configuration: a `vqvae` section plus a `diffusion` section.
model:
      # VQ-VAE-side settings.
      # NOTE(review): the exact meaning of each key is defined by the code
      # that loads this file - confirm there before changing values.
      vqvae:
            batch_size: 32
            aug_shift: true
            # base_tokens and context_mult are the two factors named in the
            # `embedding_max_length: context_mult*base_tokens` placeholder
            # under each diffusion level - presumably multiplied together by
            # the loader; confirm.
            base_tokens: 768
            context_mult: 2
      # One entry per VQ-VAE level. The level keys are unquoted, so they load
      # as integers (2, 1, 0), not strings.
      diffusion:
            2:  # Level 2 is the bottom level of the VQ-VAE
                  # NOTE(review): the `*_t` values are dotted paths into the
                  # audio_diffusion_pytorch package; presumably the loader
                  # resolves them to classes and passes the remaining keys on
                  # as keyword arguments - confirm against the loading code.
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  # The five per-layer lists below each have 9 entries; keep
                  # their lengths in sync when editing.
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length is not set in this file (YAML cannot
                  # evaluate expressions). Original placeholder:
                  #   embedding_max_length: context_mult*base_tokens
                  # i.e. the product of the two model.vqvae values -
                  # presumably computed by the loader; confirm.
                  resnet_dilation_factor: 3
                  # Only level 2 uses a non-zero dropout rate; levels 1 and 0
                  # set 0.00.
                  resnet_dropout_rate: 0.05
            1:  # Level 1 is the middle level of the VQ-VAE
                  # NOTE(review): the `*_t` values are dotted paths into the
                  # audio_diffusion_pytorch package; presumably the loader
                  # resolves them to classes and passes the remaining keys on
                  # as keyword arguments - confirm against the loading code.
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  # The five per-layer lists below each have 9 entries; keep
                  # their lengths in sync when editing.
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length is not set in this file (YAML cannot
                  # evaluate expressions). Original placeholder:
                  #   embedding_max_length: context_mult*base_tokens
                  # i.e. the product of the two model.vqvae values -
                  # presumably computed by the loader; confirm.
                  resnet_dilation_factor: 3
                  # Zero dropout rate at this level (level 2 uses 0.05).
                  resnet_dropout_rate: 0.00
            0:  # Level 0 is the top level of the VQ-VAE
                  # NOTE(review): the `*_t` values are dotted paths into the
                  # audio_diffusion_pytorch package; presumably the loader
                  # resolves them to classes and passes the remaining keys on
                  # as keyword arguments - confirm against the loading code.
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  # The five per-layer lists below each have 9 entries; keep
                  # their lengths in sync when editing.
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length is not set in this file (YAML cannot
                  # evaluate expressions). Original placeholder:
                  #   embedding_max_length: context_mult*base_tokens
                  # i.e. the product of the two model.vqvae values -
                  # presumably computed by the loader; confirm.
                  resnet_dilation_factor: 3
                  # Zero dropout rate at this level (level 2 uses 0.05).
                  resnet_dropout_rate: 0.00