Hi, I’m training this model: GitHub - microsoft/CLAP: Learning audio concepts from natural language supervision (a CLIP-like model that trains an audio encoder and a text encoder jointly using a contrastive loss). However, the backward step somehow does not update either encoder’s parameters.
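For reference, the symmetric contrastive loss the code below is meant to implement is

$$\mathcal{L} = -\frac{1}{2N}\sum_{i=1}^{N}\left[\log\frac{\exp(\operatorname{sim}(a_i, t_i)/\tau)}{\sum_{j=1}^{N}\exp(\operatorname{sim}(a_i, t_j)/\tau)} + \log\frac{\exp(\operatorname{sim}(t_i, a_i)/\tau)}{\sum_{j=1}^{N}\exp(\operatorname{sim}(t_i, a_j)/\tau)}\right]$$

where sim is cosine similarity, τ is the temperature, and N is the batch size.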
Here’s the code for the loss function:
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F


def contrastive_loss(E_a: Tensor, E_t: Tensor, temperature: float = 0.5, device="cpu") -> Tensor:
    sum_term = 0
    batch_size = len(E_a)
    N = range(batch_size)
    for i in N:
        # Positive pair: the matching audio/text embeddings
        pos = torch.exp(F.cosine_similarity(E_a[i], E_t[i], dim=-1) / temperature)
        a_t_neg = 0
        t_a_neg = 0
        for j in N:
            # Denominators: audio-to-text and text-to-audio similarities over the whole batch
            a_t_neg = a_t_neg + torch.exp(F.cosine_similarity(E_a[i], E_t[j], dim=-1) / temperature)
            t_a_neg = t_a_neg + torch.exp(F.cosine_similarity(E_t[i], E_a[j], dim=-1) / temperature)
        a_t = torch.log(pos / a_t_neg)
        t_a = torch.log(pos / t_a_neg)
        sum_term = sum_term - (a_t + t_a)
    loss = 1 / (2 * batch_size) * sum_term
    # Tensor.to() is not in-place, so the result has to be reassigned
    loss = loss.to(device)
    return loss


class ContrastiveLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input: Tensor, target: Tensor, temperature: float = 0.5) -> Tensor:
        # A default temperature is needed here: the training loop below calls
        # this with only two arguments, which would otherwise raise a TypeError
        return contrastive_loss(input, target, temperature)
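For what it’s worth, here is a vectorized sketch of the same loss that I find easier to reason about (my rewrite, not from the CLAP repo); it should match the loop version up to floating-point error:

def contrastive_loss_vectorized(E_a: Tensor, E_t: Tensor, temperature: float = 0.5) -> Tensor:
    # L2-normalize so the matrix product yields pairwise cosine similarities
    a = F.normalize(E_a, dim=-1)
    t = F.normalize(E_t, dim=-1)
    logits = a @ t.T / temperature  # (N, N) similarity matrix
    labels = torch.arange(len(E_a), device=logits.device)
    # Row-wise cross-entropy is the audio-to-text direction,
    # column-wise is text-to-audio
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels))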
Training loop:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from msclap.models.clap import AudioEncoder, TextEncoder, Projection, CLAP

audio_encoder = AudioEncoder(
    audioenc_name="HTSAT",
    d_in=768,
    d_out=1024,
    sample_rate=16000,
    window_size=1024,
    hop_size=320,
    mel_bins=64,
    fmin=50,
    fmax=8000,
    classes_num=527
)
audio_encoder.requires_grad_(True)

text_encoder = TextEncoder(
    text_model="gpt2",
    d_out=1024,
    transformer_embed_dim=768
)
text_encoder.requires_grad_(True)

audio_optimizer = torch.optim.Adam(audio_encoder.parameters(), lr=0.001)
text_optimizer = torch.optim.Adam(text_encoder.parameters(), lr=0.001)
loss_function = ContrastiveLoss()
# loss_function = nn.CrossEntropyLoss()

use_device = "cpu"
epochs = 1
batch_size = 5

audio_encoder.to(device=use_device)
text_encoder.to(device=use_device)

epoch_avg_losses = []
text_encoder.train()
audio_encoder.train()

data_loader = DataLoader(dataset, batch_size=batch_size)  # `dataset` is built elsewhere

for epoch in range(epochs):  # was `range(epoches)`, a NameError
    current_losses = []
    progress = tqdm(data_loader, desc=f"Epoch: {epoch}")
    for audio_tensor, text_dict_raw in progress:
        text_input = {
            "input_ids": text_dict_raw["input_ids"].reshape(batch_size, -1),
            "attention_mask": text_dict_raw["attention_mask"].reshape(batch_size, -1)}
        audio_optimizer.zero_grad()
        text_optimizer.zero_grad()
        audio_embedded, _ = audio_encoder(audio_tensor.reshape(batch_size, -1))
        text_embedded = text_encoder(text_input)
        loss_val = loss_function(audio_embedded, text_embedded)
        current_losses.append(loss_val.item())
        loss_val.backward()  # retain_graph=True was unnecessary here
        audio_optimizer.step()
        text_optimizer.step()
        progress.set_postfix({"loss_val": loss_val.item()})
    epoch_avg_losses.append(sum(current_losses) / len(current_losses))
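To check whether the weights actually move, I compare a parameter snapshot across one extra forward/backward/step (a diagnostic sketch I added after the loop; it reuses `audio_tensor` and `text_input` from the last batch):

# Snapshot one audio-encoder parameter before stepping
first_name, first_param = next(iter(audio_encoder.named_parameters()))
before = first_param.detach().clone()

audio_optimizer.zero_grad()
text_optimizer.zero_grad()
audio_embedded, _ = audio_encoder(audio_tensor.reshape(batch_size, -1))
text_embedded = text_encoder(text_input)
loss_function(audio_embedded, text_embedded).backward()

# If every gradient is None/zero, the graph is broken upstream of the loss;
# if gradients are non-zero but the weights never change, suspect the optimizer.
grad_sum = sum(p.grad.abs().sum().item()
               for p in audio_encoder.parameters() if p.grad is not None)
print(f"total |grad| over audio encoder: {grad_sum}")

audio_optimizer.step()
print(f"{first_name} changed:", not torch.equal(before, first_param.detach()))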
I suspected that I did something wrong in my loss function, so I also tested with the default CrossEntropyLoss, but the two encoders’ parameters were not updated either.
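To see whether something inside the encoders is frozen internally, I also listed the parameters with requires_grad switched off (a quick check I added, not part of the training script):

for model_name, model in [("audio_encoder", audio_encoder), ("text_encoder", text_encoder)]:
    frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
    print(f"{model_name}: {len(frozen)} frozen parameters", frozen[:5])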
Is it possible that the problem lies in the models’ code itself? I’d really appreciate any pointers!