My model produces two outputs, each of shape [1,1024], and I want to compute a cosine similarity loss between each output vector and its target vector in order to update the weights.
Even with gradient value clipping, I get NaN values after the backward pass.
Please help.
# Training script: optimises ReverseDictionaryUnifiedPipeline so that each of
# its two outputs aligns with the corresponding ground-truth vector.
# Per-sample loss is (1 - cosine similarity) for each output; the two terms are
# added and summed over the batch.
epochs = 100
batch_size = 128
clip = 1

model = ReverseDictionaryUnifiedPipeline(d_model).cuda()
# Xavier-initialise every parameter with more than one dimension; 1-D
# parameters keep whatever initialisation the model gave them.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optim = torch.optim.Adam(
    model.parameters(), lr=0.00001, betas=(0.9, 0.98), eps=1e-9
)
loss_func = nn.CosineSimilarity(dim=1, eps=1e-6)

for i in range(0, epochs):
    # Running sum of the batch losses, so the epoch line reports the whole
    # epoch rather than only the last batch.
    epoch_loss = 0.0
    # Step through X by start offset. The previous `len(X)//batch_size + 1`
    # count produced an extra, empty batch whenever len(X) was an exact
    # multiple of batch_size.
    for ind, start in enumerate(range(0, len(X), batch_size)):
        stop = start + batch_size
        X_train = X[start:stop]
        # NOTE(review): from_numpy keeps the NumPy dtype of Y_grnd - confirm it
        # matches the dtype of the model outputs.
        Y_train = torch.from_numpy(Y_grnd[start:stop]).cuda(0)
        optim.zero_grad()
        output = model(X_train)
        loss1 = 1 - loss_func(output[0][:, 0, :], Y_train[:, 0, :])
        loss2 = 1 - loss_func(output[1][:, 0, :], Y_train[:, 1, :])
        print((loss1 + loss2))
        loss = torch.sum(loss1 + loss2, dim=0)
        print(loss)
        loss.backward()
        # Value clipping clamps each gradient element into [-clip, clip], but a
        # NaN element stays NaN, and stepping on it would overwrite the weights
        # with NaN. Skip the update for such a batch instead of corrupting the
        # model; the source of the NaN still has to be tracked down separately.
        has_nan_grad = any(
            torch.isnan(p.grad).any()
            for p in model.parameters()
            if p.grad is not None
        )
        if has_nan_grad:
            print("Batch "+str(ind)+" skipped: NaN gradient")
        else:
            nn.utils.clip_grad_value_(model.parameters(), clip)
            optim.step()
        torch.cuda.empty_cache()
        epoch_loss += loss.item()
        print("Batch "+str(ind)+" loss:", loss)
    print("Epoch "+str(i)+" loss:", epoch_loss)