Hello
I am training a contrastive learning model on Google Colab. All the code is from this notebook.
With PyTorch 1.9.0+cu102 and Torchvision 0.10.0+cu102 I get the error below (RuntimeError: expected scalar type Half but found Float), but with PyTorch 1.7.1 and Torchvision 0.8.2 everything runs fine. Thanks for the help.
#################################################################################
RuntimeError Traceback (most recent call last)
in <module>()
376 print(vars(args))
377
→ 378 main(args)
379
380
3 frames
in main(args)
356 # training loop
357 for epoch in range(epoch_start, args.epochs + 1):
→ 358 train_loss = train(model, train_loader, optimizer, epoch, args)
359 results['train_loss'].append(train_loss)
360 #test_acc_1 = test(model.encoder_q, memory_loader, test_loader, epoch, args)
in train(net, data_loader, train_optimizer, epoch, args)
170 if args.grad_accum==True:
171 if ((i + 1) % args.accum_iter == 0) or (i + 1 == len(data_loader)):
→ 172 scaler.scale(loss).backward()
173 scaler.step(train_optimizer)
174 scaler.update()
/usr/local/lib/python3.7/dist-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
253 create_graph=create_graph,
254 inputs=inputs)
→ 255 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
256
257 def register_hook(self, hook):
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
147 Variable.execution_engine.run_backward(
148 tensors, grad_tensors, retain_graph, create_graph, inputs,
→ 149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
150
151
RuntimeError: expected scalar type Half but found Float
#################################################################################
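In case it helps narrow things down: the traceback only shows where backward is launched, so I can re-run a single step under autograd anomaly detection to have the error annotated with the forward op that created the failing node. A minimal sketch, reusing the names (net, clip_1, clip_2, scaler) from the training loop below:

import torch

# Wrap one training step in anomaly detection so the RuntimeError is
# reported together with the forward operation that produced the
# failing gradient node.
with torch.autograd.set_detect_anomaly(True):
    with torch.cuda.amp.autocast(enabled=True):
        loss = net(clip_1, clip_2)
    scaler.scale(loss).backward()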
The loss and the training loop are as follows:
def contrastive_loss(self, im_q, im_k):
    # compute query features
    q = self.encoder_q(im_q)  # queries: NxC
    # print('q.dtype', q.dtype)
    q = nn.functional.normalize(q, dim=1)  # already normalized
    # print('q.dtype', q.dtype)

    # compute key features
    with torch.no_grad():  # no gradient to keys
        # shuffle for making use of BN
        im_k_, idx_unshuffle = self._batch_shuffle_single_gpu(im_k)

        k = self.encoder_k(im_k_)  # keys: NxC
        # print('k.dtype', k.dtype)
        k = nn.functional.normalize(k, dim=1)  # already normalized
        # print('k.dtype', k.dtype)

        # undo shuffle
        k = self._batch_unshuffle_single_gpu(k, idx_unshuffle)
        # print('k.dtype', k.dtype)

    # compute logits
    # Einstein sum is more intuitive
    # positive logits: Nx1
    l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
    # print('l_pos.dtype', l_pos.dtype)
    # negative logits: NxK
    l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()])
    # print('l_neg.dtype', l_neg.dtype)

    # logits: Nx(1+K)
    logits = torch.cat([l_pos, l_neg], dim=1)
    # print('logits.dtype', logits.dtype)

    # apply temperature
    logits /= self.T
    # print('logits.dtype', logits.dtype)

    # labels: positive key indicators
    labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()  # .to(self.device)
    # print('labels.dtype', labels.dtype)
    ###
    # labels = labels.type(torch.LongTensor).cuda()
    ###

    loss = nn.CrossEntropyLoss().cuda()(logits, labels)  # .to(self.device)(logits, labels)
    # print('loss.dtype', loss.dtype)

    return loss, q, k
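The commented-out prints above were my attempt to track where Half and Float get mixed: under autocast the encoder outputs are produced in float16, while a registered buffer such as self.queue keeps the dtype it was created with (float32 in the original MoCo demo). A small sketch of the kind of check I mean, with an explicit cast of the detached queue to q's dtype purely as an illustration, not as a confirmed fix:

# Inside contrastive_loss, under torch.cuda.amp.autocast:
print('q:', q.dtype, 'k:', k.dtype, 'queue:', self.queue.dtype)

# Illustration only: casting the detached queue to q's dtype ensures the
# negative-logits einsum sees a single dtype rather than a Half/Float mix.
l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach().to(q.dtype)])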
def forward(self, im1, im2):
    """
    Input:
        im_q: a batch of query images
        im_k: a batch of key images
    Output:
        loss
    """
    # update the key encoder
    with torch.no_grad():  # no gradient to keys
        self._momentum_update_key_encoder()

    # compute loss
    if self.symmetric:  # symmetric loss
        loss_12, q1, k2 = self.contrastive_loss(im1, im2)
        loss_21, q2, k1 = self.contrastive_loss(im2, im1)
        loss = loss_12 + loss_21
        print('loss.dtype', loss.dtype)
        k = torch.cat([k1, k2], dim=0)
    else:  # asymmetric loss
        loss, q, k = self.contrastive_loss(im1, im2)

    self._dequeue_and_enqueue(k)

    return loss
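To check whether the failure is really tied to mixed precision rather than to the PyTorch 1.9 upgrade itself, one step can be run in plain float32. A minimal sketch, with the same net, clip_1, clip_2 as in the training loop below:

# Sanity check: one full-precision step.  If this works on PyTorch 1.9
# while the autocast path fails, the problem is in how the float16
# forward interacts with float32 tensors (e.g. buffers).
with torch.cuda.amp.autocast(enabled=False):
    loss = net(clip_1, clip_2)
loss.backward()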
# train for one epoch
def train(net, data_loader, train_optimizer, epoch, args):
    i = 0
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
    net.train()
    adjust_learning_rate(train_optimizer, epoch, args)
    total_loss, total_num, train_bar = 0.0, 0, tqdm(data_loader)
    for clip_1, clip_2 in train_bar:
        clip_1 = clip_1.cuda(non_blocking=True)  # .to(args.device)
        clip_2 = clip_2.cuda(non_blocking=True)  # .to(args.device)
        # print('clip_1.shape', clip_1.shape)
        # print('clip_2.shape', clip_2.shape)

        with torch.cuda.amp.autocast(enabled=True):
            loss = net(clip_1, clip_2)
            # print('loss.dtype', loss.dtype)
            if args.grad_accum == True and args.batch_size > 1:
                loss = loss / args.accum_iter
                # print('loss', loss)
                # print(loss.shape)

        if args.grad_accum == False:
            train_optimizer.zero_grad()
            # loss.backward()
            # train_optimizer.step()
            scaler.scale(loss).backward()
            scaler.step(train_optimizer)
            scaler.update()
        else:
            if args.grad_accum == True:
                if ((i + 1) % args.accum_iter == 0) or (i + 1 == len(data_loader)):
                    scaler.scale(loss).backward()
                    scaler.step(train_optimizer)
                    scaler.update()
                    # zero the parameter gradients
                    train_optimizer.zero_grad()

        total_num += data_loader.batch_size
        total_loss += loss.item() * data_loader.batch_size
        train_bar.set_description('Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format(
            epoch, args.epochs, train_optimizer.param_groups[0]['lr'], total_loss / total_num))

        i += 1
        if i % 30 == 0:
            torch.save({'epoch': epoch, 'state_dict': net.state_dict(),
                        'optimizer': train_optimizer.state_dict()},
                       args.results_dir + '/model_last.pth')

    return total_loss / total_num
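For comparison, the gradient-accumulation example in the torch.cuda.amp docs scales the loss and calls backward on every iteration, and only steps, updates and zeroes the gradients on the accumulation boundary; in my loop above, backward is skipped entirely on non-boundary iterations. A minimal sketch of that documented pattern, reusing the argument names from above:

scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
for i, (clip_1, clip_2) in enumerate(data_loader):
    clip_1 = clip_1.cuda(non_blocking=True)
    clip_2 = clip_2.cuda(non_blocking=True)

    with torch.cuda.amp.autocast(enabled=args.use_amp):
        loss = net(clip_1, clip_2)
        loss = loss / args.accum_iter  # normalize for accumulation

    # accumulate scaled gradients on every iteration
    scaler.scale(loss).backward()

    # only step/update/zero on the accumulation boundary
    if (i + 1) % args.accum_iter == 0 or (i + 1) == len(data_loader):
        scaler.step(train_optimizer)
        scaler.update()
        train_optimizer.zero_grad()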