Hello everyone, I am trying to train a model using a pretty basic training function. However, when calling the backward method to backpropagate the loss value, I get the following error message:
Epoch number 1
0%| | 0/260 [00:12<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[25], line 8
4 scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
7 torch.cuda.empty_cache()
----> 8 history = train_first_stage(model, train_loader_first_step, val_loader_first_step, n_epochs, optimizer, loss, scheduler)
Cell In[23], line 31, in train_first_stage(model, train_loader, val_loader, epochs, optimizer, loss_fn, scheduler)
27 optimizer.zero_grad()
29 loss = criterion(output, label)
---> 31 loss.backward()
32 optimizer.step()
35 lrs.append(get_lr(optimizer))
File /opt/anaconda3/envs/SPOD_env/lib/python3.12/site-packages/torch/_tensor.py:522, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
512 if has_torch_function_unary(self):
513 return handle_torch_function(
514 Tensor.backward,
515 (self,),
(...)
520 inputs=inputs,
521 )
--> 522 torch.autograd.backward(
523 self, gradient, retain_graph, create_graph, inputs=inputs
524 )
File /opt/anaconda3/envs/SPOD_env/lib/python3.12/site-packages/torch/autograd/__init__.py:266, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
261 retain_graph = create_graph
263 # The reason we repeat the same comment below is that
264 # some Python versions print out the first line of a multi-line function
265 # calls in the traceback and some print out the last line
--> 266 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
267 tensors,
268 grad_tensors_,
269 retain_graph,
270 create_graph,
271 inputs,
272 allow_unreachable=True,
273 accumulate_grad=True,
274 )
RuntimeError: self must be a matrix
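From the forum threads I found about this message, my (possibly wrong) understanding is that it is raised by matrix-multiplication ops such as torch.mm / torch.addmm when the first argument is not a 2-D tensor. A toy snippet like this one, with made-up tensors that have nothing to do with my model, should reproduce it as far as I understand:

import torch

a = torch.randn(4)       # 1-D tensor, while torch.mm expects a 2-D matrix
b = torch.randn(4, 3)
c = torch.mm(a, b)       # should fail with "RuntimeError: self must be a matrix"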
However, none of those threads seems to match my case: my training loop does not call torch.mm directly, and the error only shows up inside loss.backward(). Here is the training function I tried to run, along with the code that calls it:
import gc

import torch
from torch import nn
from tqdm import tqdm

# `model` and `device` are defined earlier in the notebook.

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']
def train_first_stage(model, train_loader, val_loader, epochs, optimizer, loss_fn, scheduler):
    torch.cuda.empty_cache()
    train_loss = []
    val_loss = []
    lrs = []
    not_improve = 0
    min_loss = float('inf')  # was the string 'inf', which breaks the comparison below

    for epoch in range(epochs):
        print("Epoch number", epoch + 1)
        for phase in ['train', 'val']:
            running_loss = 0.0
            criterion = loss_fn
            if phase == 'train':
                model.train()
                for data, label in tqdm(train_loader):
                    data, label = data.to(device), label.to(device)
                    output = model(data)
                    optimizer.zero_grad()
                    print(output.shape, label.shape)
                    loss = criterion(output, label)
                    loss.backward()  # <-- this is where the RuntimeError is raised
                    optimizer.step()
                    lrs.append(get_lr(optimizer))
                    scheduler.step(loss)
                    running_loss += loss.item()
                    del data, label, output
                    gc.collect()
                    torch.cuda.empty_cache()
                num_samples = float(len(train_loader.dataset))
                tr_loss_ = running_loss / num_samples  # running_loss is already a float
                train_loss.append(tr_loss_)
            else:
                model.eval()
                with torch.no_grad():
                    for data, label in tqdm(val_loader):
                        # move the label as well, otherwise the loss mixes CPU/GPU tensors
                        data, label = data.to(device), label.to(device)
                        output = model(data)
                        loss = criterion(output, label)
                        running_loss += loss.item()
                        del data, label, output
                        gc.collect()
                        torch.cuda.empty_cache()
                num_samples = float(len(val_loader.dataset))
                val_loss_ = running_loss / num_samples
                val_loss.append(val_loss_)

                if min_loss > (running_loss / len(val_loader)):
                    min_loss = running_loss / len(val_loader)
                if (running_loss / len(val_loader)) > min_loss:
                    not_improve += 1
                    min_loss = running_loss / len(val_loader)
                    print(f'Loss Not Decrease for {not_improve} time')
                    if not_improve == 5:
                        print('Loss not decrease for 5 times, Stop Training')
                        break

    history = {'train_loss': train_loss, 'val_loss': val_loss, 'learning_rate': lrs}
    return history
n_epochs = 10
loss = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
history = train_first_stage(model, train_loader_first_step, val_loader_first_step, n_epochs, optimizer, loss, scheduler)
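In case it is useful, this is the kind of single-batch check I am planning to run next to narrow things down (just a sketch reusing the names above, I have not isolated the problem with it yet):

# Pull one batch out of the loader and run the same steps as in the loop,
# so the shape printout and the backward call can be inspected in isolation.
data, label = next(iter(train_loader_first_step))
data, label = data.to(device), label.to(device)

output = model(data)
print(output.shape, label.shape)   # shapes that reach nn.L1Loss

single_loss = loss(output, label)
single_loss.backward()             # the call that fails inside the training loop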
Has anyone run into the same error and can help me figure it out? My PyTorch version is 2.2.1, and I use CUDA 12.1.