I am trying to recreate a CNN model from Keras in PyTorch, and I am hitting a runtime error on my loss.backward() call. I have checked the model summary, and it ran fine with the expected channels and tensor shapes, but due to my lack of knowledge I cannot locate the bug. Can anyone give me any advice?
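From what I understand, this error means the loss tensor is not connected to anything that requires grad. A quick check along these lines (just a sketch; `model` stands for my instantiated network) should at least confirm that the parameters themselves are trainable:

# sketch: confirm every parameter is trainable
# (`model` is assumed to be the instantiated network)
for name, param in model.named_parameters():
    if not param.requires_grad:
        print(name, "does not require grad")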
The model:
import torch
from torch import nn
from typing import List

class DNA_CNN_test2(nn.Module):  # deepCRE model
    def __init__(self,
                 seq_len: int = 1000,
                 #num_filters: List[int] = [64, 128, 64],
                 kernel_size: int = 8,
                 p: float = 0.25):  # dropout probability
        super().__init__()
        self.seq_len = seq_len
        window_size = int(seq_len * (8 / 3000))

        # CNN module
        # Sequential container: its forward() accepts any input and
        # forwards it to the first module it contains
        self.conv_net = nn.Sequential()
        #num_filters = [4] + num_filters
        self.model = nn.Sequential(
            # conv block 1
            nn.Conv1d(4, 64, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 64, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 2
            nn.Conv1d(64, 128, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(128, 128, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 3
            nn.Conv1d(128, 64, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 64, kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            nn.Flatten(),
            nn.Linear(64 * (seq_len // window_size**3), 1))
            #nn.ReLU(inplace=True),
            #nn.Dropout(p),
            #nn.Linear(128, 64),
            #nn.ReLU(inplace=True),
            #nn.Linear(64*seq_len, 1))

    def forward(self, xb: torch.Tensor):
        """Forward pass."""
        # permute the view to (batch_size, 4 channels, seq_len),
        # i.e. (batch size, one-hot-encoded DNA channels, sequence length)
        xb = xb.permute(0, 2, 1).mean(dim=[1, 2], keepdim=True).squeeze(dim=-1)
        out = self.conv_net(xb)
        return out
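To see whether the forward pass is attached to the autograd graph at all, I can push one dummy batch through the model. A minimal sketch (the batch size and the (batch, seq_len, 4) input shape are my assumptions):

# sketch: run a dummy batch through the model and inspect the output
model = DNA_CNN_test2(seq_len=1000)
dummy = torch.rand(16, 1000, 4)  # hypothetical one-hot-encoded batch
out = model(dummy)
print(out.shape)    # prediction shape
print(out.grad_fn)  # None here would mean the output is detached from autograd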
loss_batch, train, and validation steps:
# +--------------------------------+
# | Training and fitting functions |
# +--------------------------------+
import numpy as np

def loss_batch(model, loss_func, xb, yb, opt=None, verbose=False):
    '''
    Apply the loss function to a batch of inputs. If no optimizer
    is provided, skip the backprop step.
    '''
    if verbose:
        print('loss batch ****')
        print("xb shape:", xb.shape)
        print("yb shape:", yb.shape)
        print("yb shape:", yb.squeeze(1).shape)
        #print("yb", yb)

    # get the batch output from the model given your input batch
    # ** This is the model's prediction for the y labels! **
    xb_out = model(xb.float())

    if verbose:
        print("model out pre loss", xb_out.shape)
        #print('xb_out', xb_out)
        print("xb_out:", xb_out.shape)
        print("yb:", yb.shape)
        print("yb.long:", yb.long().shape)

    loss = loss_func(xb_out, yb.float())  # for MSE/regression
    # __FOOTNOTE 2__

    if opt is not None:  # if an optimizer is provided, backprop and update
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)
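To isolate loss_batch from the DataLoaders, I can also call it once on random tensors (a minimal sketch; the shapes are assumptions that mirror my data, and it should hit the same backward() call):

# sketch: exercise loss_batch once, outside the training loop
model = DNA_CNN_test2(seq_len=1000)
opt = torch.optim.SGD(model.parameters(), lr=1e-2)
xb = torch.rand(16, 1000, 4)  # hypothetical one-hot batch
yb = torch.rand(16, 1)        # hypothetical regression targets
loss, n = loss_batch(model, torch.nn.MSELoss(), xb, yb, opt=opt)
print(loss, n)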
def train_step(model, train_dl, loss_func, device, opt):
    '''
    Execute 1 set of batched training within an epoch
    '''
    # Set model to training mode
    model.train()
    tl = []  # train losses
    ns = []  # batch sizes, n

    # loop through train DataLoader
    for xb, yb in train_dl:
        # put on GPU
        xb, yb = xb.to(device), yb.to(device)

        # provide opt so backprop happens
        t, n = loss_batch(model, loss_func, xb, yb, opt=opt)

        # collect train loss and batch sizes
        tl.append(t)
        ns.append(n)

    # average the losses over all batches
    train_loss = np.sum(np.multiply(tl, ns)) / np.sum(ns)
    return train_loss
def val_step(model, val_dl, loss_func, device):
    '''
    Execute 1 set of batched validation within an epoch
    '''
    # Set model to evaluation mode
    model.eval()
    with torch.no_grad():
        vl = []  # val losses
        ns = []  # batch sizes, n

        # loop through validation DataLoader
        for xb, yb in val_dl:
            # put on GPU
            xb, yb = xb.to(device), yb.to(device)

            # do NOT provide opt here, so backprop does not happen
            v, n = loss_batch(model, loss_func, xb, yb)

            # collect val loss and batch sizes
            vl.append(v)
            ns.append(n)

    # average the losses over all batches
    val_loss = np.sum(np.multiply(vl, ns)) / np.sum(ns)
    return val_loss
def fit(epochs, model, loss_func, opt, train_dl, val_dl, device, patience=1000):
    '''
    Fit the model params to the training data, eval on unseen data.
    Loop for a number of epochs and keep track of train and val losses
    along the way
    '''
    # keep track of losses
    train_losses = []
    val_losses = []

    # loop through epochs
    for epoch in range(epochs):
        # take a training step
        train_loss = train_step(model, train_dl, loss_func, device, opt)
        train_losses.append(train_loss)

        # take a validation step
        val_loss = val_step(model, val_dl, loss_func, device)
        val_losses.append(val_loss)

        print(f"E{epoch} | train loss: {train_loss:.3f} | val loss: {val_loss:.3f}")

    return train_losses, val_losses
def run_model(train_dl, val_dl, model, device,
              lr=1e-2, epochs=50,
              lossf=None, opt=None):
    '''
    Given train and val DataLoaders and a NN model, fit the model to the
    training data. By default, use MSE loss and an SGD optimizer
    '''
    # define optimizer
    if opt:
        optimizer = opt
    else:  # if no opt provided, just use SGD
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # define loss function
    if lossf:
        loss_func = lossf
    else:  # if no loss function provided, just use MSE
        loss_func = torch.nn.MSELoss()

    # run the training loop
    train_losses, val_losses = fit(
        epochs,
        model,
        loss_func,
        optimizer,
        train_dl,
        val_dl,
        device)

    return train_losses, val_losses
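For reference, this is how everything is invoked (cleaned up from my notebook cell; seq_len, train_dl, val_dl, and device are defined in earlier cells):

model = DNA_CNN_test2(seq_len)
model.to(device)

train_losses, val_losses = run_model(
    train_dl, val_dl, model, device, epochs=100, lr=1e-2)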
Error:
RuntimeError Traceback (most recent call last)
Cell In[51], line 5
2 DNA_CNN_test2 = DNA_CNN_test2(seq_len)
3 DNA_CNN_test2.to(device)
----> 5 DNA_CNN_test2_train_losses_lr4, DNA_CNN_test2_val_losses_lr4 = run_model(
6 train_dl,
7 val_dl,
8 DNA_CNN_test2,
9 device,
10 epochs=100,
11 lr= 1e-2
12 )
Cell In[42], line 139, in run_model(train_dl, val_dl, model, device, lr, epochs, lossf, opt)
136 loss_func = torch.nn.MSELoss()
138 # run the training loop
--> 139 train_losses, val_losses = fit(
140 epochs,
141 model,
142 loss_func,
143 optimizer,
144 train_dl,
145 val_dl,
146 device)
148 return train_losses, val_losses
Cell In[42], line 106, in fit(epochs, model, loss_func, opt, train_dl, val_dl, device, patience)
103 # loop through epochs
104 for epoch in range(epochs):
105 # take a training step
--> 106 train_loss = train_step(model,train_dl,loss_func,device,opt)
107 train_losses.append(train_loss)
109 # take a validation step
Cell In[42], line 54, in train_step(model, train_dl, loss_func, device, opt)
51 xb, yb = xb.to(device),yb.to(device)
53 # provide opt so backprop happens
---> 54 t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
56 # collect train loss and batch sizes
57 tl.append(t)
Cell In[42], line 32, in loss_batch(model, loss_func, xb, yb, opt, verbose)
29 # __FOOTNOTE 2__
31 if opt is not None: # if opt
---> 32 loss.backward()
33 opt.step()
34 opt.zero_grad()
File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
477 if has_torch_function_unary(self):
478 return handle_torch_function(
479 Tensor.backward,
480 (self,),
(...)
485 inputs=inputs,
486 )
--> 487 torch.autograd.backward(
488 self, gradient, retain_graph, create_graph, inputs=inputs
489 )
File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
195 retain_graph = create_graph
197 # The reason we repeat same the comment below is that
198 # some Python versions print out the first line of a multi-line function
199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
201 tensors, grad_tensors_, retain_graph, create_graph, inputs,
202 allow_unreachable=True, accumulate_grad=True)
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn