Hello.
I have a multilayer perceptron made of several linear layers and an activation function to be specified when creating the model. Here’s how I implemented this (this is more or less the reimplementation of this in pytorch):
class Linear(nn.Module):
def __init__(self, in_features, out_features, bias = True, omega_0 = 1, activation_function = None, deriv_activation_function = None):
super().__init__()
self.linear = nn.Linear(in_features, out_features, bias=bias)
self.activation_function = activation_function if activation_function else lambda x : x
self.deriv_activation_function = deriv_activation_function if deriv_activation_function else lambda x : x
self.omega_0 = omega_0
def forward(self, x):
z = self.omega_0 * self.linear(x)
y = self.activation_function(z)
return y, z
class MLP(nn.Module):
def __init__(self, in_features, hidden_features, hidden_layers, out_features, activation_function = None, deriv_activation_function = None):
super().__init__()
self.net = []
# input layer
self.net.append(Linear(in_features, hidden_features, activation_function, deriv_activation_function = deriv_activation_function))
# hidden layer(s)
for i in range(hidden_layers):
self.net.append(Linear(hidden_features, hidden_features, activation_function, deriv_activation_function = deriv_activation_function))
# output layer
self.net.append(Linear(hidden_features, out_features,))
self.net = nn.Sequential(*self.net)
self.zs = []
def forward(self, x):
x_grad = x.clone().detach().requires_grad_(True) # allows to take derivative w.r.t. input
for linear_layer in self.net :
x, z = linear_layer(x)
self.zs.append(z)
y = x
return y, x_grad
def backprop(self, y):
zbar = torch.ones_like(y)
for l in range(len(self.net) - 1, 0, -1):
linear_layer = self.net[l]
zbar = torch.matmul(zbar, linear_layer.linear.weight) * linear_layer.deriv_activation_function(self.zs[l-1])
linear_layer = self.net[0]
zbar = torch.matmul(zbar, linear_layer.linear.weight)
xbar = zbar
# dz[L] / dx
return xbar
The backprop method just takes the differential of the output (y) with respect to itself (i.e. 1) and passes it from the last to the first layer of the neural network (the author called it twin_net).
During training, as shown below, I get this error.
RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time.
How can I correct it? I was told here to play with .detach(), but I’m not sure which variable to play with.
len_dl = len(dataloader)
running_loss = 0
r_y, r_dy = 0, 0
for batch in dataloader:
x, y, dy = batch
optimizer.zero_grad()
# Forward pass
y_pred, x = model(x)
# Compute gradient
dy_pred = model.backprop(y_pred)
# Compute Loss
l_y = criterion(y_pred.squeeze(), y)
l_dy = criterion(dy, dy_pred)
loss = l_y + l_dy
running_loss += loss.item()
r_y += l_y.item()
r_dy += l_dy.item()
# Backward pass
loss.backward()
optimizer.step()
running_loss = running_loss/len_dl
r_y = r_y/len_dl
r_dy = r_dy/len_dl