Problem with backpropagation

Hey guys, I am facing a problem with a custom network of mine. It is basically a multi-layer perceptron, but for each layer's weight matrix W, I factorise its flattened version as vec(W) = Uv, where U is a given matrix and I only want to learn v.
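
To make the shapes concrete, here is a minimal sketch of that parameterisation (the dimensions are made up for illustration):

import torch

in_features, out_features, filter_dim = 4, 3, 2
U = torch.randn(in_features * out_features, filter_dim)  # given, fixed
v = torch.randn(filter_dim, requires_grad=True)          # the only learnable part

# vec(W) = Uv, reshaped back into the weight matrix
W = (U @ v).view(out_features, in_features)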

I want to do multidimensional regression, so I am using the usual MSE loss. During training I get the following error:

“Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time.”

If I set retain_graph=True, it runs OK, but I see no reason why I should get this error: I am only backpropagating once per batch. Any ideas on why this might occur?
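
For context, I understand this error usually appears when the same graph is backpropagated through twice, as in this toy case:

import torch

w = torch.randn(3, requires_grad=True)
y = (w * w).sum()  # mul saves its inputs for the backward pass

y.backward()  # first call frees the saved intermediate results
y.backward()  # raises the "backward through the graph a second time" error

But in my training loop the forward pass is rerun for every batch, so I don't see where a graph could be reused.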

Here is my code for the layer and the network. The details of how I create the matrix U are not important.

import torch


class InvariantLinear(torch.nn.Module):
    def __init__(self, in_features, out_features, filter_dim, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.filter_dim = filter_dim

        # Only v and the mixing coefficient pi are learnable; U_conv and
        # U_lin are fixed.
        self.v = torch.nn.Parameter(torch.randn(filter_dim))
        self.bias = torch.nn.Parameter(torch.randn(out_features)) if bias else None
        l = self.in_features * self.out_features
        self.U_conv = torch.empty(l, filter_dim)
        self.U_lin = torch.empty(l, filter_dim)
        self.pi = torch.nn.Parameter(torch.tensor(0.5))

        # Stack identity blocks for U_lin and alternating identity/zero
        # blocks for U_conv. (If l is not divisible by filter_dim, the
        # trailing rows stay uninitialized.)
        for i in range(l // filter_dim):
            self.U_lin[filter_dim * i:filter_dim * (i + 1), :] = torch.eye(filter_dim)
            if i % 2 == 0:
                self.U_conv[filter_dim * i:filter_dim * (i + 1), :] = torch.eye(filter_dim)
            else:
                self.U_conv[filter_dim * i:filter_dim * (i + 1), :] = torch.zeros(filter_dim, filter_dim)

    def forward(self, x):
        # Interpolate between the two fixed bases, then build the weight
        # matrix from the learned filter v.
        U = self.pi * self.U_conv + (1 - self.pi) * self.U_lin
        weight = torch.matmul(U, self.v).view(self.out_features, self.in_features)
        output = x @ weight.t()
        if self.bias is not None:
            output = output + self.bias
        return output


class Densenet(torch.nn.Module):

    def __init__(self, input_size, output_size, filter_dim, width, num_layers):
        super().__init__()
        # note: num_layers is currently unused; the depth is hard-coded below
        self.input_layer = InvariantLinear(input_size, width, filter_dim)
        self.hidden_layer1 = InvariantLinear(width, width, filter_dim)
        self.hidden_layer2 = InvariantLinear(width, width, filter_dim)
        self.output_layer = InvariantLinear(width, output_size, filter_dim)
        self.act = torch.nn.ReLU()

    def forward(self, x):
        x = self.act(self.input_layer(x))
        x = self.act(self.hidden_layer1(x))
        x = self.act(self.hidden_layer2(x))
        return self.output_layer(x)
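
A quick sanity check of the layer on its own runs fine (the sizes here are arbitrary):

layer = InvariantLinear(in_features=6, out_features=4, filter_dim=2)
out = layer(torch.randn(8, 6))
print(out.shape)  # torch.Size([8, 4])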


Hey @nmourdou (if you still need help): do you have a runnable version where the error can be reproduced?


Sure. Here it is:

import torch
from torch.optim import Adam
from tqdm import trange
from torch.nn import functional as F



class Conv1_d(torch.nn.Module):

    def __init__(self, inputs, outputs, kernel):
        super().__init__()
        self.act = torch.nn.ReLU()  # unused in forward
        self.layer1 = torch.nn.Conv1d(inputs, outputs, kernel_size=kernel, bias=False, stride=1)

    def forward(self, x):
        return self.layer1(x)


class Linear(torch.nn.Module):

    def __init__(self, inputs, outputs):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=inputs, out_features=outputs)
        self.act = torch.nn.ReLU()  # unused in forward

    def forward(self, x):
        return self.linear(x)



class InvariantLinear(torch.nn.Module):
    def __init__(self, in_features, out_features, filter_dim, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.filter_dim = filter_dim

        # Only v and the mixing coefficient pi are learnable; U_conv and
        # U_lin are fixed.
        self.v = torch.nn.Parameter(torch.randn(filter_dim))
        self.bias = torch.nn.Parameter(torch.randn(out_features)) if bias else None
        l = self.in_features * self.out_features
        self.U_conv = torch.empty(l, filter_dim)
        self.U_lin = torch.empty(l, filter_dim)
        self.pi = torch.nn.Parameter(torch.tensor(0.5))

        # Stack identity blocks for U_lin and alternating identity/zero
        # blocks for U_conv. (If l is not divisible by filter_dim, the
        # trailing rows stay uninitialized.)
        for i in range(l // filter_dim):
            self.U_lin[filter_dim * i:filter_dim * (i + 1), :] = torch.eye(filter_dim)
            if i % 2 == 0:
                self.U_conv[filter_dim * i:filter_dim * (i + 1), :] = torch.eye(filter_dim)
            else:
                self.U_conv[filter_dim * i:filter_dim * (i + 1), :] = torch.zeros(filter_dim, filter_dim)

    def forward(self, x):
        # Interpolate between the two fixed bases, then build the weight
        # matrix from the learned filter v.
        U = self.pi * self.U_conv + (1 - self.pi) * self.U_lin
        weight = torch.matmul(U, self.v).view(self.out_features, self.in_features)
        output = x @ weight.t()
        if self.bias is not None:
            output = output + self.bias
        return output


class Densenet(torch.nn.Module):

    def __init__(self, input_size, output_size, filter_dim, width, num_layers):
        super().__init__()
        # note: num_layers is currently unused; the depth is hard-coded below
        self.input_layer = InvariantLinear(input_size, width, filter_dim)
        self.hidden_layer1 = InvariantLinear(width, width, filter_dim)
        self.hidden_layer2 = InvariantLinear(width, width, filter_dim)
        self.output_layer = InvariantLinear(width, output_size, filter_dim)
        self.act = torch.nn.ReLU()

    def forward(self, x):
        x = self.act(self.input_layer(x))
        x = self.act(self.hidden_layer1(x))
        x = self.act(self.hidden_layer2(x))
        return self.output_layer(x)


# input dimension
n = 300
# filter dimension
k = 3
# output dimension
m = n-k+1
# sample size
s = 5120

conv = Conv1_d(1, 1, k)
lin = Linear(n, m)

batch_size = 256



X = torch.normal(0, 1, (s, 1, n))
y_conv = torch.empty(s, m)
y_lin = torch.empty(s, m)

for i in range(s // batch_size):
    y_conv[i*batch_size:(i+1)*batch_size, :] = torch.squeeze(conv(X[i*batch_size:(i+1)*batch_size, :, :]))
    y_lin[i*batch_size:(i+1)*batch_size, :] = torch.squeeze(lin(X[i*batch_size:(i+1)*batch_size, :, :]))


data_conv = torch.utils.data.TensorDataset(torch.squeeze(X), y_conv)
data_lin = torch.utils.data.TensorDataset(torch.squeeze(X), y_lin)


data_conv_train, data_conv_test = torch.utils.data.random_split(data_conv, [int(0.8*s), s-int(0.8*s)])
data_lin_train, data_lin_test = torch.utils.data.random_split(data_lin, [int(0.8*s), s-int(0.8*s)])

train_loader_conv = torch.utils.data.DataLoader(data_conv_train, batch_size=batch_size, shuffle=True)
test_loader_conv = torch.utils.data.DataLoader(data_conv_test, batch_size=batch_size, shuffle=True)
train_loader_lin = torch.utils.data.DataLoader(data_lin_train, batch_size=batch_size, shuffle=True)
test_loader_lin = torch.utils.data.DataLoader(data_lin_test, batch_size=batch_size, shuffle=True)

train_loaders = [train_loader_conv, train_loader_lin]
test_loaders = [test_loader_conv, test_loader_lin]

n_epochs = 10

model_conv = Densenet(n, m, k, 100, 2)
model_lin = Densenet(n, m, k, 100, 2)

models = [model_conv, model_lin]
pbar_update_interval = 100



for idx, model in enumerate(models):
    train_loader = train_loaders[idx]
    optimizer = Adam(model.parameters(), lr=5e-4)
    model.train()
    pbar = trange(n_epochs)

    for _ in pbar:
        # 'step' avoids shadowing k (the filter dimension) from above
        for step, (batch_x, batch_y) in enumerate(train_loader):

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = F.mse_loss(outputs, batch_y)
            loss.backward()
            optimizer.step()

            if step % pbar_update_interval == 0:
                pbar.set_postfix(loss=loss.item())

Thank you!

It seems that when you call .backward(), autograd also backpropagates through your training targets, i.e. batch_y. If you detach batch_y, it should be OK:

        for step, (batch_x, batch_y) in enumerate(train_loader):

            optimizer.zero_grad()
            outputs = model(batch_x)

            # loss = F.mse_loss(outputs, batch_y)
            loss = F.mse_loss(outputs, batch_y.detach())

            loss.backward()
            optimizer.step()

This is because the lin and conv models you used to generate the targets have parameters that require gradients, so the target tensors built from their outputs carry a computation graph as well. The first backward() frees that shared target-generation graph, and the backward pass for the next batch then tries to traverse it again, which triggers the error.
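
You can verify this by checking y_conv.requires_grad, which will be True. As an alternative fix (a sketch along the same lines), you could replace the target-generation loop with one that runs under torch.no_grad(), so the targets never record a graph in the first place:

# the generated targets are attached to conv's / lin's computation graph
print(y_conv.requires_grad)  # True

# alternative: build the targets without recording any graph
with torch.no_grad():
    for i in range(s // batch_size):
        y_conv[i*batch_size:(i+1)*batch_size, :] = torch.squeeze(conv(X[i*batch_size:(i+1)*batch_size, :, :]))
        y_lin[i*batch_size:(i+1)*batch_size, :] = torch.squeeze(lin(X[i*batch_size:(i+1)*batch_size, :, :]))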