In-place operation error with GRU

import torch
from torch import nn

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=False)
        self.softmax = nn.LogSoftmax(dim=2)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, h):
        out, h = self.gru(x, h)
        out = self.h2o(out)
        out = self.relu(out)
        out = self.softmax(out)
        return out, h

    def init_hidden(self, batch_size, device):
        weight = next(self.parameters()).data
        hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device)
        return hidden

The line out, h = self.gru(x, h) raises this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I have been stuck on this for five straight hours,
and I can't see any in-place operation in that code.

Hi,

It seems to work fine for me when I run your module:

import torch
from torch import nn

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=False)
        self.softmax = nn.LogSoftmax(dim=2)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, h):
        out, h = self.gru(x, h)
        out = self.h2o(out)
        out = self.relu(out)
        out = self.softmax(out)
        return out, h

    def init_hidden(self, batch_size, device):
        weight = next(self.parameters()).data
        hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device)
        return hidden

mod = GRU(4, 3, 3, 2)

inp = torch.rand(3, 1, 4)
h = torch.rand(2, 1, 3)

out, new_h = mod(inp, h)

out.sum().backward()

Could you please provide a code sample that reproduces the issue?

Oh my… you are my savior.
Here is the code… but don't laugh.

This is the train function:

def train(target, label):
    hidden = model.init_hidden(batch_size, device)
    output = 0
    loss = 0

    for i in range(len(target)):
        optimizer.zero_grad()
        output, hidden = model(target[i], hidden)
        loss = criterion(output, label[i].argmax(1)).to(device)
        loss.backward(retain_graph=True)
        optimizer.step()

        for j in range(0, batch_size):
            sys.stdout.write("batch " + str(j + 1) + " : ")
            for k in range(0, seq_len):
                expected = torch.argmax(target[i][k][j])
                sys.stdout.write(idx_to_word[expected.item()] + " ")

            sys.stdout.write(" -> ")
            for k in range(0, seq_len):
                expected = torch.argmax(output[k][j])
                sys.stdout.write(idx_to_word[expected.item()] + " ")
            sys.stdout.write("\n")
        sys.stdout.write("\n")

    return output, loss.item()

This is the loop that repeats the training:

for iter in range(1, n_iter + 1):
    now_epoch = 0

    while now_epoch + batch_size <= len(data):
        target = make_batch(data[now_epoch:now_epoch + batch_size])

        label = target[1:len(target)] + [torch.zeros(seq_len, batch_size, input_size)]

        output, loss = train(target, label)

        if iter % print_every == 0:
            sys.stdout.write("%d %d%% (%s) %.4f" % (iter, iter/n_iter*100, time_since(start), loss))

        if iter % plot_every == 0:
            losses.append(cur_loss/plot_every)
            cur_loss = 0

        now_epoch += batch_size

Just in case, this is how I make a batch:

def make_batch(docs):
    target = []
    now_word = 0
    flag = True

    while flag:
        flag = False
        one_hot_vector = torch.zeros(seq_len, batch_size, input_size)
        for i, doc in enumerate(docs):
            for j in range(0, seq_len):
                try:
                    word = doc[now_word + j]
                    one_hot_vector[j][i][word_to_idx[word]] = 1
                    flag = True
                except IndexError:
                    break
        target.append(one_hot_vector.to(device))
        now_word += seq_len

    return target

I really, really thank you for your answer…

Oh, by the way, it is a text generator using a GRU model.

Thanks for the code, but I can't really run that. Could you modify it to take random Tensors as input and make it a single script?

One thing I do see is that you reuse the hidden state directly across iterations. Maybe you need to clone it? It is quite hard to say without running the code.
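
For example, something along these lines in your training loop (just an untested sketch, with the printing omitted; detach() is one way to stop backward() from reaching into the graphs of previous iterations, whose parameters optimizer.step() has already updated in-place):

def train(target, label):
    hidden = model.init_hidden(batch_size, device)

    for i in range(len(target)):
        optimizer.zero_grad()
        output, hidden = model(target[i], hidden)
        # cut the graph here so the next backward() only sees this step
        hidden = hidden.detach()
        loss = criterion(output, label[i].argmax(1))
        loss.backward()  # retain_graph=True should no longer be needed
        optimizer.step()

    return output, loss.item()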

Thank you for your answer. I've resolved the error by backpropagating the loss just once per for loop. Thank you.
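
In case anyone else runs into this, the change looks roughly like this (a sketch of the idea, not my exact code): the per-step losses are summed and backward() is called a single time, so no parameter is modified in-place between building the graph and backpropagating through it.

def train(target, label):
    hidden = model.init_hidden(batch_size, device)
    total_loss = 0

    optimizer.zero_grad()
    for i in range(len(target)):
        output, hidden = model(target[i], hidden)
        # accumulate the loss instead of calling backward() every step
        total_loss = total_loss + criterion(output, label[i].argmax(1))

    # a single backward pass over the whole unrolled sequence;
    # retain_graph=True is not needed anymore
    total_loss.backward()
    optimizer.step()

    return output, total_loss.item()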