Error: One of the variables needed for gradient computation has been modified by an inplace operation, but cannot find any inplace in the code


I am trying to implement an RNN class. The forward() function below shows error that “one of the variables needed for gradient computation has been modified by an inplace operation”. However I cannot find any inplace in those lines of code. Does anyone see any bug here? Thanks in advance.

class RNN(nn.Module):  # Implement a stacked vanilla RNN with Tanh nonlinearities.
    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        super(RNN, self).__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp = 1 - dp_keep_prob
        self.batch_size = batch_size

        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)

        self.first_hidden = nn.Linear(self.emb_size, self.hidden_size, True)

        hidden = nn.Linear(self.hidden_size, self.hidden_size, False)
        self.hiddens = clones(hidden, self.num_layers - 1)

        previous_hidden = nn.Linear(self.hidden_size, self.hidden_size, True)
        self.previous_hiddens = clones(previous_hidden, self.num_layers)

        self.output = nn.Linear(self.hidden_size, self.vocab_size, True)

    def forward(self, inputs, hidden):
        logits = torch.zeros([self.seq_len, self.batch_size, self.vocab_size], requires_grad=True).cuda()

        embedded_inputs = self.embedding(inputs).cuda()

        for timestep in range(self.seq_len):

            emb_to_first_hidden = self.first_hidden(embedded_inputs[timestep])
            emb_to_first_hidden = F.dropout(emb_to_first_hidden, self.dp)
            previous_to_first_hidden = self.previous_hiddens[0](hidden[0])
            hidden[0] = F.tanh(emb_to_first_hidden + previous_to_first_hidden)

            for i in range(self.num_layers - 1):

                feed_from_last_layer = self.hiddens[i](hidden[i])
                feed_from_last_layer = F.dropout(feed_from_last_layer, self.dp)

                feed_from_last_time = self.previous_hiddens[i+1](hidden[i+1])
                hidden[i+1] = F.tanh(feed_from_last_layer + feed_from_last_time)

            last_layer_to_output = self.output(hidden[self.num_layers - 1])
            last_layer_to_output = F.dropout(last_layer_to_output, self.dp)
            logits[timestep] = last_layer_to_output

        return logits, hidden

I found that the line

hidden[0] = F.tanh(emb_to_first_hidden + previous_to_first_hidden)

cause error because of the assignment operator.