Hi,
I am trying to implement an RNN class. When I backpropagate through the forward() function below, I get the error "one of the variables needed for gradient computation has been modified by an inplace operation". However, I cannot find any in-place operation in these lines of code. Does anyone see a bug here? Thanks in advance.
```python
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F


def clones(module, n):
    # Helper used below. I am assuming the usual definition (as in the
    # annotated Transformer): n independent deep copies of a module,
    # wrapped in a ModuleList so their parameters are registered.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


class RNN(nn.Module):
    # Implements a stacked vanilla RNN with tanh nonlinearities.
    def __init__(self, emb_size, hidden_size, seq_len, batch_size,
                 vocab_size, num_layers, dp_keep_prob):
        super(RNN, self).__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp = 1 - dp_keep_prob
        self.batch_size = batch_size
        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        # Input-to-hidden projection for the first layer.
        self.first_hidden = nn.Linear(self.emb_size, self.hidden_size, True)
        # Layer-to-layer projections for layers 1..num_layers-1.
        hidden = nn.Linear(self.hidden_size, self.hidden_size, False)
        self.hiddens = clones(hidden, self.num_layers - 1)
        # Recurrent (previous-timestep) projections, one per layer.
        previous_hidden = nn.Linear(self.hidden_size, self.hidden_size, True)
        self.previous_hiddens = clones(previous_hidden, self.num_layers)
        self.output = nn.Linear(self.hidden_size, self.vocab_size, True)

    def forward(self, inputs, hidden):
        logits = torch.zeros([self.seq_len, self.batch_size, self.vocab_size],
                             requires_grad=True).cuda()
        embedded_inputs = self.embedding(inputs).cuda()
        for timestep in range(self.seq_len):
            # First layer: combine the current input with its previous state.
            emb_to_first_hidden = self.first_hidden(embedded_inputs[timestep])
            emb_to_first_hidden = F.dropout(emb_to_first_hidden, self.dp)
            previous_to_first_hidden = self.previous_hiddens[0](hidden[0])
            hidden[0] = F.tanh(emb_to_first_hidden + previous_to_first_hidden)
            # Remaining layers: combine the layer below with their own
            # previous states.
            for i in range(self.num_layers - 1):
                feed_from_last_layer = self.hiddens[i](hidden[i])
                feed_from_last_layer = F.dropout(feed_from_last_layer, self.dp)
                feed_from_last_time = self.previous_hiddens[i + 1](hidden[i + 1])
                hidden[i + 1] = F.tanh(feed_from_last_layer + feed_from_last_time)
            # Project the top layer's state to vocabulary logits.
            last_layer_to_output = self.output(hidden[self.num_layers - 1])
            last_layer_to_output = F.dropout(last_layer_to_output, self.dp)
            logits[timestep] = last_layer_to_output
        return logits, hidden
```
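In case it helps narrow things down, here is a minimal standalone snippet isolating the one pattern I was unsure about: assigning into a tensor by index (as in `hidden[0] = ...`, `hidden[i+1] = ...`, and `logits[timestep] = ...` above) after that tensor has already been fed through a layer. This is my own reduction, not code from the class, so the names `lin` and `h` are made up:

```python
import torch
import torch.nn as nn

lin = nn.Linear(3, 3)   # nn.Linear saves its input for the weight gradient
h = torch.zeros(2, 3)   # pre-allocated buffer, standing in for `hidden`/`logits`

out = lin(h[0])          # h[0] is a view of h; it is saved for backward here
h[0] = torch.tanh(out)   # indexed assignment writes into h's storage in place,
                         # so the input saved by the Linear layer is modified
h.sum().backward()       # RuntimeError: one of the variables needed for gradient
                         # computation has been modified by an inplace operation
```

The workaround I have seen suggested for this kind of error is to collect the per-timestep results in Python lists and `torch.stack` them after the loop, instead of writing into pre-allocated tensors, but I would still like to confirm that this is actually what is going wrong in my forward() above.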