If I define a network class named AttnDecoderRNN and move it to the GPU with AttnDecoderRNN().cuda(), will all the temporary Variables created inside its forward() function live on the GPU as well, or do I have to create them with .cuda() myself?
import torch
import torch.nn as nn
from torch.autograd import Variable


class Attn(nn.Module):
    def __init__(self, method, dim_hidden):
        super(Attn, self).__init__()
        self.method = method
        self.dim_hidden = dim_hidden
        self.softmax = nn.Softmax(dim=1)
        if self.method == 'general':
            self.attn = nn.Linear(self.dim_hidden, dim_hidden)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.dim_hidden * 2, dim_hidden)
            self.other = nn.Parameter(torch.FloatTensor(dim_hidden, 1))
    def forward(self, hidden, encoder_outputs):
        """
        input:
            hidden: B x H
            encoder_outputs: B x S x H
        return:
            attention weights: B x S
        """
        B, S, H = encoder_outputs.size()
        # Create a Variable to store the attention energies, B x S
        # (this is the temporary Variable the question is about)
        attn_energies = Variable(torch.zeros(B, S)).cuda()
        if self.method == 'dot':
            # B x S x H bmm B x H x 1 -> B x S x 1 -> B x S
            attn_energies = encoder_outputs.bmm(hidden.unsqueeze(2)).squeeze(2)
        elif self.method == 'general':
            for i in range(S):
                energy = self.attn(encoder_outputs[:, i, :]).unsqueeze(1)  # B x 1 x H
                energy = energy.bmm(hidden.unsqueeze(2)).squeeze()         # B
                attn_energies[:, i] = energy
        elif self.method == 'concat':
            for i in range(S):
                energy = self.attn(torch.cat((hidden, encoder_outputs[:, i, :]), 1))  # B x H
                energy = energy.mm(self.other).squeeze()                              # B
                attn_energies[:, i] = energy
        # Normalize the energies to weights in the range (0, 1)
        return self.softmax(attn_energies)
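To make the expected shapes concrete, this is roughly how I exercise Attn on its own (a quick sketch with made-up sizes, B=4, S=10, H=256, just to check the output shape):

# Shape check for Attn alone (made-up sizes)
attn = Attn('general', 256).cuda()
hidden = Variable(torch.randn(4, 256)).cuda()           # B x H
encoder_outputs = Variable(torch.randn(4, 10, 256)).cuda()  # B x S x H
weights = attn(hidden, encoder_outputs)
print(weights.size())  # (4, 10), each row sums to 1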
class AttnDecoderRNN(nn.Module):
    def __init__(self, dim_image, dim_hidden, dim_output, attn_model='general', n_layers=1, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        # Keep parameters for reference
        self.attn_model = attn_model
        self.dim_image = dim_image
        self.dim_hidden = dim_hidden
        self.dim_output = dim_output
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        # Define layers
        self.im2hid = nn.Linear(dim_image, dim_hidden)
        self.embedding = nn.Embedding(dim_output, dim_hidden)
        self.rnn = nn.GRU(dim_hidden * 2, dim_hidden, n_layers, dropout=dropout_p, batch_first=True)
        self.out = nn.Linear(dim_hidden * 2, dim_output)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.attn = Attn(attn_model, dim_hidden)
    def forward(self, input_seq, last_context, encoder_outputs, last_hidden):
        """
        input:
            input_seq: B x 1
            last_context: B x N
            last_hidden: B x n_layers*directions(=1) x N
            encoder_outputs: B x S x dim_image
        return:
            output: B x O
            context: B x N
            hidden: B x n_layers*directions(=1) x N
            attn_weights: B x 1 x S
        """
        # nn.GRU expects the hidden state as (n_layers*directions) x B x N
        last_hidden = last_hidden.transpose(0, 1)
        # Note: we run this one step at a time
        last_context = last_context.unsqueeze(1)  # B x 1 x N
        # Project the image features into the hidden space: B x S x N
        encoder_outputs = self.im2hid(encoder_outputs)
        # Get the embedding of the current input word (the last output word)
        word_embedded = self.embedding(input_seq)  # B x 1 x N
        # Combine the embedded word and the last context, run through the RNN
        rnn_input = torch.cat((word_embedded, last_context), 2)  # B x 1 x 2N
        rnn_output, hidden = self.rnn(rnn_input, last_hidden)
        # Calculate attention from the current RNN state and all encoder outputs,
        # then apply it to the encoder outputs to get the new context
        attn_weights = self.attn(rnn_output.squeeze(1), encoder_outputs).unsqueeze(1)  # B x 1 x S
        context = attn_weights.bmm(encoder_outputs)  # B x 1 x N
        # Final output layer (next-word prediction) from the RNN state and context
        rnn_output = rnn_output.squeeze(1)  # B x 1 x N -> B x N
        context = context.squeeze(1)        # B x 1 x N -> B x N
        output = self.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
        hidden = hidden.transpose(0, 1)  # back to B x n_layers*directions x N
        # Return the final output, context, hidden state, and attention weights (for visualization)
        return output, context, hidden, attn_weights
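And this is roughly how I drive a single decoding step of the full model, which is where the .cuda() question comes from (again a sketch with made-up sizes, B=4, S=10, dim_image=512, dim_hidden=256, dim_output=1000):

# One decoding step (made-up sizes)
decoder = AttnDecoderRNN(dim_image=512, dim_hidden=256, dim_output=1000).cuda()
input_seq = Variable(torch.LongTensor(4, 1).zero_()).cuda()     # B x 1 word indices
last_context = Variable(torch.zeros(4, 256)).cuda()             # B x N
last_hidden = Variable(torch.zeros(4, 1, 256)).cuda()           # B x n_layers x N
encoder_outputs = Variable(torch.randn(4, 10, 512)).cuda()      # B x S x dim_image
output, context, hidden, attn_weights = decoder(input_seq, last_context, encoder_outputs, last_hidden)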
I want to run this code on the GPU. The documentation says that .cuda() “Moves all model parameters and buffers to the GPU”, but my attn_energies isn’t created with nn.Parameter. Should I call .cuda() when I create attn_energies in Attn.forward()?
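Or is the cleaner pattern to avoid hard-coding .cuda() in forward() altogether, and instead allocate the temporary from an input tensor so it inherits that tensor's type and device? Something like this (a sketch using Tensor.new()):

# Inside Attn.forward(), instead of Variable(torch.zeros(B, S)).cuda():
# .new(B, S) allocates a tensor of the same type and device as encoder_outputs
attn_energies = Variable(encoder_outputs.data.new(B, S).zero_())

That way the same module would run on CPU or GPU depending only on where its inputs live. Is that the recommended approach?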