Dear all,
I have some code that works fine on my laptop, where I have PyTorch version 0.4.0a0+94f439c.
Unfortunately, the server has PyTorch 0.4.0a0+533beab, and when I try to run my code there I get this error: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation.
I think this could be related to the fact that PyTorch no longer uses the Variable interface, but I'm wondering if there is a guide on how to migrate the code (I put a sketch of what I think the migration looks like right after this paragraph).
Moreover, as you can see, my code contains a large number of in-place operations, so I'm wondering if there is a way to pinpoint which one causes the error.
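For reference, this is my guess at how the buffer allocations in my forward pass should look after the Variable/Tensor merge. The torch.zeros device keyword and the .device attribute are my assumptions about the post-merge API; I have not verified them on either build:

# My guess at the Variable-free allocation style (untested assumption).
# This would go inside forward, once batch_size/time_steps are known.
device = next(self.parameters()).device
node_output = torch.zeros(batch_size, time_steps, self.hidden_dim, device=device)
outputs = torch.zeros(batch_size, time_steps, self.output_dim, device=device)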
import torch
import torch.nn as nn
from torch.autograd import Variable

# BaseNet, LayerNorm and TempSoftmax are my own modules, defined elsewhere.

class JordanRNNJointAttention(BaseNet):
    def __init__(self, input_dim, hidden_dim, output_dim, n_head, time_window, temperature=1, dropout_prob=0.1):
        super(JordanRNNJointAttention, self).__init__()
        # GRU cell for the node itself; its input is augmented with the 3 recurrent error features
        self.NodeRNN = nn.GRUCell(input_dim + 3, hidden_dim)
        self.NodeRNN_trans = nn.Sequential(nn.Dropout(dropout_prob),
                                           nn.ELU())
        self.NeighborRNN = nn.GRUCell(input_dim, hidden_dim)
        self.NeighborRNN_trans = nn.Sequential(nn.Dropout(dropout_prob),
                                               nn.ELU())
        self.attention = FeatureTransformerLayer(n_head, hidden_dim, hidden_dim, temperature=temperature, dropout=dropout_prob)
        self.prj = nn.Sequential(nn.Linear(hidden_dim, output_dim))
        # nn.Sequential(nn.Linear(hidden_dim, hidden_dim // 2),
        #               nn.Tanh(),
        #               nn.Dropout(dropout_prob),
        #               nn.Linear(hidden_dim // 2, output_dim))
        self.name = "Jordan_RNN" + self.attention.name
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.nlayers = 1
        self.time_window = time_window
        self.n_head = n_head
    def forward(self, node_input, neighbors_input, node_hidden, neighbors_hidden, edge_weights, mask_neight, mask_time, target):
        use_cuda = next(self.parameters()).is_cuda
        batch_size, neigh_number, time_steps, input_dim = neighbors_input.size()
        # fold the neighbor dimension into the batch dimension
        neighbors_input = torch.cat(torch.split(neighbors_input, 1, dim=1), dim=0)[:, 0]
        # pre-allocated output buffers, filled in-place inside the time loop
        node_output = Variable(torch.FloatTensor(batch_size, time_steps, self.hidden_dim).zero_())
        neighbors_output = Variable(torch.FloatTensor(batch_size, neigh_number, time_steps, self.hidden_dim).zero_())
        outputs = Variable(torch.FloatTensor(batch_size, time_steps, self.output_dim).zero_())
        rec_outputs = Variable(torch.FloatTensor(batch_size, time_steps, 3).zero_())
        attentions = torch.FloatTensor(batch_size, time_steps, (neigh_number + 1) * time_steps).zero_()
        if use_cuda:
            node_output = node_output.cuda()
            neighbors_output = neighbors_output.cuda()
            outputs = outputs.cuda()
            rec_outputs = rec_outputs.cuda()
            attentions = attentions.cuda()
        # build the attention mask covering the node itself plus all neighbors
        att_mask = mask_time.unsqueeze(-2).repeat(1, 1, neigh_number, 1)
        mask_neight = mask_neight.unsqueeze(-2).unsqueeze(-1).repeat(1, time_steps, 1, time_steps)
        att_mask = att_mask.masked_fill_(mask_neight, 1).view(batch_size, time_steps, -1)
        att_mask = torch.cat((mask_time, att_mask), dim=-1)
        for i in range(time_steps):
            # node recurrence: previous prediction errors are fed back as extra inputs
            node_hidden = self.NodeRNN(torch.cat((node_input[:, i], rec_outputs[:, i]), dim=-1), node_hidden)
            node_enc = self.NodeRNN_trans(node_hidden)
            node_output[:, i] = node_enc
            neighbors_hidden = self.NeighborRNN(neighbors_input[:, i], neighbors_hidden)
            neighbors_enc = self.NeighborRNN_trans(neighbors_hidden)
            # restore the neighbor dimension that was folded into the batch
            neighbors_enc = torch.stack(torch.chunk(neighbors_enc, neigh_number, dim=0), dim=1)
            neighbors_output[:, :, i] = neighbors_enc
            output, attention = self.attention(node_output, neighbors_output, i, att_mask)
            output = self.prj(output)
            outputs[:, i] = output
            attentions[:, i] = attention.squeeze()
            # external (Jordan-style) recurrent connection: feed the prediction error
            # over the last up-to-3 steps into the next step's input
            upper_bound = i + 1
            lower_bound = i - 2 if i >= 3 else 0
            diff = (outputs[:, lower_bound:upper_bound, 0] - target[:, lower_bound:upper_bound, 0])
            if i < 2:
                rec_outputs[:, i + 1, :upper_bound] = diff
            elif i < time_steps - 1:
                rec_outputs[:, i + 1] = diff
        return outputs, attentions
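One rewrite I am tempted to try, in case the index assignments into the pre-allocated buffers are the culprit, is to collect the per-step results in plain Python lists and stack them once after the loop. This is only a sketch of the pattern, not something I have tested on either build:

# Sketch of out-of-place accumulation (my untested idea, not the current code):
step_outputs = []
for i in range(time_steps):
    # ... same per-step computation as above ...
    step_outputs.append(output)             # instead of outputs[:, i] = output
outputs = torch.stack(step_outputs, dim=1)  # (batch_size, time_steps, output_dim)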
class FeatureTransformerLayer(nn.Module):
    def __init__(self, n_head, input_dim, hidden_dim, temperature=1, dropout=0):
        super(FeatureTransformerLayer, self).__init__()
        self.name = "_FeatureTransformerAttention"
        self.n_head = n_head
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # per-head projection weights for queries, keys and values
        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))
        self.slf_attn = FeatureMultiHeadAttention(n_head, hidden_dim, temp=temperature, dropout=dropout)
        self.layer_norm = LayerNorm(hidden_dim)
    def forward(self, node_enc, neigh_enc, current_time_step, attn_mask):
        batch_size, neigh_number, time_steps, input_dim = neigh_enc.size()
        # query: the node encoding at the current time step only
        q = node_enc[:, current_time_step:current_time_step + 1]
        # keys/values: node encodings concatenated with all flattened neighbor encodings
        k = torch.cat((node_enc, torch.cat(torch.split(neigh_enc, 1, dim=1), dim=2)[:, 0]), dim=1)
        v = torch.cat((node_enc, torch.cat(torch.split(neigh_enc, 1, dim=1), dim=2)[:, 0]), dim=1)
        q_s = q.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_q) x d_model
        k_s = k.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_k) x d_model
        v_s = v.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_v) x d_model
        attn_mask = attn_mask.repeat(self.n_head, 1, 1)
        q_s = torch.bmm(q_s, self.w_qs).view(self.n_head * batch_size, -1, self.hidden_dim)  # (n_head*mb_size) x 1 x hidden_dim
        k_s = torch.bmm(k_s, self.w_ks).view(self.n_head * batch_size, -1, self.hidden_dim)  # (n_head*mb_size) x ((max_neighbors+1)*seq_len) x hidden_dim
        v_s = torch.bmm(v_s, self.w_vs).view(self.n_head * batch_size, -1, self.hidden_dim)  # (n_head*mb_size) x ((max_neighbors+1)*seq_len) x hidden_dim
        output, slf_attn = self.slf_attn(q_s, k_s, v_s, batch_size, attn_mask[:, current_time_step:current_time_step + 1])
        # residual connection followed by layer normalization
        output = self.layer_norm(output + node_enc[:, current_time_step])
        # output = self.pos_ffn(output)
        return output, slf_attn
class FeatureMultiHeadAttention(nn.Module):
    def __init__(self, n_head, hidden_dim, temp=1, dropout=0):
        super(FeatureMultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.hidden_dim = hidden_dim
        self.softmax = TempSoftmax(temp)
        # project the concatenated heads back down to hidden_dim
        self.prj = nn.Sequential(nn.Linear(hidden_dim * n_head, hidden_dim),
                                 nn.Dropout(dropout),
                                 nn.ELU())
    def forward(self, q, k, v, batch_size, attn_mask):
        # attention scores from the query/key dot products
        S = torch.bmm(q, k.transpose(1, 2))
        S.data.masked_fill_(attn_mask, -float('inf'))
        # S /= (self.temperature * (self.hidden_dim ** 0.5))
        A = self.softmax(S)
        output = torch.bmm(A, v).squeeze()
        # concatenate the heads back along the feature dimension
        output = torch.cat(torch.split(output, batch_size, dim=0), dim=-1)
        output = self.prj(output)
        # return the output and the attention weights summed over heads
        return output, torch.stack(torch.split(A.data, batch_size, dim=0), dim=0).sum(0)
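I also wonder whether the masking step through S.data matters here, since writing through .data hides the in-place fill from autograd. An out-of-place version would look like this (again, just my guess, untested on either build):

# Out-of-place alternative I am considering (untested):
S = S.masked_fill(attn_mask, -float('inf'))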
Thanks,
Sandro