Autograd RuntimeError in version 0.4.0a0+533beab, but not in version 0.4.0a0+94f439c

Dear all,

I have some code that works fine on my laptop, where I have PyTorch version 0.4.0a0+94f439c.

Unfortunately, on the server I have PyTorch 0.4.0a0+533beab, and when I try to run my code there I get this error: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation.

I think this could be related to the fact that PyTorch no longer uses the Variable interface, but I'm wondering whether there is a guide on how to migrate the code.
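For example, from what I read about 0.4.0, Variable and Tensor have been merged and the factory functions take a device argument, so I guess buffers like the ones in my forward() would be created roughly like this (just my understanding of the changes, not tested):

# what my code does now: wrap a zeroed FloatTensor in a Variable and move it to the GPU
node_output = Variable(torch.FloatTensor(batch_size, time_steps, self.hidden_dim).zero_())
if use_cuda:
    node_output = node_output.cuda()

# what I think the 0.4.0 style is: create the tensor directly on the right device
device = next(self.parameters()).device
node_output = torch.zeros(batch_size, time_steps, self.hidden_dim, device=device)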

Moreover, as you can see, my code contains a large number of in-place operations, so I'm wondering if there is a way to work out which one is causing the error.

class JordanRNNJointAttention(BaseNet):
    def __init__(self, input_dim, hidden_dim, output_dim, n_head, time_window, temperature=1, dropout_prob=0.1):
        super(JordanRNNJointAttention, self).__init__()

        self.NodeRNN = nn.GRUCell(input_dim+3, hidden_dim)
        self.NodeRNN_trans = nn.Sequential(nn.Dropout(dropout_prob),
                                           nn.ELU())

        self.NeighborRNN = nn.GRUCell(input_dim, hidden_dim)
        self.NeighborRNN_trans = nn.Sequential(nn.Dropout(dropout_prob),
                                               nn.ELU())

        self.attention = FeatureTransformerLayer(n_head, hidden_dim, hidden_dim, temperature=temperature, dropout=dropout_prob)
        self.prj = nn.Sequential(nn.Linear(hidden_dim, output_dim))

        # nn.Sequential(nn.Linear(hidden_dim, hidden_dim // 2),
        #               nn.Tanh(),
        #               nn.Dropout(dropout_prob),
        #               nn.Linear(hidden_dim // 2, output_dim))

        self.name = "Jordan_RNN" + self.attention.name


        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.nlayers = 1
        self.time_window = time_window
        self.n_head = n_head


    def forward(self, node_input, neighbors_input, node_hidden, neighbors_hidden, edge_weights, mask_neight, mask_time, target):
        
        use_cuda = next(self.parameters()).is_cuda
        batch_size, neigh_number, time_steps, input_dim = neighbors_input.size()

        neighbors_input = torch.cat(torch.split(neighbors_input, 1, dim=1), dim=0)[:, 0]
        node_output = Variable(torch.FloatTensor(batch_size, time_steps, self.hidden_dim).zero_())
        neighbors_output = Variable(torch.FloatTensor(batch_size, neigh_number, time_steps, self.hidden_dim).zero_())
        outputs = Variable(torch.FloatTensor(batch_size, time_steps, self.output_dim).zero_())
        rec_outputs = Variable(torch.FloatTensor(batch_size, time_steps, 3).zero_())
        attentions = torch.FloatTensor(batch_size, time_steps, (neigh_number+1) * time_steps).zero_()

        if use_cuda:
            node_output = node_output.cuda()
            neighbors_output = neighbors_output.cuda()
            outputs = outputs.cuda()
            rec_outputs = rec_outputs.cuda()
            attentions = attentions.cuda()

        att_mask = mask_time.unsqueeze(-2).repeat(1, 1, neigh_number, 1)
        mask_neight = mask_neight.unsqueeze(-2).unsqueeze(-1).repeat(1, time_steps, 1, time_steps)
        att_mask = att_mask.masked_fill_(mask_neight, 1).view(batch_size, time_steps, -1)
        att_mask = torch.cat((mask_time, att_mask), dim=-1)


        for i in range(time_steps):
            node_hidden = self.NodeRNN(torch.cat((node_input[:, i], rec_outputs[:, i]), dim=-1), node_hidden)
            node_enc = self.NodeRNN_trans(node_hidden)
            node_output[:, i] = node_enc

            neighbors_hidden = self.NeighborRNN(neighbors_input[:, i], neighbors_hidden)
            neighbors_enc = self.NeighborRNN_trans(neighbors_hidden)
            neighbors_enc = torch.stack(torch.chunk(neighbors_enc, neigh_number, dim=0), dim=1)
            neighbors_output[:, :, i] = neighbors_enc


            output, attention = self.attention(node_output, neighbors_output, i, att_mask)
            output = self.prj(output)

            outputs[:, i] = output
            attentions[:, i] = attention.squeeze()
            
            # external recurrent connection
            upper_bound = i+1
            lower_bound = i-2 if i >= 3 else 0
            diff = (outputs[:, lower_bound:upper_bound, 0] - target[:, lower_bound:upper_bound, 0])
            if i < 2:
                rec_outputs[:, i+1, :upper_bound] = diff
            elif i < time_steps-1:
                rec_outputs[:, i+1] = diff

        return outputs, attentions

class FeatureTransformerLayer(nn.Module):
    def __init__(self, n_head, input_dim, hidden_dim, temperature=1, dropout=0):
        super(FeatureTransformerLayer, self).__init__()
        self.name = "_FeatureTransformerAttention"
        self.n_head = n_head
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim


        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, input_dim, hidden_dim))

        self.slf_attn = FeatureMultiHeadAttention(n_head, hidden_dim, temp=temperature, dropout=dropout)
        self.layer_norm = LayerNorm(hidden_dim)

    def forward(self, node_enc, neigh_enc, current_time_step, attn_mask):
        batch_size, neigh_number, time_steps, input_dim = neigh_enc.size()

        q = node_enc[:, current_time_step:current_time_step+1]
        k = torch.cat((node_enc, torch.cat(torch.split(neigh_enc, 1, dim=1), dim=2)[:, 0]), dim=1)
        v = torch.cat((node_enc, torch.cat(torch.split(neigh_enc, 1, dim=1), dim=2)[:, 0]), dim=1)

        q_s = q.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_q) x d_model
        k_s = k.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_k) x d_model
        v_s = v.repeat(self.n_head, 1, 1).view(self.n_head, -1, self.input_dim)  # n_head x (mb_size*len_v) x d_model
        attn_mask = attn_mask.repeat(self.n_head, 1, 1)


        q_s = torch.bmm(q_s, self.w_qs).view(self.n_head * batch_size, -1, self.hidden_dim)  # (n_head*mb_size*max_neighbors+1) x 1 x hidden_dim
        k_s = torch.bmm(k_s, self.w_ks).view(self.n_head * batch_size, -1, self.hidden_dim)  # (n_head*mb_size*max_neighbors+1) x seq_len x hidden_dim
        v_s = torch.bmm(v_s, self.w_vs).view(self.n_head * batch_size, -1, self.hidden_dim)  # n_head*batch_size, (max_neighbors+1)*seq_len, hidden_dim

        output, slf_attn = self.slf_attn(q_s, k_s, v_s, batch_size, attn_mask[:, current_time_step:current_time_step+1])
        output = self.layer_norm(output + node_enc[:, current_time_step])
        # output = self.pos_ffn(output)


        return output, slf_attn

class FeatureMultiHeadAttention(nn.Module):
    def __init__(self, n_head, hidden_dim, temp=1, dropout=0):
        super(FeatureMultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.hidden_dim = hidden_dim

        self.softmax = TempSoftmax(temp)
        self.prj = nn.Sequential(nn.Linear(hidden_dim * n_head, hidden_dim),
                                 nn.Dropout(dropout),
                                 nn.ELU())


    def forward(self, q, k, v, batch_size, attn_mask):
        S = torch.bmm(q, k.transpose(1, 2))
        S.data.masked_fill_(attn_mask, -float('inf'))

        # S /= (self.temperature * (self.hidden_dim ** 0.5))
        A = self.softmax(S)
        output = torch.bmm(A, v).squeeze()
        output = torch.cat(torch.split(output, batch_size, dim=0), dim=-1)
        output = self.prj(output)
        return output, torch.stack(torch.split(A.data, batch_size, dim=0), dim=0).sum(0)

Thanks,
Sandro

Have a look at the Migration Guide.
Also, you should update to the stable 0.4.0 release using the instructions on the website.
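
Regarding the RuntimeError itself: it is raised when an in-place operation (for example an indexed assignment such as outputs[:, i] = output) modifies a tensor whose value autograd has saved for the backward pass. Here is a minimal, self-contained toy sketch, not a drop-in fix for your model, showing how the error is typically triggered and one common way to restructure step-by-step loops so that the per-step results are built out of place:

import torch
from torch import nn

# 1) Canonical trigger for the error: modify, in place, a tensor whose value
#    autograd saved for the backward pass (exp() saves its output).
x = torch.randn(3, requires_grad=True)
y = x.exp()
y[0] = 0.0                                       # in-place write bumps y's version counter
try:
    y.sum().backward()
except RuntimeError as err:
    print(err)                                   # "... has been modified by an inplace operation"

# 2) Common workaround for RNN-style loops that fill a pre-allocated buffer:
#    collect the per-step results in a Python list and stack them at the end.
rnn = nn.GRUCell(4, 8)
inp = torch.randn(2, 5, 4, requires_grad=True)   # (batch, time, features)
h = torch.zeros(2, 8)
steps = []
for t in range(5):
    h = rnn(inp[:, t], h)
    steps.append(h)                              # out of place: nothing already saved gets overwritten
outputs = torch.stack(steps, dim=1)              # (batch, time, hidden)
outputs.sum().backward()

In your forward(), the slice assignments into node_output, neighbors_output, outputs and rec_outputs are what I would check first, since those buffers are read again by the attention layer and by later iterations after being written; replacing them with list accumulation, or cloning a slice before it is both read and written, usually resolves this error.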