loss.backward() does not work due to masking

I am writing an encoder-decoder model.
Now I am adding masking.

Before adding it, the code was working.

But after adding the lines marked "# -> add", my code doesn't work.

Could you help me?

import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder_Decoder(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Encoder_Decoder, self).__init__()
        # batch_size = 50 ,  hidden_size = 256
        self.embed_input = nn.Embedding(input_size, hidden_size, padding_idx=0) 
        self.embed_target = nn.Embedding(output_size, hidden_size, padding_idx=0)

        self.lstm_input = nn.LSTMCell(hidden_size, hidden_size)
        self.lstm_target = nn.LSTMCell(hidden_size, hidden_size)

        self.linear = nn.Linear(hidden_size, output_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

    def create_mask(self, input_sentence_words):
        mask = input_sentence_words.eq(0)
        return mask

    def forward(self, input_lines, target_lines):
        global all_loss
        hx = torch.zeros(batch_size, self.hidden_size).cuda()
        cx = torch.zeros(batch_size, self.hidden_size).cuda()

        for input_sentence_words in input_lines:
            before_hx = hx
            before_cx = cx
            input_k = self.embed_input(input_sentence_words)
            hx, cx = self.lstm_input(input_k, (hx, cx) )
            # mask = self.create_mask(input_sentence_words) -> add
            # indices = mask.nonzero() -> add
            # hx[indices]= before_hx[indices] -> add
            # cx[indices] =  before_cx[indices] -> add
        target_lines_not_last = target_lines[:(padding_num-1)]
        target_lines_next = target_lines[1:]
        loss = 0
        k = 0
        for target_sentence_words , target_sentence_words_next in zip(target_lines_not_last, target_lines_next):
            target_k = self.embed_target(target_sentence_words)
            k += 1
            print(k)
            print("target_k", target_k)

            hx, cx = self.lstm_target(target_k, (hx, cx) )
            print("hx", hx)
            print("hx size", hx.size())
            print("cx", cx)
            print("cx size", cx.size())
            print("target_sentence_words_next", target_sentence_words_next)
            print("---------------")
            loss += F.cross_entropy(self.linear(hx), target_sentence_words_next)
        return loss
The printed output at the last step before the error:

49
target_k tensor([[ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        ...,
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.]], device='cuda:0')
target_sentence_words tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0], device='cuda:0')
hx tensor(1.00000e-02 *
       [[ 1.8093,  2.3050,  1.3506,  ..., -0.2192,  2.2486, -2.5922],
        [ 1.8108,  2.3046,  1.3504,  ..., -0.2191,  2.2489, -2.5926],
        [ 1.8108,  2.3046,  1.3504,  ..., -0.2191,  2.2489, -2.5926],
        ...,
        [ 1.8108,  2.3046,  1.3504,  ..., -0.2191,  2.2489, -2.5926],
        [ 1.8069,  2.3096,  1.3515,  ..., -0.2181,  2.2485, -2.5900],
        [ 1.8108,  2.3046,  1.3504,  ..., -0.2191,  2.2489, -2.5926]], device='cuda:0')
hx size torch.Size([50, 256])
cx tensor([[ 0.0342,  0.0479,  0.0270,  ..., -0.0043,  0.0449, -0.0510],
        [ 0.0342,  0.0479,  0.0270,  ..., -0.0043,  0.0449, -0.0510],
        [ 0.0342,  0.0479,  0.0270,  ..., -0.0043,  0.0449, -0.0510],
        ...,
        [ 0.0342,  0.0479,  0.0270,  ..., -0.0043,  0.0449, -0.0510],
        [ 0.0342,  0.0480,  0.0270,  ..., -0.0043,  0.0449, -0.0510],
        [ 0.0342,  0.0479,  0.0270,  ..., -0.0043,  0.0449, -0.0510]], device='cuda:0')
cx size torch.Size([50, 256])
target_sentence_words_next tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0], device='cuda:0')
---------------
Traceback (most recent call last):
  File "pytorch.py", line 109, in <module>
    loss.backward()
  File "/home/xx/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/tensor.py", line 93, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/xx/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/autograd/__init__.py", line 89, in backward
    allow_unreachable=True)  # allow_unreachable flag
  File "/home/xx/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/autograd/function.py", line 76, in apply
    return self._forward_cls.backward(self, *args)
  File "/home/xx/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/autograd/function.py", line 188, in wrapper
    outputs = fn(ctx, *args)
  File "/home/xx/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/nn/_functions/thnn/rnnFusedPointwise.py", line 86, in backward
    saved_tens, cx, cy = ctx.saved_tensors
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I’m not entirely sure what the mask is for. Is it because you only want to compute the loss over certain elements of your output sequence (i.e. not on the padding)?

If so, you can pass an argument ignore_index=0 to the call to F.cross_entropy(). That way elements with index 0 will not contribute to the loss. See the docs for details.
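
For example, a minimal sketch (the batch size of 50 matches your printout; the vocabulary size of 100 is made up for illustration):

import torch
import torch.nn.functional as F

logits = torch.randn(50, 100)                  # scores from self.linear(hx): (batch, vocab)
targets = torch.zeros(50, dtype=torch.long)    # a mostly-padding step, as in the printout above
targets[:3] = torch.tensor([5, 17, 42])        # a few real word ids

# Positions where the target equals 0 (the padding index) are skipped entirely,
# so they contribute neither to the loss value nor to the gradients.
loss = F.cross_entropy(logits, targets, ignore_index=0)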

You also seem to have put your entire training loop inside your model’s forward() function. I’d recommend making your model compute just the mapping from input sequence to output sequence, and taking everything else out.
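
A rough sketch of that split (purely illustrative: it assumes forward() is changed to return the per-step logits of shape (seq_len, batch, output_size) instead of the accumulated loss, and that model, optimizer and batches are defined elsewhere in the training script):

for input_lines, target_lines in batches:
    optimizer.zero_grad()
    logits = model(input_lines, target_lines[:-1])   # feed all but the last target word
    loss = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),         # flatten time and batch dimensions
        target_lines[1:].reshape(-1),                # next-word targets, shifted by one step
        ignore_index=0)                              # padding does not contribute to the loss
    loss.backward()
    optimizer.step()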

It worked

def create_mask(self, input_sentence_words):
        # Repeat the word ids across the hidden dimension so the mask has the
        # same (batch_size, hidden_size) shape as hx and cx (256 = hidden_size).
        return torch.cat([input_sentence_words.unsqueeze(-1)] * 256, 1)

def forward(self, input_lines, target_lines):
        global all_loss
        hx = torch.zeros(batch_size, self.hidden_size).cuda()
        cx = torch.zeros(batch_size, self.hidden_size).cuda()

        for input_sentence_words in input_lines:
            before_hx = hx
            before_cx = cx
            input_k = self.embed_input(input_sentence_words)
            hx, cx = self.lstm_input(input_k, (hx, cx))
            mask = self.create_mask(input_sentence_words)
            # For padded positions (word id 0) keep the previous state; torch.where
            # returns new tensors instead of writing into hx / cx in place.
            hx = torch.where(mask == 0, before_hx, hx)
            cx = torch.where(mask == 0, before_cx, cx)
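
For completeness: the original failure came from the in-place writes hx[indices] = before_hx[indices] and cx[indices] = before_cx[indices]. LSTMCell saves its outputs for the backward pass, and writing into them in place is exactly the "modified by an inplace operation" error in the traceback. torch.where builds new tensors, so nothing autograd has saved is touched. (The hard-coded 256 in create_mask is just the hidden size, so self.hidden_size would work there as well.)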