RuntimeError: in-place operation error with softmax

I am writing YOLO v2 code, and I want a softmax to be the final activation for the classification part of the output.
But when I use the softmax as the last layer, the error below occurs, and I can't figure out what is wrong.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [11, 13, 13, 1, 2]], which is output 0 of SliceBackward, is at version 3; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
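
Following the hint at the end of the traceback, this is roughly how I would turn on anomaly detection before training (a minimal sketch; net, criterion, optimizer, loader, anchors, B, and C are placeholder names from my training script):

import torch

torch.autograd.set_detect_anomaly(True)  # ask autograd to report the operation that wrote in place

for images, targets in loader:           # placeholder training loop
    optimizer.zero_grad()
    preds = net(images.cuda(), anchors, B, C)
    loss = criterion(preds, targets)
    loss.backward()                      # the RuntimeError above is raised here
    optimizer.step()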

Here is the model code:

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class yolov2(nn.Module):
    def __init__(self, b, c):
        super(yolov2, self).__init__()
        self.b = b
        self.c = c

        self.first_net = nn.Sequential(
            # conv1
            # 3 @ 416 * 416
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            # 32 @ 416 * 416
            nn.BatchNorm2d(32),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 32 @ 208 * 208

            # conv2
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 64 @ 208 * 208
            nn.BatchNorm2d(64),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 64 @ 104 * 104

            # conv3
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            # 128 @ 104 * 104
            nn.BatchNorm2d(128),
            nn.LeakyReLU(),

            # conv4
            nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0),
            # 64 @ 104 * 104
            nn.BatchNorm2d(64),
            nn.LeakyReLU(),

            # conv5
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            # 128 @ 104 * 104
            nn.BatchNorm2d(128),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 128 @ 52 * 52

            # conv6
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            # 256 @ 52*52
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),

            # conv7
            nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0),
            # 128 @ 52*52
            nn.BatchNorm2d(128),
            nn.LeakyReLU(),

            # conv8
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            # 256 @ 52*52
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 256 @ 26*26

            # conv9
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            # 512 @ 26*26
            nn.BatchNorm2d(512),
            nn.LeakyReLU(),

            # conv10
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0),
            # 256 @ 26*26
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),

            # conv11
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            # 512 @ 26*26
            nn.BatchNorm2d(512),
            nn.LeakyReLU(),

            # conv12
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0),
            # 256 @ 26*26
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),

            # conv13
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            # 512 @ 26*26
            nn.BatchNorm2d(512),
            nn.LeakyReLU()
        )
        self.second_net = nn.Sequential(
            # 512 @ 26 * 26
            # conv14
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 26 * 26
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            # conv15
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0),
            # 512 @ 26 * 26
            nn.BatchNorm2d(512),
            nn.LeakyReLU(),

            # conv16
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 26 * 26
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 1024 @ 13 * 13

            # conv17
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0),
            # 512 @ 13 * 13
            nn.BatchNorm2d(512),
            nn.LeakyReLU(),

            # conv18
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 13 * 13
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            # conv19
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 13 * 13
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            # conv20
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 13 * 13
            nn.BatchNorm2d(1024),
            nn.LeakyReLU()
        )
        self.third_net = nn.Sequential(
            # 3072 @ 13 * 13
            # conv21
            nn.Conv2d(in_channels=3072, out_channels=1024, kernel_size=3, stride=1, padding=1),
            # 1024 @ 13 * 13
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),

            # conv22
            nn.Conv2d(in_channels=1024, out_channels=(5 + self.c) * self.b, kernel_size=1, stride=1, padding=0),
            # (5 + c) * b @ 13 * 13  (125 @ 13 * 13 for b=5, c=20)
            nn.LeakyReLU()
        )

        self.first_net.cuda()
        self.second_net.cuda()
        self.third_net.cuda()

        self._initialize_weights()

    def pass_layer(self,x):
        # 512 @ 26 * 26 -> 2048 @ 13 * 13
        stride = 2
        batch_size, channels, height, width = x.size() # 32, 512, 26, 26
        new_ht = int(height/stride) # 13
        new_wd = int(width/stride) # 13
        new_channels = channels * stride * stride # 2048

        passthrough = x.permute(0, 2, 3, 1) # 32 * 26 * 26 * 512
        passthrough = passthrough.contiguous().view(-1, new_ht, stride, new_wd, stride, channels) # 32 * 13 * 2 * 13 * 2 * 512
        passthrough = passthrough.permute(0, 1, 3, 2, 4, 5) # 32 * 13 * 13 * 2 * 2 * 512
        passthrough = passthrough.contiguous().view(-1, new_ht, new_wd, new_channels) # 32 * 13 * 13 * 2048
        passthrough = passthrough.permute(0, 3, 1, 2) # 32 * 2048 * 13 * 13

        return passthrough

    def forward(self, x, anchor_box, b, c):
        # print('input network size')
        # print(np.shape(x))
        output = self.first_net(x)
        # 512 @ 26 * 26

        passthrough = self.pass_layer(output)
        # 2048 @ 13 * 13
        output = self.second_net(output)
        # 1024 @ 13 * 13
        output = torch.cat([passthrough, output],1)
        # 3072 @ 13 * 13
        output = self.third_net(output)
        # 125 @ 13 * 13
        output = output.permute(0, 2, 3, 1) # [ Batch_size, 13, 13, 125]
        # batch * 13 * 13 * 125
        output = output.view(output.size()[0],output.size()[1],output.size()[2],self.b,5+self.c)
        # batch * 13 * 13 * 5 * 25
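        # activations are applied to slices of the prediction: sigmoid on [..., 2:4] and [..., 4],
        # and softmax on [..., 5:] (the class scores)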
        # output = F.sigmoid(output)
        output[:, :, :, :, 2:4] = F.sigmoid(output[:, :, :, :, 2:4])
        output[:, :, :, :, 4] = F.sigmoid(output[:, :, :, :, 4])
        output[:, :, :, :, 5:] = nn.functional.softmax(output[:, :, :, :, 5:])
        # output[:,:,:,:,5:] = F.sigmoid(output[:,:,:,:,5:])

        output = output.view(output.size()[0], output.size()[1], output.size()[2],-1)

        return output

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
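
If it helps, the direction I was thinking of is to build the activated prediction without assigning into slices of output, roughly like this (an untested sketch; raw, offs, conf, and cls are just my own names, and I am not sure this is the right fix):

raw  = output[..., 0:2]                        # left untouched, as in the code above
offs = torch.sigmoid(output[..., 2:4])
conf = torch.sigmoid(output[..., 4:5])
cls  = F.softmax(output[..., 5:], dim=-1)      # softmax over the class scores
output = torch.cat([raw, offs, conf, cls], dim=-1)

But I would still like to understand why the original slice assignments break the backward pass.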