Custom Loss Function Gradient Vector nan

I am writing a simplified version of the YOLO v1 object detection model for face detection. The input for this model is an n x 3 x 300 x 300 tensor of RGB images. The output is a n x 5 x 7 x 7 tensor, where the 5 channels represent x, y, w, h, and prob(center of a face) for 49 different regions. I want the yolo model to produce tensors of the same shape, with the same types of channels.

The simplified yolo module is defined as the following:

class YOLO(nn.Module):
  def __init__(self):
    super(YOLO, self).__init__()
    self.mp = nn.MaxPool2d(kernel_size=2, stride=2)
    self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
    self.conv2 = nn.Conv2d(64, 192, kernel_size=3, padding=1)
    self.conv3 = nn.Conv2d(192, 256, kernel_size=3, padding=1)
    self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
    self.conv5 = nn.Conv2d(512, 1024, kernel_size=3, padding=1)
    self.conv6 = nn.Conv2d(1024, 1024, kernel_size=3, padding=1)
    self.conv7 = nn.Conv2d(1024, 1024, kernel_size=3, padding=1)
    self.conv8 = nn.Conv2d(1024, 1024, kernel_size=3, padding=1)
    self.conv9 = nn.Conv2d(1024, 1024, kernel_size=3, padding=1)
    self.conv10 = nn.Conv2d(1024, 1024, kernel_size=3)

    self.relu_downsize = nn.Sequential(
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.LeakyReLU(.01, inplace=False)
    )
    
    self.linear_layers = nn.Sequential(
        nn.Flatten(),
        nn.Linear(7 * 7 * 1024,  4096),
        nn.LeakyReLU(.01, inplace=False),
        nn.Linear(4096, 7 * 7 * 5),
        nn.LeakyReLU(.01, inplace=False)
    )

  def forward(self, x):

    x = self.relu_downsize(self.conv1(x))
    x = self.relu_downsize(self.conv2(x))
    x = self.relu_downsize(self.conv3(x))
    x = F.leaky_relu(self.conv4(x), .01)
    x = self.relu_downsize(self.conv5(x))
    x = F.leaky_relu(self.conv6(x), .01)
    x = self.relu_downsize(self.conv7(x))
    x = F.leaky_relu(self.conv8(x))
    x = F.leaky_relu(self.conv9(x))
    x = F.leaky_relu(self.conv10(x))
    x = torch.sigmoid(self.linear_layers(x))
    x = x.view(-1, 5, 7, 7)
    return x

And the simplified yolo loss as the following:

class YoloLoss(nn.Module):
  def __init__(self):
    super(YoloLoss, self).__init__()
    self.lmbda_coord = torch.autograd.Variable(torch.tensor(.5), requires_grad=True)
    self.lmbda_noobj = torch.autograd.Variable(torch.tensor(.5), requires_grad=True)

  def forward(self, yhat, y):

    cp_dim = (1, 5, 1, 1)

    region_yhat = yhat * torch.tile(y[:, 4, :, :].unsqueeze(1), cp_dim)
    nonregion_yhat = (1 - yhat) * torch.tile((1 - y[:, 4, :, :].unsqueeze(1)), cp_dim)

    err_loc = F.mse_loss(region_yhat[:, :2, :, :], y[:, :2, :, :])
    err_size = F.mse_loss(torch.sqrt(region_yhat[:, 2:4, :, :]), torch.sqrt(y[:, 2:4, :, :]))
    err_inclass = F.mse_loss(region_yhat[:, 4, :, :], y[:, 4, :, :])
    err_outclass = F.mse_loss(nonregion_yhat[:, 4, :, :], 1 - y[:, 4, :, :])

    return self.lmbda_coord * (err_loc + err_size) + err_inclass + self.lmbda_noobj + err_outclass

The problem is that the gradient vector seems not to be calculated correctly. During training, the first loss is correctly calculated, but after the first gradient descent step, the model only returns nans. The following code block produces the error. Keep in mind that the get_images_and_boxes function returns a tuple of tensor images (input to a model) and tensor bounding boxes (actual output to predict).

Code:

mod = YOLO()
loss_f = YoloLoss()
optimizer = torch.optim.Adam(params=mod.parameters(), lr=1e-3)

x1, y1 = get_images_and_boxes(range(4))
yhat1 = mod(x1)
print(x1.shape, y1.shape, yhat1.shape)

loss = loss_f(yhat1, y1)
print(loss)

loss.backward()
optimizer.step()

x2, y2 = get_images_and_boxes(range(4))
yhat2 = mod(x2)
print(x2.shape, y2.shape, yhat2.shape)
print(yhat2)

Output:

torch.Size([4, 3, 300, 300]) torch.Size([4, 5, 7, 7]) torch.Size([4, 5, 7, 7])
tensor(0.7533, grad_fn=<AddBackward0>)
torch.Size([4, 3, 300, 300]) torch.Size([4, 5, 7, 7]) torch.Size([4, 5, 7, 7])
tensor([[[[nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan],
          [nan, nan, nan, nan, nan, nan, nan]],
          ...

I am not sure why backpropagation is not working on my custom loss function.