Tensor size mismatch error

Hello,

I am working on a CNN model for the first time, to estimate per-pixel pose. This is my model:

class PoseNet(nn.Module):
def __init__(self, nb_ref_imgs=4, output_exp=True, output_pixel_pose=True, output_disp=True, alpha=10, beta=0.01):
super(PoseNet, self).__init__()

    self.nb_ref_imgs = nb_ref_imgs
    self.output_exp = output_exp
    self.output_pixel_pose = output_pixel_pose
    self.output_disp = output_disp   
    self.alpha = alpha
    self.beta = beta   

    resnet50 = models.resnet50(weights='ResNet50_Weights.DEFAULT')
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    resnet50.to(self.device)

    for param in resnet50.parameters():
        param.requires_grad = False

    self.res_encoder = nn.Sequential(
        resnet50.conv1,
        resnet50.bn1,
        resnet50.relu,
        resnet50.maxpool,
        resnet50.layer1,
        resnet50.layer2
    )

    

    upconv_planes = [512, 256, 128, 64, 32, 16]
    # upconv_planes = [512, 512, 256, 128, 64, 3]

    self.upconvP4 = upconv(upconv_planes[0], upconv_planes[1])
    self.upconvP3 = upconv(upconv_planes[1], upconv_planes[2])
    self.upconvP2 = upconv(upconv_planes[2], upconv_planes[3])
    self.upconvP1 = upconv(upconv_planes[3], upconv_planes[4])
    self.upconvP0 = upconv(upconv_planes[4], upconv_planes[5])

    if self.output_exp:
        self.predict_mask4 = nn.Conv2d(upconv_planes[5], self.nb_ref_imgs, kernel_size=3, padding=1)
        self.predict_mask3 = nn.Conv2d(upconv_planes[4], self.nb_ref_imgs, kernel_size=3, padding=1)
        self.predict_mask2 = nn.Conv2d(upconv_planes[3], self.nb_ref_imgs, kernel_size=3, padding=1)
        self.predict_mask1 = nn.Conv2d(upconv_planes[2], self.nb_ref_imgs, kernel_size=3, padding=1)

    if self.output_pixel_pose:
        self.predict_pose4 = nn.Conv2d(upconv_planes[5], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
        self.predict_pose3 = nn.Conv2d(upconv_planes[4], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
        self.predict_pose2 = nn.Conv2d(upconv_planes[3], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
        self.predict_pose1 = nn.Conv2d(upconv_planes[2], self.nb_ref_imgs * 6, kernel_size=3, padding=1)

    if self.output_disp:
        self.predict_disp4 = nn.Conv2d(upconv_planes[5], 1, kernel_size=3, padding=1)
        self.predict_disp3 = nn.Conv2d(upconv_planes[4], 1, kernel_size=3, padding=1)
        self.predict_disp2 = nn.Conv2d(upconv_planes[3], 1, kernel_size=3, padding=1)
        self.predict_disp1 = nn.Conv2d(upconv_planes[2], 1, kernel_size=3, padding=1)

    self.pose_pred = nn.Conv2d(upconv_planes[5], 6 * self.nb_ref_imgs, kernel_size=1, padding=0)

def init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                m.bias.data.zero_()

def forward(self, target_image, ref_imgs):
    input = torch.cat([target_image] + ref_imgs, 1)
    # print("input 1 {}".format(input.shape))

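    # 1x1 conv (created on every forward call) to map the concatenated 15-channel input
    # (target + 4 reference images, 3 channels each) down to the 3 channels the ResNet encoder expects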
    reduc_layer = nn.Conv2d(in_channels=15, out_channels=3, kernel_size=1).to(self.device)
    input = reduc_layer(input)
    # print("input 2 {}".format(input.shape))

    with torch.no_grad():
        conv1 = self.res_encoder[0](input)
        bn1 = self.res_encoder[1](conv1)
        relu = self.res_encoder[2](bn1)
        maxpool = self.res_encoder[3](relu)
        layer1 = self.res_encoder[4](maxpool)
        layer2 = self.res_encoder[5](layer1)

    up4 = self.upconvP4(layer2)
    up3 = self.upconvP3(up4)
    up2 = self.upconvP2(up3)
    up1 = self.upconvP1(up2)
    up0 = self.upconvP0(up1)

    # print("self.nb_ref_imgs {}".format(self.nb_ref_imgs))
    # print("up0 : {}".format(up0.shape))
    poses = self.pose_pred(up0)
    # print("poses 1 : {}".format(poses.size(0)))
    # print("poses 1 : {}".format(poses.size(1)))
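    # average over the spatial dimensions to get one global 6-DoF pose per reference image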
    poses = poses.mean(3).mean(2)
    # print("poses 2 : {}".format(poses.size(0)))
    # print("poses 2 : {}".format(poses.size(1)))
    poses = 0.01 * poses.view(poses.size(0), self.nb_ref_imgs, 6)
    # print("poses 3 : {}".format(poses.size(0)))
    # print("poses 3 : {}".format(poses.size(1)))

    exp_masks = None
    # print("up0 {}".format(up0.shape))
    # print("up1 {}".format(up1.shape))
    # print("up2 {}".format(up2.shape))
    # print("up3 {}".format(up3.shape))
    # print("up4 {}".format(up4.shape))
    if self.output_exp:
        mask4 = torch.sigmoid(self.predict_mask4(up0))
        # print("mask4 {}".format(mask4.shape))
        mask3 = torch.sigmoid(self.predict_mask3(up1))
        mask2 = torch.sigmoid(self.predict_mask2(up2))
        mask1 = torch.sigmoid(self.predict_mask1(up3))
        exp_masks = [mask1, mask2, mask3, mask4]

    exp2_mask4 = None
    exp2_mask3 = None
    exp2_mask2 = None
    exp2_mask1 = None

    pixel_poses = None
    if self.output_pixel_pose:

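        # reshape the global poses to (B, nb_ref_imgs * 6, 1, 1) so they broadcast over the per-pixel pose maps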
        pose_tmp = poses.view(poses.size(0), -1, 1, 1)

        pixel_pose4 = 0.01 * self.predict_pose4(up0) + pose_tmp
        pixel_pose3 = 0.01 * self.predict_pose3(up1) + pose_tmp
        pixel_pose2 = 0.01 * self.predict_pose2(up2) + pose_tmp
        pixel_pose1 = 0.01 * self.predict_pose1(up3) + pose_tmp
        pixel_poses = [pixel_pose1, pixel_pose2, pixel_pose3, pixel_pose4]

    disps = None
    if self.output_disp:
        disp4 = self.alpha * torch.sigmoid(self.predict_disp4(up0)) + self.beta
        disp3 = self.alpha * torch.sigmoid(self.predict_disp3(up1)) + self.beta
        disp2 = self.alpha * torch.sigmoid(self.predict_disp2(up2)) + self.beta
        disp1 = self.alpha * torch.sigmoid(self.predict_disp1(up3)) + self.beta
        disps = [disp1, disp2, disp3, disp4]

    # if self.training:
    #     # print("Pose train")
    # print("exp masks {}".format(len(exp_masks)))
    # print('pose 0 {}'.format(poses.size(0)))
    # print('pose 1 {}'.format(poses.size(1)))
    #     return exp_masks, poses
    # else:
    #     return mask1, exp2_mask1, pixel_pose1, disp1, poses
    return exp_masks, poses
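
upconv above is a small helper that doubles the spatial resolution; its definition is omitted here, but as a rough sketch (the exact block may differ) it looks something like:

def upconv(in_planes, out_planes):
    # 2x spatial upsampling: transposed convolution followed by ReLU
    return nn.Sequential(
        nn.ConvTranspose2d(in_planes, out_planes, kernel_size=3, stride=2, padding=1, output_padding=1),
        nn.ReLU(inplace=True)
    )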

I am using a function:
def tensor2array(tensor, max_value=255, colormap='rainbow'):
if max_value is None:
max_value = tensor.max().item()

#print("tensor size utils : {}".format(tensor.ndimension()))
if tensor.ndimension() == 2 or tensor.size(0) == 1:
    try:
        import cv2
        if cv2.__version__.startswith('3'):
            color_cvt = cv2.COLOR_BGR2RGB
        else:  # 2.4
            #color_cvt = cv2.cv.CV_BGR2RGB
            color_cvt = cv2.COLOR_BGR2RGB
        if colormap == 'rainbow':
            colormap = cv2.COLORMAP_RAINBOW
        elif colormap == 'bone':
            colormap = cv2.COLORMAP_BONE
        array = (tensor.squeeze().numpy()*255./max_value).clip(0, 255).astype(np.uint8)
        colored_array = cv2.applyColorMap(array, colormap)
        array = cv2.cvtColor(colored_array, color_cvt).astype(np.float32)/255
    except ImportError:
        if tensor.ndimension() == 2:
            tensor.unsqueeze_(2)
        array = (tensor.expand(tensor.size(0), tensor.size(1), 3).numpy()/max_value).clip(0,1)

elif tensor.ndimension() == 3:
    
    print("print tensor size 0 {}".format(tensor.size(0)))
    if tensor.size(0) != 3:
        tensor = tensor.unsqueeze(0).expand(3, -1, -1)
    assert(tensor.size(0) == 3)
    array = 0.5 + tensor.numpy().transpose(1, 2, 0)*0.5

#for tensorboardx 1.4
array=array.transpose(2,0,1)

return array
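
For reference, this function expects a single 2-D map or a (C, H, W) tensor without a batch dimension; an illustrative call (sizes are arbitrary) looks like:

disp = torch.rand(128, 416)               # a single 2-D map
img = tensor2array(disp, max_value=None)  # returns a (3, 128, 416) float array in [0, 1]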

In this function, I get a size mismatch error:
RuntimeError: The expanded size of the tensor (176) must match the existing size (346) at non-singleton dimension 3. Target sizes: [2, 3, 132, 176]. Tensor sizes: [2, 1, 260, 346]
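
To illustrate, the failing expand boils down to something like this (sizes taken from the error message; whether resizing first is actually the right fix depends on where these tensors come from):

import torch
import torch.nn.functional as F

a = torch.randn(2, 1, 260, 346)
# expand() only broadcasts singleton dimensions, so 260 -> 132 and 346 -> 176 are not allowed:
# a.expand(2, 3, 132, 176)   # raises the RuntimeError above
# resizing the spatial dimensions first, then expanding the singleton channel dim, does work:
b = F.interpolate(a, size=(132, 176), mode='bilinear', align_corners=False).expand(2, 3, 132, 176)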

However, when I worked around that by expanding the tensor, I got similar errors at several other points in my training process.

Are there any suggestions for how to work through this type of size mismatch error?

Also, it would be great if someone could review my model for a segmentation task, as I am following a U-Net architecture with ResNet50 as the encoder network.