Hello,
I am working on a CNN model for the first time, to estimate per-pixel pose. This is my model:
import torch
import torch.nn as nn
from torchvision import models

class PoseNet(nn.Module):
    def __init__(self, nb_ref_imgs=4, output_exp=True, output_pixel_pose=True, output_disp=True, alpha=10, beta=0.01):
        super(PoseNet, self).__init__()
        self.nb_ref_imgs = nb_ref_imgs
        self.output_exp = output_exp
        self.output_pixel_pose = output_pixel_pose
        self.output_disp = output_disp
        self.alpha = alpha
        self.beta = beta

        # Frozen ResNet50 encoder, kept up to layer2
        resnet50 = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        resnet50.to(self.device)
        for param in resnet50.parameters():
            param.requires_grad = False
        self.res_encoder = nn.Sequential(
            resnet50.conv1,
            resnet50.bn1,
            resnet50.relu,
            resnet50.maxpool,
            resnet50.layer1,
            resnet50.layer2
        )

        # upconv(in_planes, out_planes) is my custom upsampling block (defined elsewhere)
        upconv_planes = [512, 256, 128, 64, 32, 16]
        # upconv_planes = [512, 512, 256, 128, 64, 3]
        self.upconvP4 = upconv(upconv_planes[0], upconv_planes[1])
        self.upconvP3 = upconv(upconv_planes[1], upconv_planes[2])
        self.upconvP2 = upconv(upconv_planes[2], upconv_planes[3])
        self.upconvP1 = upconv(upconv_planes[3], upconv_planes[4])
        self.upconvP0 = upconv(upconv_planes[4], upconv_planes[5])

        if self.output_exp:
            self.predict_mask4 = nn.Conv2d(upconv_planes[5], self.nb_ref_imgs, kernel_size=3, padding=1)
            self.predict_mask3 = nn.Conv2d(upconv_planes[4], self.nb_ref_imgs, kernel_size=3, padding=1)
            self.predict_mask2 = nn.Conv2d(upconv_planes[3], self.nb_ref_imgs, kernel_size=3, padding=1)
            self.predict_mask1 = nn.Conv2d(upconv_planes[2], self.nb_ref_imgs, kernel_size=3, padding=1)
        if self.output_pixel_pose:
            self.predict_pose4 = nn.Conv2d(upconv_planes[5], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
            self.predict_pose3 = nn.Conv2d(upconv_planes[4], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
            self.predict_pose2 = nn.Conv2d(upconv_planes[3], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
            self.predict_pose1 = nn.Conv2d(upconv_planes[2], self.nb_ref_imgs * 6, kernel_size=3, padding=1)
        if self.output_disp:
            self.predict_disp4 = nn.Conv2d(upconv_planes[5], 1, kernel_size=3, padding=1)
            self.predict_disp3 = nn.Conv2d(upconv_planes[4], 1, kernel_size=3, padding=1)
            self.predict_disp2 = nn.Conv2d(upconv_planes[3], 1, kernel_size=3, padding=1)
            self.predict_disp1 = nn.Conv2d(upconv_planes[2], 1, kernel_size=3, padding=1)

        self.pose_pred = nn.Conv2d(upconv_planes[5], 6 * self.nb_ref_imgs, kernel_size=1, padding=0)
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    m.bias.data.zero_()
    def forward(self, target_image, ref_imgs):
        input = torch.cat([target_image] + ref_imgs, 1)
        # print("input 1 {}".format(input.shape))
        # 1x1 conv reducing the concatenated (3 + nb_ref_imgs*3 = 15) channels to the 3 channels the ResNet encoder expects
        reduc_layer = nn.Conv2d(in_channels=15, out_channels=3, kernel_size=1).to(self.device)
        input = reduc_layer(input)
        # print("input 2 {}".format(input.shape))

        with torch.no_grad():
            conv1 = self.res_encoder[0](input)
            bn1 = self.res_encoder[1](conv1)
            relu = self.res_encoder[2](bn1)
            maxpool = self.res_encoder[3](relu)
            layer1 = self.res_encoder[4](maxpool)
            layer2 = self.res_encoder[5](layer1)

        up4 = self.upconvP4(layer2)
        up3 = self.upconvP3(up4)
        up2 = self.upconvP2(up3)
        up1 = self.upconvP1(up2)
        up0 = self.upconvP0(up1)
        # print("self.nb_ref_imgs {}".format(self.nb_ref_imgs))
        # print("up0 : {}".format(up0.shape))

        # Global pose: average the dense prediction over the spatial dimensions
        poses = self.pose_pred(up0)
        # print("poses 1 : {}".format(poses.size(0)))
        # print("poses 1 : {}".format(poses.size(1)))
        poses = poses.mean(3).mean(2)
        # print("poses 2 : {}".format(poses.size(0)))
        # print("poses 2 : {}".format(poses.size(1)))
        poses = 0.01 * poses.view(poses.size(0), self.nb_ref_imgs, 6)
        # print("poses 3 : {}".format(poses.size(0)))
        # print("poses 3 : {}".format(poses.size(1)))

        exp_masks = None
        # print("up0 {}".format(up0.shape))
        # print("up1 {}".format(up1.shape))
        # print("up2 {}".format(up2.shape))
        # print("up3 {}".format(up3.shape))
        # print("up4 {}".format(up4.shape))
        if self.output_exp:
            mask4 = torch.sigmoid(self.predict_mask4(up0))
            # print("mask4 {}".format(mask4.shape))
            mask3 = torch.sigmoid(self.predict_mask3(up1))
            mask2 = torch.sigmoid(self.predict_mask2(up2))
            mask1 = torch.sigmoid(self.predict_mask1(up3))
            exp_masks = [mask1, mask2, mask3, mask4]

        exp2_mask4 = None
        exp2_mask3 = None
        exp2_mask2 = None
        exp2_mask1 = None

        pixel_poses = None
        if self.output_pixel_pose:
            pose_tmp = poses.view(poses.size(0), -1, 1, 1)
            pixel_pose4 = 0.01 * self.predict_pose4(up0) + pose_tmp
            pixel_pose3 = 0.01 * self.predict_pose3(up1) + pose_tmp
            pixel_pose2 = 0.01 * self.predict_pose2(up2) + pose_tmp
            pixel_pose1 = 0.01 * self.predict_pose1(up3) + pose_tmp
            pixel_poses = [pixel_pose1, pixel_pose2, pixel_pose3, pixel_pose4]

        disps = None
        if self.output_disp:
            disp4 = self.alpha * torch.sigmoid(self.predict_disp4(up0)) + self.beta
            disp3 = self.alpha * torch.sigmoid(self.predict_disp3(up1)) + self.beta
            disp2 = self.alpha * torch.sigmoid(self.predict_disp2(up2)) + self.beta
            disp1 = self.alpha * torch.sigmoid(self.predict_disp1(up3)) + self.beta
            disps = [disp1, disp2, disp3, disp4]

        # if self.training:
        #     # print("Pose train")
        #     print("exp masks {}".format(len(exp_masks)))
        #     print('pose 0 {}'.format(poses.size(0)))
        #     print('pose 1 {}'.format(poses.size(1)))
        #     return exp_masks, poses
        # else:
        #     return mask1, exp2_mask1, pixel_pose1, disp1, poses
        return exp_masks, poses
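This is how I currently sanity-check the output shapes (a minimal snippet; the batch size and resolution are just placeholders, and it assumes the upconv block used above is defined):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = PoseNet(nb_ref_imgs=4).to(device)  # nb_ref_imgs=4 matches the hard-coded in_channels=15 (3 + 4*3)
target = torch.randn(2, 3, 128, 416, device=device)                    # B x 3 x H x W
refs = [torch.randn(2, 3, 128, 416, device=device) for _ in range(4)]  # nb_ref_imgs reference views
exp_masks, poses = model(target, refs)
print(poses.shape)                   # torch.Size([2, 4, 6]), one 6-DoF pose per reference image
print([m.shape for m in exp_masks])  # per-scale explainability masks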
I am also using this function to convert tensors to arrays for visualization:

import numpy as np

def tensor2array(tensor, max_value=255, colormap='rainbow'):
    if max_value is None:
        max_value = tensor.max().item()
    # print("tensor size utils : {}".format(tensor.ndimension()))
    if tensor.ndimension() == 2 or tensor.size(0) == 1:
        try:
            import cv2
            if cv2.__version__.startswith('3'):
                color_cvt = cv2.COLOR_BGR2RGB
            else:  # 2.4
                # color_cvt = cv2.cv.CV_BGR2RGB
                color_cvt = cv2.COLOR_BGR2RGB
            if colormap == 'rainbow':
                colormap = cv2.COLORMAP_RAINBOW
            elif colormap == 'bone':
                colormap = cv2.COLORMAP_BONE
            array = (tensor.squeeze().numpy() * 255. / max_value).clip(0, 255).astype(np.uint8)
            colored_array = cv2.applyColorMap(array, colormap)
            array = cv2.cvtColor(colored_array, color_cvt).astype(np.float32) / 255
        except ImportError:
            if tensor.ndimension() == 2:
                tensor.unsqueeze_(2)
            array = (tensor.expand(tensor.size(0), tensor.size(1), 3).numpy() / max_value).clip(0, 1)
    elif tensor.ndimension() == 3:
        print("print tensor size 0 {}".format(tensor.size(0)))
        if tensor.size(0) != 3:
            tensor = tensor.unsqueeze(0).expand(3, -1, -1)
        assert(tensor.size(0) == 3)
        array = 0.5 + tensor.numpy().transpose(1, 2, 0) * 0.5
    # for tensorboardX 1.4
    array = array.transpose(2, 0, 1)
    return array
where I get a size mismatch error:
RuntimeError: The expanded size of the tensor (176) must match the existing size (346) at non-singleton dimension 3. Target sizes: [2, 3, 132, 176]. Tensor sizes: [2, 1, 260, 346]
However, when I fixed that by expanding the tensor, I got similar errors at several other points in my training process.
Is there a general solution or suggestion for working through this type of size-mismatch error?
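The workaround I have been attempting looks roughly like this: resize one tensor to the other's spatial size with F.interpolate before any element-wise operation or visualization (a minimal sketch; pred and target are placeholder names, not my actual variables):

import torch
import torch.nn.functional as F

pred = torch.randn(2, 1, 260, 346)    # e.g. a predicted disparity map
target = torch.randn(2, 3, 132, 176)  # e.g. the tensor it must be compared against
pred_resized = F.interpolate(pred, size=target.shape[-2:], mode='bilinear', align_corners=False)
print(pred_resized.shape)             # torch.Size([2, 1, 132, 176])

But I am not sure whether resizing like this is the right fix, or whether the decoder should be producing matching sizes in the first place.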
Also, it would be great if someone could review my model for a segmentation task, as I am following a U-Net architecture with a ResNet50 encoder.
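For clarity, this is roughly the encoder-decoder wiring I have in mind for the segmentation variant (a minimal sketch, not my actual code; it assumes the input height and width are divisible by 8 and uses made-up layer names):

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class UNetResNet50(nn.Module):
    """Sketch of a U-Net-style decoder on top of a ResNet50 encoder."""
    def __init__(self, n_classes=1):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.stem = nn.Sequential(backbone.conv1, backbone.bn1, backbone.relu)  # /2,  64 ch
        self.pool = backbone.maxpool                                            # /4
        self.layer1 = backbone.layer1                                           # /4, 256 ch
        self.layer2 = backbone.layer2                                           # /8, 512 ch
        self.up1 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)        # /8 -> /4
        self.dec1 = nn.Conv2d(256 + 256, 256, kernel_size=3, padding=1)         # skip from layer1
        self.up2 = nn.ConvTranspose2d(256, 64, kernel_size=2, stride=2)         # /4 -> /2
        self.dec2 = nn.Conv2d(64 + 64, 64, kernel_size=3, padding=1)            # skip from stem
        self.head = nn.Conv2d(64, n_classes, kernel_size=3, padding=1)

    def forward(self, x):
        s1 = self.stem(x)                # B x 64  x H/2 x W/2
        s2 = self.layer1(self.pool(s1))  # B x 256 x H/4 x W/4
        s3 = self.layer2(s2)             # B x 512 x H/8 x W/8
        d1 = torch.relu(self.dec1(torch.cat([self.up1(s3), s2], dim=1)))
        d2 = torch.relu(self.dec2(torch.cat([self.up2(d1), s1], dim=1)))
        out = self.head(d2)              # B x n_classes x H/2 x W/2
        return F.interpolate(out, size=x.shape[-2:], mode='bilinear', align_corners=False)

Any feedback on whether the skip connections and decoder depth make sense would be appreciated.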