Mysterious 4th dimension is added to the Colour Channel


I am trying to write some Class Activation Maps to tensorboard.
To obtain the CAMs, I am basing my code on Utkuozbulak’s ScoreCam implementation.

I believe that it is producing the desired results; however, I am having problems writing them to TensorBoard.

I get the following error:

AssertionError                            Traceback (most recent call last)
<ipython-input-24-f63c18ac42f9> in <module>
      9 custom_model.train(loss_func = criterion, optimizer = optimizer, lr_scheduler = lr_scheduler, 
     10            learning_rate = learning_rate, epochs = 10, trainloader = train_loader,
---> 11            valloader = val_loader, eval_period = 2)

<ipython-input-3-eb7a0260a28f> in train(self, loss_func, optimizer, lr_scheduler, learning_rate, epochs, trainloader, valloader, eval_period)
    262                             all_layers_maps = np.hstack(all_layers_maps)
--> 264                             self.writer.add_image("Class Activation Maps, Layers 1-4", torch.from_numpy(all_layers_maps), global_step = epoch)

~/miniconda3/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py in add_image(self, tag, img_tensor, global_step, walltime, dataformats)
    538             img_tensor = workspace.FetchBlob(img_tensor)
    539         self._get_file_writer().add_summary(
--> 540             image(tag, img_tensor, dataformats=dataformats), global_step, walltime)
    542     def add_images(self, tag, img_tensor, global_step=None, walltime=None, dataformats='NCHW'):

~/miniconda3/lib/python3.6/site-packages/torch/utils/tensorboard/summary.py in image(tag, tensor, rescale, dataformats)
    300     """
    301     tensor = make_np(tensor)
--> 302     tensor = convert_to_HWC(tensor, dataformats)
    303     # Do not assume that user passes in values in [0, 255], use data type to detect
    304     scale_factor = _calc_scale_factor(tensor)

~/miniconda3/lib/python3.6/site-packages/torch/utils/tensorboard/_utils.py in convert_to_HWC(tensor, input_format)
     99         input_format: {}".format(input_format)
    100     assert(len(tensor.shape) == len(input_format)), "size of input tensor and input format are different. \
--> 101         tensor shape: {}, input_format: {}".format(tensor.shape, input_format)
    102     input_format = input_format.upper()

AssertionError: size of input tensor and input format are different.         tensor shape: (4, 1200, 300, 4), input_format: CHW

There are 4 images of 300x300, stacked horizontally (hence 1200), but I am not entirely certain where the 4th channel in the last dimension came from (unless that is correct and I simply have to permute the axes to obtain the desired input format?).

The only other idea that comes to mind is that it could potentially have been added at:

image = torch.unsqueeze(image,0).cuda()

I had to add the batch dimension because I was previously getting an error.

The Score-CAM class looks as follows:

import os
import copy
import numpy as np
from PIL import Image, ImageFilter
import matplotlib.cm as mpl_color_map

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CamExtractor():
    """Run a model up to a named layer and score that layer's output.

    The model's direct children are applied in registration order until the
    child named ``target_layer`` has run. Its output is then passed through
    ``model.avgpool``, flattened, and fed to a linear classification head.
    """

    def __init__(self, model, target_layer, device, num_classes=65):
        """Store the model and the name of the layer to tap.

        Args:
            model: Module whose direct children are applied sequentially; it
                must expose an ``avgpool`` attribute.
            target_layer: Name of the direct child whose output is captured.
            device: Device the model is moved to; ``None`` leaves it in place.
            num_classes: Output width of any linear head this extractor has to
                create itself.
        """
        # NOTE(review): the right-hand side of this assignment was lost in the
        # posted source; moving the model to `device` is the presumed intent.
        self.model = model if device is None else model.to(device)
        self.target_layer = target_layer
        self.device = device
        self.num_classes = num_classes
        # One head per feature width, so repeated forward passes through the
        # same layer are scored by the same weights.
        self._heads = {}

    def forward_pass_on_convolutions(self, x):
        """Apply the model's children up to and including the target layer.

        Returns:
            A ``(conv_output, x)`` pair; both entries are the target layer's
            output.

        Raises:
            ValueError: If the model has no direct child named
                ``target_layer``.
        """
        children = dict(self.model.named_children())
        # Check before running anything: walking past the intended layer would
        # otherwise fail later with an unrelated shape error.
        if self.target_layer not in children:
            raise ValueError(
                "target layer {!r} not found; available layers: {}".format(
                    self.target_layer, list(children)))
        for module_pos, module in children.items():
            x = module(x)  # Forward
            if module_pos == self.target_layer:
                # Stop here: the remaining layers are deliberately skipped.
                return x, x

    def forward_pass(self, x):
        """Return the target layer's activations and class scores for ``x``.

        Returns:
            A ``(conv_output, scores)`` pair, where ``scores`` is the output
            of the linear head applied to the pooled, flattened activations.
        """
        conv_output, x = self.forward_pass_on_convolutions(x)
        x = self.model.avgpool(x)
        # A linear layer acts on the last dimension, so pooled (N, C, 1, 1)
        # activations must be flattened to (N, C) first. This is a no-op when
        # the pooled output is already two-dimensional.
        x = torch.flatten(x, 1)
        return conv_output, self._classifier_for(x)(x)

    def _classifier_for(self, features):
        """Return the linear head matching the width of ``features``."""
        in_features = features.shape[1]
        head = self._heads.get(in_features)
        if head is None:
            trained = getattr(self.model, "fc", None)
            if (isinstance(trained, nn.Linear)
                    and trained.in_features == in_features
                    and trained.out_features == self.num_classes):
                # The model's own classifier already fits this layer.
                head = trained
            else:
                # Intermediate layers have a different channel count, so they
                # get their own head. It is kept on the extractor instead of
                # being written to `model.fc`, which would overwrite the
                # classifier the model is being trained with.
                head = nn.Linear(in_features, self.num_classes)
                head = head.to(device=features.device, dtype=features.dtype)
            self._heads[in_features] = head
        return head

class ScoreCam():
    """Produce Score-CAM class activation maps for one layer of a model.

    Each channel of the target layer's output is upsampled to the input size,
    normalised to [0, 1] and used as a mask on the input. The softmax score
    the masked input obtains for the target class is that channel's weight in
    the final map.

    NOTE(review): several expressions in the posted source were lost during
    extraction. The lines marked "reconstructed" below are presumed from the
    surrounding code; confirm them against the original implementation.
    """

    def __init__(self, model, target_layer, device):
        """Store the model and build the feature extractor for `target_layer`."""
        # Reconstructed: the right-hand side of this assignment was lost.
        self.model = model if device is None else model.to(device)
        # Define extractor
        self.extractor = CamExtractor(self.model, target_layer, device)

    def apply_colormap_on_image(self, filename, activation, input_image, colormap_name="gnuplot2"):
        """Colour an activation map and overlay it on the original image.

        Args:
            filename: Path of the original image (presumed; it is opened with
                PIL).
            activation: 2-D array with values in [0, 1]; its shape must match
                the spatial size of `input_image`.
            input_image: 4-D tensor whose last two dimensions are taken as
                height and width.
            colormap_name: Name of the matplotlib colormap.

        Returns:
            `(no_trans_heatmap, heatmap_on_image)`, both RGBA PIL images. As
            arrays they have four channels (the fourth is alpha); call
            `.convert("RGB")` before passing them to a consumer that expects
            three.
        """
        # PIL sizes are (width, height), the reverse of the tensor's
        # (height, width) ordering.
        height, width = input_image.shape[2:]
        map_size = (int(width), int(height))
        # Reconstructed: open the original image and force three channels.
        org_im = Image.open(filename).convert('RGB')
        org_im = org_im.resize(map_size)

        # Get colormap
        color_map = mpl_color_map.get_cmap(colormap_name)
        no_trans_heatmap = color_map(activation)
        # Change alpha channel in colormap to make sure original image is displayed
        heatmap = copy.copy(no_trans_heatmap)
        heatmap[:, :, 3] = 0.65
        heatmap = Image.fromarray((heatmap*255).astype(np.uint8))
        no_trans_heatmap = Image.fromarray((no_trans_heatmap*255).astype(np.uint8))

        # Apply heatmap on image
        # Reconstructed: start from an empty RGBA canvas.
        heatmap_on_image = Image.new("RGBA", map_size)
        heatmap_on_image = Image.alpha_composite(heatmap_on_image, org_im.convert('RGBA'))
        heatmap_on_image = Image.alpha_composite(heatmap_on_image, heatmap)
        return no_trans_heatmap, heatmap_on_image

    def generate_cam(self, input_image, filename, target_class=None):
        """Compute the Score-CAM for `input_image` and render it.

        Args:
            input_image: 4-D input tensor; only the first item in the batch is
                used for the map.
            filename: Original image, forwarded to `apply_colormap_on_image`.
            target_class: Class index to explain; defaults to the arg-max of
                the model output.

        Returns:
            The `(no_trans_heatmap, heatmap_on_image)` pair produced by
            `apply_colormap_on_image`.
        """
        # Full forward pass
        # conv_output is the output of convolutions at specified layer
        # model_output is the final output of the model
        conv_output, model_output = self.extractor.forward_pass(input_image)
        if target_class is None:
            # Reconstructed: the argument of np.argmax was lost.
            target_class = int(np.argmax(model_output.detach().cpu().numpy()))
        # Get convolution outputs
        target = conv_output[0]
        # Create empty numpy array for cam
        cam = np.ones(target.shape[1:], dtype=np.float32)
        # Multiply each weight with its conv output and then, sum
        print("********\n\n beginning the cam generation\n\n**********")
        print("len of target in 4 loop", len(target))
        # Upsample to the true (height, width) of the input so the mask can be
        # multiplied with non-square images as well.
        input_size = tuple(int(side) for side in input_image.shape[2:])
        for channel in target:
            # Unsqueeze to 4D
            saliency_map = channel[None, None, :, :]
            # Upsampling to input size
            saliency_map = F.interpolate(saliency_map, size=input_size, mode='bilinear', align_corners=False)
            lowest, highest = saliency_map.min(), saliency_map.max()
            if highest == lowest:
                # Reconstructed: a flat channel carries no spatial information
                # and would divide by zero below, so it is skipped.
                continue
            # Scale between 0-1
            norm_saliency_map = (saliency_map - lowest) / (highest - lowest)
            # Get the target score; no gradients are needed for the weight.
            with torch.no_grad():
                masked_scores = self.extractor.forward_pass(input_image*norm_saliency_map)[1]
            w = F.softmax(masked_scores, dim=1)[0][target_class]
            # Reconstructed: the left operand of the product was lost.
            cam += w.item() * channel.detach().cpu().numpy()
        cam = np.maximum(cam, 0)
        spread = np.max(cam) - np.min(cam)
        if spread > 0:
            cam = (cam - np.min(cam)) / spread  # Normalize between 0-1
        else:
            # A constant map would otherwise turn into NaNs.
            cam = np.zeros_like(cam)
        cam = np.uint8(cam * 255)  # Scale between 0-255 to visualize
        # PIL expects (width, height). LANCZOS is the filter the removed
        # Image.ANTIALIAS name referred to.
        resized = Image.fromarray(cam).resize(
            (input_size[1], input_size[0]), Image.LANCZOS)
        cam = np.asarray(resized, dtype=np.uint8) / 255

        no_trans_heatmap, heatmap_on_image = self.apply_colormap_on_image(filename, cam, input_image)

        return no_trans_heatmap, heatmap_on_image

The validation is as follows:

                            all_layers_maps = []
                            print("Preparing CAM")
                            for i in range(1,5):
                                score_cam = ScoreCam(self.model, f"layer{i}",self.device)


                                top_images = []
                                bottom_images = []
                                images_list = []

                                images = data["image"]
                                names = data["file_name"]

                                for idx, (image,name) in enumerate(zip(images,names),1):
#                                     image = torch.unsqueeze(image,0)
                                    image = torch.unsqueeze(image,0).cuda()
                                    no_trans, heatmap_image = score_cam.generate_cam(input_image=image, 
#                                     if idx <= int(len(data)/2):
#                                         top_images.append(np.array(heatmap_image))
#                                     else:
#                                         bottom_images.append(np.array(heatmap_image))
#                                 top_images = np.hstack(top_images)
#                                 print("shape of top im", top_images.shape)
#                                 bottom_images = np.hstack(bottom_images)
#                                 print("shape of bottom", bottom_images.shape)
#                                 all_images = np.vstack((top_images,bottom_images))
                            all_layers_maps = np.hstack(all_layers_maps)
                            self.writer.add_image("Class Activation Maps, Layers 1-4", torch.from_numpy(all_layers_maps), global_step = epoch)

I am not sure how to go around solving this issue, feeling quite braindead at 5am…

Regards and thank you in advance!

As per usual right after posting I am certain that the shape of the tensor ought to be: (4,3,300,1200) rather than (4,1200,300,4). That however does not change the fact that I have no clue where and when the 4th channel has been added!

If anyone ever encounters a similar problem, the solution seems to be simply slicing the tensor to remove the 4th colour channel (the alpha channel of the RGBA heatmap images). With the tensor in (N, C, H, W) layout:

new_tensor = T[ :, :3, :, : ]