RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1536 and 11x1536)

Hello everyone,
I am getting this error: RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1536 and 11x1536). I have found several suggestions on this platform for related problems, and I know roughly where the issue lies, but I don't know how to fix it. I'm new to PyTorch, so any help would be highly appreciated.

Error Traceback:

Traceback (most recent call last):
  File "2-heatmap.py", line 478, in <module>
    mask = grad_cam(inp, None)
  File "/grad.py", line 105, in __call__
    features, output = self.extractor(input_img)  # list of saved intermediate feature maps, and the network's final classification output
  File "/grad.py", line 64, in __call__
    x = module(x)
  File "/conda/envs/multimodal/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/conda/envs/multimodal/lib/python3.6/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/conda/envs/multimodal/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/conda/envs/multimodal/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 103, in forward
    return F.linear(input, self.weight, self.bias)
  File "/conda/envs/multimodal/lib/python3.6/site-packages/torch/nn/functional.py", line 1848, in linear
    return torch._C._nn.linear(input, weight, bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1536 and 11x1536)

My model takes images and metadata as input. I trained the model successfully, but now I want to visualize the feature maps from a convolution layer. Here is the code of the model architecture:

import torch
import torch.nn as nn
import geffnet

class enetv2(nn.Module):
    def __init__(self, backbone, out_dim, n_meta_features=0, load_pretrained=True):
        super(enetv2, self).__init__()
        self.n_meta_features = n_meta_features
        self.enet = geffnet.create_model(backbone.replace('-', '_'), pretrained=load_pretrained)
        self.dropout = nn.Dropout(0.5)
        in_ch = self.enet.classifier.in_features
        if n_meta_features > 0:
            # metadata head: n_meta_features -> 1536 -> 128
            self.meta = nn.Sequential(
                nn.Linear(n_meta_features, 1536),
                nn.BatchNorm1d(1536),
                Swish_module(),  # custom Swish activation, defined elsewhere
                nn.Dropout(p=0.3),
                nn.Linear(1536, 128),
                nn.BatchNorm1d(128),
                Swish_module(),
            )
            in_ch += 128
        self.myfc = nn.Linear(in_ch, out_dim)
        self.enet.classifier = nn.Identity()

    def extract(self, x):
        x = self.enet(x)
        return x

    def forward(self, x, x_meta=None):
        x = self.extract(x).squeeze(-1).squeeze(-1)
        if self.n_meta_features > 0:
            x_meta = self.meta(x_meta)
            x = torch.cat((x, x_meta), dim=1)  # concatenate image features (1536) with metadata features (128)
        x = self.myfc(self.dropout(x))  # final classifier, applied whether or not metadata is used
        return x
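
For reference, here is a minimal sketch of how the model is called with both inputs. The backbone name, out_dim, batch size, and image size below are assumptions for illustration:

import torch

model = enetv2(backbone='tf_efficientnet_b3', out_dim=9, n_meta_features=11, load_pretrained=False)
images = torch.randn(2, 3, 224, 224)  # a batch of two RGB images (assumed size)
meta = torch.randn(2, 11)             # 11 metadata features per sample
logits = model(images, meta)          # forward expects BOTH inputs
print(logits.shape)                   # torch.Size([2, 9])

The Grad-CAM code I am using: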
import numpy as np
import cv2

class GradCam:
    def __init__(self, model, feature_module, target_layer_names, use_cuda=False):
        self.model = model
        self.feature_module = feature_module  # e.g. model.layer4
        self.model.eval()
        self.cuda = use_cuda
        if self.cuda:
            self.model = model.cuda()

        self.extractor = ModelOutputs(self.model, self.feature_module, target_layer_names)
        # e.g. ModelOutputs(model, model.layer4, ["2"]); ModelOutputs is defined elsewhere in grad.py

    def forward(self, input_img):  # this method does not appear to be used; commenting it out has no effect
        print("林麻子".center(50, '-'))  # debug print showing that this method is never called
        return self.model(input_img)

    def __call__(self, input_img, target_category=None):
        if self.cuda:
            input_img = input_img.cuda()  # torch.Size([1, 3, 224, 224])
        features, output = self.extractor(input_img)  # list of saved intermediate feature maps, and the network's final classification output
        # list: [torch.Size([1, 2048, 7, 7])], tensor: torch.Size([1, 1000])
        if target_category is None:
            target_category = np.argmax(output.cpu().data.numpy())  # index of the maximum value in the flattened array
            # <class 'numpy.int64'>, e.g. 243

        one_hot = np.zeros((1, output.size()[-1]), dtype=np.float32)  # one-hot encoding, shape: (1, 1000)
        one_hot[0, target_category] = 1
        one_hot = torch.from_numpy(one_hot).requires_grad_(False)  # torch.Size([1, 1000])
        if self.cuda:
            one_hot = one_hot.cuda()

        loss = torch.sum(one_hot * output)  # the score of the target class, e.g. tensor(9.3856, grad_fn=<SumBackward0>)

        self.feature_module.zero_grad()  # zero the gradients of the feature module's parameters
        self.model.zero_grad()  # zero the gradients of all model parameters
        loss.backward()
        grads_val = self.extractor.get_gradients()[0].cpu().data.numpy()  # gradient values, shape: (1, 2048, 7, 7)
        # note: self.extractor.get_gradients() returns the list of saved gradients;
        # [-1] would be the last entry, i.e. the gradients on the feature maps closest to the input
        target = features[-1]  # torch.Size([1, 2048, 7, 7]); the last (and only) item in the list: the feature maps
        target = target.cpu().data.numpy()[0, :]  # shape: (2048, 7, 7)

        weights = np.mean(grads_val, axis=(2, 3))[0, :]  # shape: (2048,); mean gradient per feature map, used as its weight
        cam = np.zeros(target.shape[1:], dtype=np.float32)  # zero matrix, shape: (7, 7)

        for i, w in enumerate(weights):  # iterate over the weights
            cam += w * target[i, :, :]  # weighted linear combination of the feature maps

        cam = np.maximum(cam, 0)  # shape: (7, 7); equivalent to ReLU
        cam = cv2.resize(cam, input_img.shape[3:1:-1])  # shape: (224, 224); cv2.resize expects (w, h), hence the reversed slice
        cam = cam - np.min(cam)  # the next two steps normalize the map
        cam = cam / np.max(cam)  # values now lie in [0, 1]
        return cam  # shape: (224, 224), values in [0, 1]
from PIL import Image
from tqdm import tqdm

for data in tqdm(test_loader):
    data, meta = data  # each batch yields (images, metadata)
    dataTra = np.transpose(data[0].numpy(), (1, 2, 0))  # CHW -> HWC
    im = Image.fromarray((dataTra * 1).astype(np.uint8)).convert('RGB')  # assumes pixel values are already in [0, 255]

The issue arises when I call mask = grad_cam(inp, None), because I am feeding only an image while the model expects both an image and metadata. The metadata has 11 features. The traceback shows the extractor applying each submodule to the running activation in turn (grad.py, line 64: x = module(x)), so the 1536-dimensional image features end up in the meta head, whose first layer nn.Linear(11, 1536) expects 11 inputs; that is why mat2 (the layer's transposed weight) is 11x1536 while mat1 is 1x1536. How can I feed both the image and the metadata? (A possible workaround is sketched after the snippet below.)

grad_cam = GradCam(model=model, feature_module=features, target_layer_names=["2"], use_cuda=False)
inp = val_transformer(im).unsqueeze(0)
mask = grad_cam(inp, None)
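
One workaround, since ModelOutputs walks the submodules with x = module(x) and cannot pass metadata along, is to skip the extractor and compute Grad-CAM with forward/backward hooks on the target layer, calling the model with both inputs. This is only a sketch under my assumptions (that model.enet.blocks[-1] is the convolution block to visualize, and that meta_tensor holds the 11 metadata values for this sample); it is untested:

import numpy as np
import cv2
import torch

activations, gradients = [], []
target_layer = model.enet.blocks[-1]  # assumption: the last conv block of the backbone
target_layer.register_forward_hook(lambda m, i, o: activations.append(o))
target_layer.register_full_backward_hook(lambda m, gi, go: gradients.append(go[0]))

meta_tensor = meta[0].unsqueeze(0).float()  # metadata for this sample, shape (1, 11)
model.eval()
output = model(inp, meta_tensor)  # feed image AND metadata, exactly as in training
category = output.argmax(dim=1).item()
model.zero_grad()
output[0, category].backward()

acts = activations[0].detach().numpy()[0]  # (C, H, W) feature maps
grads = gradients[0].detach().numpy()[0]   # (C, H, W) gradients
weights = grads.mean(axis=(1, 2))          # one weight per channel
cam = np.maximum((weights[:, None, None] * acts).sum(axis=0), 0)  # weighted sum + ReLU
cam = cv2.resize(cam, inp.shape[3:1:-1])   # resize to the input's (w, h)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)  # normalize to [0, 1]

Alternatively, you could extend GradCam.__call__ and ModelOutputs to accept an x_meta argument and route it only into the meta branch.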

If 11 is your batch size, then transpose the tensor using the following.

https://pytorch.org/docs/stable/generated/torch.transpose.html

Matrix multiplication is only possible between an m x n matrix and an n x p matrix. Please note that n must be the same in both matrices.
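
For example, a toy illustration of the shape rule with the shapes from your error:

import torch

a = torch.randn(1, 1536)
b = torch.randn(11, 1536)
# torch.matmul(a, b) raises the same RuntimeError: the inner dimensions (1536 and 11) differ
c = torch.matmul(a, b.transpose(0, 1))  # (1x1536) @ (1536x11) -> (1x11)
print(c.shape)  # torch.Size([1, 11])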

@tiramisuNcustard Thanks for your suggestion. 11 is not the batch size; it is the number of features in my metadata.