PyTorch RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2048 and 64x6)

I am trying to concatenate the x, y, and rag features, but it keeps giving me an error. I used a simple torch.cat, yet it still fails. I just want to concatenate x, y, and rag in the forward function; can anyone help me solve this problem?

How do I fix an error when concatenating x, y, and rag in the forward function using torch.cat, ensuring matching dimensions and device types?
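For context, a minimal sketch (with hypothetical tensors matching the shapes printed later) of what torch.cat along dim=1 expects: all tensors must agree in every other dimension and live on the same device.

import torch

# Hypothetical tensors with the same shapes as x, y and rag below.
x = torch.randn(10, 768)
y = torch.randn(10, 512)
rag = torch.randn(10, 768)
z = torch.cat((x, y, rag), dim=1)   # works: batch dims match, same device
print(z.shape)                      # torch.Size([10, 2048])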

import torch
import torch.nn as nn
import torch.nn.functional as F
class MFB(nn.Module):
    def __init__(self,img_feat_size, ques_feat_size, is_first, MFB_K, MFB_O, DROPOUT_R):
        super(MFB, self).__init__()
        #self.__C = __C
        self.MFB_K = MFB_K
        self.MFB_O = MFB_O
        self.DROPOUT_R = DROPOUT_R

        self.is_first = is_first
        self.proj_i = nn.Linear(img_feat_size, MFB_K * MFB_O)
        self.proj_q = nn.Linear(ques_feat_size, MFB_K * MFB_O)

        self.dropout = nn.Dropout(DROPOUT_R)
        self.pool = nn.AvgPool1d(MFB_K, stride = MFB_K)

    def forward(self, img_feat, ques_feat, exp_in=1):
        batch_size = img_feat.shape[0]
        img_feat = self.proj_i(img_feat)                # (N, C, K*O)
        ques_feat = self.proj_q(ques_feat)              # (N, 1, K*O)

        exp_out = img_feat * ques_feat             # (N, C, K*O)
        exp_out = self.dropout(exp_out) if self.is_first else self.dropout(exp_out * exp_in)     # (N, C, K*O)
        z = self.pool(exp_out) * self.MFB_K         # (N, C, O)
        z = torch.sqrt(F.relu(z)) - torch.sqrt(F.relu(-z))
        z = F.normalize(z.view(batch_size, -1))         # (N, C*O)
        z = z.view(batch_size, -1, self.MFB_O)      # (N, C, O)
        return z


#MFB -> Multimodal Factorized Bilinear pooling
#Used to model complex interactions between features, e.g. image and text
#MFB_K -> number of factors, MFB_O -> output size
#__init__ initializes the linear projection layers for image and question features, a dropout layer, and an average pooling layer

#Forward:

#exp_in = input expansion factor (default: 1)
#Linear projection of image and question features into the factorized bilinear form
#Element-wise multiplication of the projected image and question features
#Apply dropout
#Average pooling along the factorized dimension (MFB_K) to reduce the size of the output tensor
#Signed square root (via ReLU) and L2 normalization to compute the final output (z)
#The final output represents the fused representation of image and question features.
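As a quick sanity check, here is a hedged sketch with dummy tensors (the sizes match the Classifier below) showing the shapes MFB expects and returns:

# Hedged sanity check with dummy tensors (sizes match the Classifier below).
mfb = MFB(img_feat_size=512, ques_feat_size=768, is_first=True,
          MFB_K=256, MFB_O=64, DROPOUT_R=0.1)
img = torch.randn(10, 1, 512)    # (N, C=1, img_feat_size)
ques = torch.randn(10, 1, 768)   # (N, 1, ques_feat_size)
z = mfb(img, ques)
print(z.shape)                   # torch.Size([10, 1, 64]) -> (N, C, MFB_O)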

data = data[~data['Name'].isin(outliers)]
len(sample_dataset_new)

torch.manual_seed(123)
t_p,v_p = torch.utils.data.random_split(sample_dataset_new,[450,50])

# torch.manual_seed(123)
t_p,te_p = torch.utils.data.random_split(t_p,[340,110])

t_p[1]["processed_img"].shape
t_p[1]['processed_txt'].shape
t_p[1]['processed_rag'].shape

(768,)
class Classifier(pl.LightningModule):

    def __init__(self):
      super().__init__()
      self.MFB = MFB(512,768,True,256,64,0.1)
      self.fin_y_shape = torch.nn.Linear(768,512)
      self.fin_old = torch.nn.Linear(64,2)
      self.fin = torch.nn.Linear(16 * 768, 64)
      self.fin_inten = torch.nn.Linear(2048,6)
      self.fin_e1 = torch.nn.Linear(64,2)
      self.fin_e2 = torch.nn.Linear(64,2)
      self.fin_e3 = torch.nn.Linear(64,2)
      self.fin_e4 = torch.nn.Linear(64,2)
      self.fin_e5 = torch.nn.Linear(64,2)
      self.fin_e6 = torch.nn.Linear(64,2)
      self.fin_e7 = torch.nn.Linear(64,2)
      self.fin_e8 = torch.nn.Linear(64,2)
      self.fin_e9 = torch.nn.Linear(64,2)
      # self.reduce_x = torch.nn.Linear(768, 512)
      # self.reduce_rag = torch.nn.Linear(768, 512)



      self.validation_step_outputs = []
      self.test_step_outputs = []


    def forward(self, x,y,rag):
        x_,y_,rag_ = x,y,rag
        print("x.shape", x.shape)
        print("y.shape",y.shape)
        print("rag.shape",rag.shape)

        # x = self.reduce_x(x)
        # rag = self.reduce_rag(rag)

        # print("x.shape", x.shape)
        # print("y.shape",y.shape)
        # print("rag.shape",rag.shape)
        # z = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(rag, axis=1))
        # z_rag = self.MFB(torch.unsqueeze(y, axis=1),torch.unsqueeze(rag, axis=1))
        # z_con = torch.cat((z, z_rag), dim=1)


        # Concatenate x with y and then with rag


        z= torch.cat((torch.cat((x, y), dim=1), rag), dim=1)


        # Pass concatenated x with y and x with rag through your network
        z_new = torch.squeeze(z,dim=1)
        print("z_new shape",z_new)


        c_inten = self.fin_inten(z_new)
        c_e1 = self.fin_e1(z_new)
        c_e2 = self.fin_e2(z_new)
        c_e3 = self.fin_e3(z_new)
        c_e4 = self.fin_e4(z_new)
        c_e5 = self.fin_e5(z_new)
        c_e6 = self.fin_e6(z_new)
        c_e7 = self.fin_e7(z_new)
        c_e8 = self.fin_e8(z_new)
        c_e9 = self.fin_e9(z_new)
        c = self.fin_old(z_new)

        # print("z.shape",z.shape)
        # print("z_new shape",z_new.shape)
        # print("intensity error:", c_inten.shape)
        # print("output:", c.shape)
        # print("c_e1:", c_e1.shape)
        # print("c_e2:", c_e2.shape)
        # print("c_e3:", c_e3.shape)
        # print("c_e4:", c_e4.shape)
        # print("c_e5:", c_e5.shape)
        # print("c_e6:", c_e6.shape)
        # print("c_e7:", c_e7.shape)
        # print("c_e8:", c_e8.shape)
        # print("c_e9:", c_e9.shape)
        # print("logits.shape",logits.shape)


        output = torch.log_softmax(c, dim=1)
        c_inten = torch.log_softmax(c_inten, dim=1)
        c_e1 = torch.log_softmax(c_e1, dim=1)
        c_e2 = torch.log_softmax(c_e2, dim=1)
        c_e3 = torch.log_softmax(c_e3, dim=1)
        c_e4 = torch.log_softmax(c_e4, dim=1)
        c_e5 = torch.log_softmax(c_e5, dim=1)
        c_e6 = torch.log_softmax(c_e6, dim=1)
        c_e7 = torch.log_softmax(c_e7, dim=1)
        c_e8 = torch.log_softmax(c_e8, dim=1)
        c_e9 = torch.log_softmax(c_e9, dim=1)

        return output,c_inten,c_e1,c_e2,c_e3,c_e4,c_e5,c_e6,c_e7,c_e8,c_e9


    def cross_entropy_loss(self, logits, labels):
      print("logits.shape",logits.shape)
      return F.nll_loss(logits, labels)

    def training_step(self, train_batch, batch_idx):
        #lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp= train_batch
        lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= train_batch
        #logit_offen,a,b,c,d,e,f,g,h,i,logit_inten_target= self.forward(txt,img,rag)

        lab = train_batch[lab].unsqueeze(1)
        #print(lab)
        txt = train_batch[txt]
        rag = train_batch[rag]
        img = train_batch[img]
        name= train_batch[name]
        intensity = train_batch[intensity].unsqueeze(1)
        e1 = train_batch[e1].unsqueeze(1)
        e2 = train_batch[e2].unsqueeze(1)
        e3 = train_batch[e3].unsqueeze(1)
        e4 = train_batch[e4].unsqueeze(1)
        e5 = train_batch[e5].unsqueeze(1)
        e6 = train_batch[e6].unsqueeze(1)
        e7 = train_batch[e7].unsqueeze(1)
        e8 = train_batch[e8].unsqueeze(1)
        e9 = train_batch[e9].unsqueeze(1)

        lab = F.one_hot(lab, num_classes=2)
        intensity = torch.abs(intensity)
        intensity = F.one_hot(intensity, num_classes=6)  # Assuming you have 6 classes
        e1 = F.one_hot(e1,num_classes = 2)
        e2 = F.one_hot(e2,num_classes = 2)
        e3 = F.one_hot(e3,num_classes = 2)
        e4 = F.one_hot(e4,num_classes = 2)
        e5 = F.one_hot(e5,num_classes = 2)
        e6 = F.one_hot(e6,num_classes = 2)
        e7 = F.one_hot(e7,num_classes = 2)
        e8 = F.one_hot(e8,num_classes = 2)
        e9 = F.one_hot(e9,num_classes = 2)

        lab = lab.squeeze(dim=1)
        intensity = intensity.squeeze(dim=1)
        e1 = e1.squeeze(dim=1)
        e2 = e2.squeeze(dim=1)
        e3 = e3.squeeze(dim=1)
        e4 = e4.squeeze(dim=1)
        e5 = e5.squeeze(dim=1)
        e6 = e6.squeeze(dim=1)
        e7 = e7.squeeze(dim=1)
        e8 = e8.squeeze(dim=1)
        e9 = e9.squeeze(dim=1)

        logit_offen,logit_inten_target,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)

        loss1 = self.cross_entropy_loss(logit_offen, lab)
        loss17 = self.cross_entropy_loss(logit_inten_target, intensity)
        loss4 = self.cross_entropy_loss(a, e1)
        loss5 = self.cross_entropy_loss(b, e2)
        loss6 = self.cross_entropy_loss(c, e3)
        loss7 = self.cross_entropy_loss(d, e4)
        loss8 = self.cross_entropy_loss(e, e5)
        loss9 = self.cross_entropy_loss(f, e6)
        loss10 = self.cross_entropy_loss(g, e7)
        loss11 = self.cross_entropy_loss(h, e8)
        loss12 = self.cross_entropy_loss(i, e9)

        loss = loss1 + loss4 + loss5 + loss6 + loss7 + loss8 +loss9 + loss10 +loss11 +loss12 + loss17

        self.log('train_loss', loss)
        return loss


    def validation_step(self, val_batch, batch_idx):
        #lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp = val_batch
        lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= val_batch
        lab = val_batch[lab].unsqueeze(1)
        #print(lab)
        txt = val_batch[txt]
        rag = val_batch[rag]
        img = val_batch[img]
        name = val_batch[name]
        intensity = val_batch[intensity].unsqueeze(1)
        e1 = val_batch[e1].unsqueeze(1)
        e2 = val_batch[e2].unsqueeze(1)
        e3 = val_batch[e3].unsqueeze(1)
        e4 = val_batch[e4].unsqueeze(1)
        e5 = val_batch[e5].unsqueeze(1)
        e6 = val_batch[e6].unsqueeze(1)
        e7 = val_batch[e7].unsqueeze(1)
        e8 = val_batch[e8].unsqueeze(1)
        e9 = val_batch[e9].unsqueeze(1)

        lab = F.one_hot(lab, num_classes=2)

        intensity = torch.abs(intensity)
        intensity = F.one_hot(intensity, num_classes=6)
        e1 = F.one_hot(e1,num_classes = 2)
        e2 = F.one_hot(e2,num_classes = 2)
        e3 = F.one_hot(e3,num_classes = 2)
        e4 = F.one_hot(e4,num_classes = 2)
        e5 = F.one_hot(e5,num_classes = 2)
        e6 = F.one_hot(e6,num_classes = 2)
        e7 = F.one_hot(e7,num_classes = 2)
        e8 = F.one_hot(e8,num_classes = 2)
        e9 = F.one_hot(e9,num_classes = 2)
        lab = lab.squeeze(dim=1)


        intensity = intensity.squeeze(dim = 1)
        e1 = e1.squeeze(dim=1)
        e2 = e2.squeeze(dim=1)
        e3 = e3.squeeze(dim=1)
        e4 = e4.squeeze(dim=1)
        e5 = e5.squeeze(dim=1)
        e6 = e6.squeeze(dim=1)
        e7 = e7.squeeze(dim=1)
        e8 = e8.squeeze(dim=1)
        e9 = e9.squeeze(dim=1)

        logits,inten,a,b,c,d,e,f,g,h,i = self.forward(txt,img,rag)

        logits=logits.float()

        tmp = np.argmax(logits.detach().cpu().numpy(),axis=1)
        loss = self.cross_entropy_loss(logits, lab)
        lab = lab.detach().cpu().numpy()
        self.log('val_acc', accuracy_score(lab,tmp))
        self.log('val_roc_auc',roc_auc_score(lab,tmp))
        self.log('val_loss', loss)
        tqdm_dict = {'val_acc': accuracy_score(lab,tmp)}
        self.validation_step_outputs.append({'progress_bar': tqdm_dict,'val_f1 offensive': f1_score(lab,tmp,average='macro')})

        return {
                  'progress_bar': tqdm_dict,
        'val_f1 offensive': f1_score(lab,tmp,average='macro')
        }

    def on_validation_epoch_end(self):
      outs = []
      outs14=[]
      for out in self.validation_step_outputs:
        outs.append(out['progress_bar']['val_acc'])
        outs14.append(out['val_f1 offensive'])
      self.log('val_acc_all_offn', sum(outs)/len(outs))
      self.log('val_f1 offensive', sum(outs14)/len(outs14))
      print(f'***val_acc_all_offn at epoch end {sum(outs)/len(outs)}****')
      print(f'***val_f1 offensive at epoch end {sum(outs14)/len(outs14)}****')
      self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= batch
        lab = batch[lab].unsqueeze(1)
        #print(lab)
        txt = batch[txt]
        rag = batch[rag]
        img = batch[img]
        name = batch[name]
        intensity = batch[intensity].unsqueeze(1)
        e1 = batch[e1].unsqueeze(1)
        e2 = batch[e2].unsqueeze(1)
        e3 = batch[e3].unsqueeze(1)
        e4 = batch[e4].unsqueeze(1)
        e5 = batch[e5].unsqueeze(1)
        e6 = batch[e6].unsqueeze(1)
        e7 = batch[e7].unsqueeze(1)
        e8 = batch[e8].unsqueeze(1)
        e9 = batch[e9].unsqueeze(1)
        lab = F.one_hot(lab, num_classes=2)
        intensity = F.one_hot(intensity, num_classes=6)
        e1 = F.one_hot(e1,num_classes = 2)
        e2 = F.one_hot(e2,num_classes = 2)
        e3 = F.one_hot(e3,num_classes = 2)
        e4 = F.one_hot(e4,num_classes = 2)
        e5 = F.one_hot(e5,num_classes = 2)
        e6 = F.one_hot(e6,num_classes = 2)
        e7 = F.one_hot(e7,num_classes = 2)
        e8 = F.one_hot(e8,num_classes = 2)
        e9 = F.one_hot(e9,num_classes = 2)
        lab = lab.squeeze(dim=1)
        intensity = intensity.squeeze(dim=1)
        e1 = e1.squeeze(dim=1)
        e2 = e2.squeeze(dim=1)
        e3 = e3.squeeze(dim=1)
        e4 = e4.squeeze(dim=1)
        e5 = e5.squeeze(dim=1)
        e6 = e6.squeeze(dim=1)
        e7 = e7.squeeze(dim=1)
        e8 = e8.squeeze(dim=1)
        e9 = e9.squeeze(dim=1)

        logits,inten,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)

        logits = logits.float()
        tmp = np.argmax(logits.detach().cpu().numpy(force=True),axis=-1)
        loss = self.cross_entropy_loss(logits, lab)
        lab = lab.detach().cpu().numpy()
        self.log('test_acc', accuracy_score(lab,tmp))
        self.log('test_roc_auc',roc_auc_score(lab,tmp))
        self.log('test_loss', loss)
        tqdm_dict = {'test_acc': accuracy_score(lab,tmp)}
        self.test_step_outputs.append({'progress_bar': tqdm_dict,'test_acc': accuracy_score(lab,tmp), 'test_f1_score': f1_score(lab,tmp,average='macro')})
        return {
                  'progress_bar': tqdm_dict,
                  'test_acc': accuracy_score(lab,tmp),
                  'test_f1_score': f1_score(lab,tmp,average='macro')
        }
    def on_test_epoch_end(self):
        # OPTIONAL
        outs = []
        outs1,outs2,outs3,outs4,outs5,outs6,outs7,outs8,outs9,outs10,outs11,outs12,outs13,outs14 = \
        [],[],[],[],[],[],[],[],[],[],[],[],[],[]
        for out in self.test_step_outputs:
          outs.append(out['test_acc'])
          outs2.append(out['test_f1_score'])
        self.log('test_acc', sum(outs)/len(outs))
        self.log('test_f1_score', sum(outs2)/len(outs2))
        self.test_step_outputs.clear()

    def configure_optimizers(self):
      # optimizer = torch.optim.Adam(self.parameters(), lr=3e-2)
      optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)

      return optimizer


  """
  Main Model:
  Initialize
  Forward Pass
  Training Step
  Validation Step
  Testing Step

  Pp
  """

class HmDataModule(pl.LightningDataModule):

    def setup(self, stage):
      self.hm_train = t_p
      self.hm_val = v_p
      # self.hm_test = test
      self.hm_test = te_p

    def train_dataloader(self):
      return DataLoader(self.hm_train, batch_size=10, drop_last=True)

    def val_dataloader(self):
      return DataLoader(self.hm_val, batch_size=10, drop_last=True)

    def test_dataloader(self):
      return DataLoader(self.hm_test, batch_size=10, drop_last=True)

data_module = HmDataModule()
checkpoint_callback = ModelCheckpoint(
    monitor='val_acc_all_offn',
    dirpath='mrinal/',
    filename='epoch{epoch:02d}-val_f1_all_offn{val_acc_all_offn:.2f}',
    auto_insert_metric_name=False,
    save_top_k=1,
    mode="max",
)
all_callbacks = []
all_callbacks.append(checkpoint_callback)
# train
from pytorch_lightning import seed_everything
seed_everything(42, workers=True)
hm_model = Classifier()
gpus=1
#if torch.cuda.is_available():gpus=0
trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
trainer.fit(hm_model, data_module)
INFO:lightning_fabric.utilities.seed:Seed set to 42
/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:556: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
WARNING:pytorch_lightning.loggers.tensorboard:Missing logger folder: /content/LLaVA/lightning_logs
INFO:pytorch_lightning.callbacks.model_summary:
   | Name        | Type   | Params
----------------------------------------
0  | MFB         | MFB    | 21.0 M
1  | fin_y_shape | Linear | 393 K 
2  | fin_old     | Linear | 130   
3  | fin         | Linear | 786 K 
4  | fin_inten   | Linear | 12.3 K
5  | fin_e1      | Linear | 130   
6  | fin_e2      | Linear | 130   
7  | fin_e3      | Linear | 130   
8  | fin_e4      | Linear | 130   
9  | fin_e5      | Linear | 130   
10 | fin_e6      | Linear | 130   
11 | fin_e7      | Linear | 130   
12 | fin_e8      | Linear | 130   
13 | fin_e9      | Linear | 130   
----------------------------------------
22.2 M    Trainable params
0         Non-trainable params
22.2 M    Total params
88.792    Total estimated model params size (MB)
Sanity Checking DataLoader 0:   0%
 0/2 [00:00<?, ?it/s]
x.shape torch.Size([10, 768])
y.shape torch.Size([10, 512])
rag.shape torch.Size([10, 768])
z_new shape tensor([[ 0.0144, -0.1677,  0.1100,  ..., -0.1818,  0.4250, -0.2985],
        [-0.2105, -0.1002, -0.0113,  ..., -0.0639,  0.3789, -0.0553],
        [-0.1221, -0.1026, -0.3277,  ..., -0.3724,  0.1562,  0.0286],
        ...,
        [-0.0950,  0.3957,  0.3603,  ..., -0.2121,  0.6465, -0.1983],
        [ 0.0080,  0.2380, -0.0409,  ..., -0.2565,  0.0946, -0.1098],
        [ 0.1351, -0.3463,  0.3371,  ..., -0.2283,  0.4667,  0.0087]])
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-29-279b4c8e1163> in <cell line: 369>()
    367 #if torch.cuda.is_available():gpus=0
    368 trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
--> 369 trainer.fit(hm_model, data_module)

14 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
    112 
    113     def forward(self, input: Tensor) -> Tensor:
--> 114         return F.linear(input, self.weight, self.bias)
    115 
    116     def extra_repr(self) -> str:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2048 and 64x2)

Concatenating these tensors:

x.shape torch.Size([10, 768])
y.shape torch.Size([10, 512])
rag.shape torch.Size([10, 768])

in the feature dimension will create 2048 features while your linear layers expect 64. Change the in_features arguments of the linear layers to 2048 and it should work.
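For example, a minimal sketch of that change (using the layer names from the model above and a dummy z_new):

import torch

# Sketch of the suggested fix: x (768) + y (512) + rag (768) = 2048 features,
# so every head applied to z_new needs in_features=2048.
z_new = torch.randn(10, 2048)          # what torch.cat((x, y, rag), dim=1) produces
fin_old = torch.nn.Linear(2048, 2)     # was Linear(64, 2)
fin_e1 = torch.nn.Linear(2048, 2)      # likewise for fin_e2 ... fin_e9
fin_inten = torch.nn.Linear(2048, 6)   # already matches the 2048 features
print(fin_old(z_new).shape)            # torch.Size([10, 2])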

No, I have already applied that change, and now it is showing this:

INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
   | Name        | Type   | Params
----------------------------------------
0  | MFB         | MFB    | 21.0 M
1  | fin_y_shape | Linear | 393 K 
2  | fin_old     | Linear | 4.1 K 
3  | fin         | Linear | 786 K 
4  | fin_inten   | Linear | 12.3 K
5  | fin_e1      | Linear | 4.1 K 
6  | fin_e2      | Linear | 4.1 K 
7  | fin_e3      | Linear | 4.1 K 
8  | fin_e4      | Linear | 4.1 K 
9  | fin_e5      | Linear | 4.1 K 
10 | fin_e6      | Linear | 4.1 K 
11 | fin_e7      | Linear | 4.1 K 
12 | fin_e8      | Linear | 4.1 K 
13 | fin_e9      | Linear | 4.1 K 
----------------------------------------
22.2 M    Trainable params
0         Non-trainable params
22.2 M    Total params
88.951    Total estimated model params size (MB)
x.shape torch.Size([10, 768])
y.shape torch.Size([10, 512])
rag.shape torch.Size([10, 768])
z_new shape tensor([[ 0.0144, -0.1677,  0.1100,  ..., -0.1818,  0.4250, -0.2985],
        [-0.2105, -0.1002, -0.0113,  ..., -0.0639,  0.3789, -0.0553],
        [-0.1221, -0.1026, -0.3277,  ..., -0.3724,  0.1562,  0.0286],
        ...,
        [-0.0950,  0.3957,  0.3603,  ..., -0.2121,  0.6465, -0.1983],
        [ 0.0080,  0.2380, -0.0409,  ..., -0.2565,  0.0946, -0.1098],
        [ 0.1351, -0.3463,  0.3371,  ..., -0.2283,  0.4667,  0.0087]])
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-37-6957c3f81a92> in <cell line: 369>()
    367 #if torch.cuda.is_available():gpus=0
    368 trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
--> 369 trainer.fit(hm_model, data_module)

13 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2702     if size_average is not None or reduce is not None:
   2703         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2704     return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2705 
   2706 

RuntimeError: 0D or 1D target tensor expected, multi-target not supported

This error points to the target used in nn.CrossEntropyLoss or nn.NLLLoss having an invalid shape.
Your code is unfortunately not executable, so it’s not trivial to copy/paste it to fix other issues.
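As a minimal sketch (assuming your labels are integer class indices before the one-hot encoding), F.nll_loss wants a target of shape (N,) containing class indices, not a one-hot (N, C) matrix:

import torch
import torch.nn.functional as F

# F.nll_loss expects class-index targets of shape (N,), not one-hot (N, C).
logits = torch.log_softmax(torch.randn(10, 2), dim=1)  # model output, (N, num_classes)
lab = torch.randint(0, 2, (10,))                       # integer class indices, shape (N,)
loss = F.nll_loss(logits, lab)                          # works

# If the labels were already one-hot encoded, convert back to indices first:
lab_one_hot = F.one_hot(lab, num_classes=2)
loss = F.nll_loss(logits, lab_one_hot.argmax(dim=1))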

I'm seeking help: I've been working on building a Siamese network to output similarity, and because I'm new to the field and it's my first time using PyTorch, I have run into some difficulties with the code.
Here's my code:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to load and preprocess an image
def load_image(img_path):
    img = PIL.Image.open(img_path)
    img = img.convert('RGB')  # Convert to RGB if the image has other color modes
    img = transform(img)
    return img

# Function to create pairs of images and labels
def create_pairs(df, is_match):
    pairs = []
    labels = []
    for _, row in df.iterrows():
        # Get the name and image number for the first image
        name1 = row['name']
        imagenum1 = row['imagenum1']

        # Get the name and image number for the second image
        if is_match:
            # For matched pairs, both images are of the same person
            name2 = name1
            imagenum2 = row['imagenum2']
            label = 1
        else:
            # For mismatched pairs, the images are of different people
            name2 = row['name.1'] if 'name.1' in df.columns else row['name']
            imagenum2 = row['imagenum2']
            label = 0

        # Load images
        img1_path = f"…/input/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled/{name1}/{name1}_{imagenum1:04d}.jpg"
        img2_path = f"…/input/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled/{name2}/{name2}_{imagenum2:04d}.jpg"
        img1 = load_image(img1_path)
        img2 = load_image(img2_path)

        # Append the pair and label to the respective lists
        pairs.append([img1, img2])
        labels.append(label)

    # Convert the lists to NumPy arrays
    pairs = np.array(pairs)
    labels = np.array(labels)

    return pairs, labels.astype("float32")

train_matched_pairs, train_matched_labels = create_pairs(matchpairsDevTrain, is_match=True)

train_mismatched_pairs, train_mismatched_labels = create_pairs(mismatchpairsDevTrain, is_match=False)

test_matched_pairs, test_matched_labels = create_pairs(matchpairsDevTest, is_match=True)

test_mismatched_pairs, test_mismatched_labels = create_pairs(mismatchpairsDevTest, is_match=False)

class SiameseDataset(Dataset):
    def __init__(self, matched_pairs, matched_labels, mismatched_pairs, mismatched_labels):
        self.pairs = np.concatenate((matched_pairs, mismatched_pairs), axis=0)
        self.labels = np.concatenate((matched_labels, mismatched_labels), axis=0)

        # Shuffle the data
        indices = np.arange(len(self.pairs))
        random.shuffle(indices)
        self.pairs = self.pairs[indices]
        self.labels = self.labels[indices]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img1 = self.pairs[idx][0]
        img2 = self.pairs[idx][1]
        label = self.labels[idx]

        # Convert images to PyTorch tensors
        img1 = transforms.ToTensor()(img1)
        img2 = transforms.ToTensor()(img2)

        return img1, img2, label

train_dataset = SiameseDataset(train_matched_pairs, train_matched_labels,
train_mismatched_pairs, train_mismatched_labels)

test_dataset = SiameseDataset(test_matched_pairs, test_matched_labels,
test_mismatched_pairs, test_mismatched_labels)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.vgg = models.vgg16(pretrained=True).features
        self.fc_in_features = 512
        self.fc = nn.Sequential(
            nn.Linear(self.fc_in_features * 7 * 7, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 1),
        )
        self.sigmoid = nn.Sigmoid()

        # initialize the weights
        self.vgg.apply(self.init_weights)
        self.fc.apply(self.init_weights)

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)

    def forward_once(self, x):
        output = self.vgg(x)
        output = output.view(output.size()[0], -1)
        return output

    def forward(self, input1, input2):
        # get the two images' features
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)

        # concatenate both images' features
        output = torch.cat((output1, output2), 1)

        # reshape the concatenated output to match the linear layer's input size
        output = output.view(output.size(0), -1)

        # pass the reshaped tensor to the linear layers
        output = self.fc(output)

        # pass the output of the linear layers to the sigmoid layer
        output = self.sigmoid(output)

        return output

# Define the model, loss function, optimizer, and other hyperparameters
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

siamese_model = SiameseNetwork().to(device)

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(siamese_model.parameters(), lr=0.1)

batch_size = 1

epochs = 12

# Train the model
for epoch in range(epochs):
    running_loss = 0.0
    running_accuracy = 0.0

    # Iterate over the train_loader
    for batch_idx, (img1, img2, labels) in enumerate(train_loader):
        img1 = img1.permute(0, 2, 1, 3)
        img2 = img2.permute(0, 2, 1, 3)
        print(img1.shape, img2.shape)  # Verify batch shape
        img1 = img1.to(device)
        img2 = img2.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = siamese_model(img1, img2).squeeze()

        # Compute the loss
        loss = criterion(output, labels.float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Compute the running loss and accuracy
        running_loss += loss.item()
        running_accuracy += (output.squeeze() > 0.5).float().eq(labels.float()).sum().item() / batch_size

    # Print the average loss and accuracy for the epoch
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = running_accuracy / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

This outputs: RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x50176 and 25088x256)

Based on the shapes it seems the error is raised in:

self.fc = nn.Sequential(
    nn.Linear(self.fc_in_features * 7 * 7, 256),
    ...

which requires 25088 input features while the input activation uses 50176.
You could increase the in_features by 2x to fix the issue.
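A minimal sketch of that change (assuming the VGG16 feature extractor above, which yields a 512x7x7 map per image):

import torch
import torch.nn as nn

# Two flattened VGG16 feature maps of 512*7*7 = 25088 features each are
# concatenated, so the first Linear layer must accept 50176 inputs.
fc_in_features = 512
fc = nn.Sequential(
    nn.Linear(fc_in_features * 7 * 7 * 2, 256),  # was fc_in_features * 7 * 7
    nn.ReLU(inplace=True),
    nn.Linear(256, 1),
)
concatenated = torch.randn(1, 50176)   # torch.cat((output1, output2), 1) for batch_size=1
print(fc(concatenated).shape)          # torch.Size([1, 1])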

Thanks a lot that was really helpful

Hi, I have finally built that Siamese network. My goal is to predict similarity, and because I haven't found any dataset with a similarity score as a label, I decided to train the network on the LFW dataset on the face verification task. I used a pretrained VGG16 as a feature extractor to encode the images, then merged the two embeddings through some fully connected layers ending in a sigmoid (in effect I defined similarity as the probability of being the same person, so I could train on face verification). The problem is that the network seems to do well on the training set but gets very low accuracy on the test set (57% on test vs. 99% on train). So there are a few things I want to know: first, should I interpret this low accuracy as overfitting, given that my final goal is to estimate similarity rather than face verification, and similarity can differ from face verification? Second, if I should treat this as overfitting, how can I improve the test set accuracy?