Hi Ken_Jovan,
did you find a solution for your problem? I faced the same thing. If possible, could you share your experience?
I have encountered a similar problem with a multitask model for an audio event detection task with track-wise output:
from numpy.core.fromnumeric import shape
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.container import Sequential
from methods.utils.model_utilities_transfer import (Transfer_Cnn14, PositionalEncoding, init_layer) #( CustomCNN, OrthogonalConv2d,PositionalEncoding, DoubleCNN, init_layer)
import torchaudio
from methods.utils.transfer_doa import(Transfer_Cnn14_d)
class EINV2(nn.Module):
def __init__(self, cfg, dataset):
super().__init__()
self.pe_enable = False # True | False
self.in_channels= 4
self.in_channels_doa = 7
freeze_base = False
if cfg['data']['audio_feature'] == 'logmel&intensity':
self.f_bins = cfg['data']['n_mels']
# self.in_channels_doa = 7
# self.in_channels_sed = 4
self.downsample_ratio = 2 ** 2
self.sed = nn.Sequential(
Transfer_Cnn14(in_channels = 4, classes_num = 14, freeze_base = False), #nn.AvgPool2d(kernel_size=(2, 2)
nn.AvgPool2d(2, 2)
)
# self.sed = (Transfer_Cnn14(4, classes_num = 14, freeze_base = False),
# nn.AvgPool2d(kernel_size=(2, 2))
# )
self.doa= nn.Sequential(
Transfer_Cnn14_d(in_channels = 7, classes_num = 3, freeze_base = False),
nn.AvgPool2d(2, 2)
)
self.pe = PositionalEncoding(pos_len=100, d_model=2048, pe_type='t', dropout=0.0)
self.sed_trans_track1 = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model=2048, nhead=8, dim_feedforward=1024, dropout=0.2), num_layers=2)
self.sed_trans_track2 = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model=2048, nhead=8, dim_feedforward=1024, dropout=0.2), num_layers=2)
self.doa_trans_track1 = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model=2048, nhead=8, dim_feedforward=1024, dropout=0.2), num_layers=2)
self.doa_trans_track2 = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model=2048, nhead=8, dim_feedforward=1024, dropout=0.2), num_layers=2)
self.fc_sed_track1 = nn.Linear(1024, 14, bias=True)
self.fc_sed_track2 = nn.Linear(1024, 14, bias=True)
self.fc_doa_track1 = nn.Linear(1024, 3, bias=True)
self.fc_doa_track2 = nn.Linear(1024, 3, bias=True)
self.final_act_sed = nn.Sequential() # nn.Sigmoid()
self.final_act_doa = nn.Tanh()
self.init_weight()
for param in Transfer_Cnn14.parameters(self):
param.requires_grad = False
if freeze_base:
# Freeze AudioSet pretrained layers
for param in self.base.parameters():
param.requires_grad = False
self.init_weights()
for param in Transfer_Cnn14_d.parameters(self):
param.requires_grad = False
if freeze_base:
# Freeze AudioSet pretrained layers
for param in self.base.parameters():
param.requires_grad = False
self.init_weights()
# def init_weights(self):
# init_layer(self) #.fc_transfer
def load_from_pretrain(self, pretrained_checkpoint_path):
checkpoint = torch.load('/mnt/raid/ni/WALE_SEdl/EIN-SELD/Cnn14_DecisionLevelMax_mAP=0.385.pth') # pretrained_checkpoint_path
self.base.load_state_dict(checkpoint['model']) #model
def forward(self, input,mixup_lambda=None):
"""Input: (batch_size, data_length)
"""
output_dict = self.base(input, mixup_lambda)
embedding = output_dict['embedding']
def init_weight(self):
init_layer(self.fc_sed_track1)
init_layer(self.fc_sed_track2)
init_layer(self.fc_doa_track1)
init_layer(self.fc_doa_track2)
def forward(self, x):
"""
x: waveform, (batch_size, num_channels, data_length)
"""
x_sed = x[:, :4] #4
x_doa = x
# fc
x_sed_1 = self.final_act_sed(self.fc_sed_track1(x_sed)) #x_sed
x_sed_2 = self.final_act_sed(self.fc_sed_track2(x_sed))
x_sed = torch.stack((x_sed_1, x_sed_2), 2)
x_doa_1 = self.final_act_doa(self.fc_doa_track1(x_doa))
x_doa_2 = self.final_act_doa(self.fc_doa_track2(x_doa))
x_doa = torch.stack((x_doa_1, x_doa_2), 2)
output = {
'sed': x_sed,
'doa': x_doa,
}
return output
self.fc_sed_track1(x_sed) fails, as 1024 input features are expected while x_sed contains 256.
Thanks, but how can I resolve this issue? Please help. I have also observed that during model compilation the number of parameters is 0. What is actually wrong? Is there something odd about my model?
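Not knowing the rest of the pipeline, here is a minimal sketch of the usual fix for that mismatch: check the feature size of the activation right before the failing layer and make in_features match it, or project the activation up. The 256 and 1024 come from the description above; the other tensor shapes are made up.

import torch
import torch.nn as nn

# Stand-in for the activation reaching the SED head; only its last dim matters here.
x_sed = torch.randn(8, 10, 256)            # (batch, time, features) with 256 features
print(x_sed.shape[-1])                     # confirm the real feature size

fc_sed_track1_old = nn.Linear(1024, 14)    # as posted: expects 1024 features -> fails

# Fix 1: build the head for the features that actually arrive
fc_sed_track1 = nn.Linear(x_sed.shape[-1], 14)
print(fc_sed_track1(x_sed).shape)          # torch.Size([8, 10, 14])

# Fix 2: keep the 1024-feature head and project the activation first
proj = nn.Linear(256, 1024)
print(fc_sed_track1_old(proj(x_sed)).shape)  # torch.Size([8, 10, 14])

The "number of parameters = 0" observation may be a separate issue; the posted __init__ sets requires_grad = False on the backbone parameters, so a parameter count that filters on requires_grad would come out much smaller than expected, but that is hard to confirm from the snippet alone.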
I also have the same problem, can you please help me with this? Here is my code:
class VisionTransformer(nn.Module):
def __init__(self, num_classes):
super(VisionTransformer, self).__init__()
# Define your ViT model architecture here
def forward(self, x):
# Implement the forward pass of the ViT model here
return x
class CNN(nn.Module):
def __init__(self, num_classes):
super(CNN, self).__init__()
# Define your CNN model architecture here
def forward(self, x):
# Implement the forward pass of the CNN model here
return x
batch_size = 2
feature_size = 150528
out_features = 5 # Number of classes
class CervicalCancerClassifier(nn.Module):
def __init__(self):
super(CervicalCancerClassifier, self).__init__()
self.vit = VisionTransformer(num_classes=out_features)
self.cnn = CNN(num_classes=out_features)
self.fc = nn.Linear(feature_size, out_features)
def forward(self, x):
x = self.vit(x)
x = self.cnn(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
here is the error:
You didn’t post the error message but I assume you are seeing a shape mismatch which would point to the linear layer. Check the activation shape as well as the features the linear layer expects and adapt the latter.
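As a minimal sketch of that debugging step (the module and sizes here are invented, not from the post): print the shape of the flattened activation entering the linear layer and size the layer from it, or let nn.LazyLinear infer in_features from the first batch.

import torch
import torch.nn as nn

class SmallHead(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        # LazyLinear infers in_features from the first batch it sees,
        # which avoids hard-coding a guessed feature size.
        self.fc = nn.LazyLinear(num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)                 # flatten everything but the batch dim
        print("features entering fc:", x.shape[1])
        return self.fc(x)

model = SmallHead()
out = model(torch.randn(2, 3, 224, 224))  # fc materializes as Linear(150528, 5);
                                          # 3*224*224 = 150528, the feature_size in the post
print(out.shape)                          # torch.Size([2, 5])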
here is the error:
RuntimeError: Given groups=1, weight of size [64, 768, 3, 3], expected input[32, 1000, 1, 1] to have 768 channels, but got 1000 channels instead
and here is the updated code:
desired_output_channels = 12
transformer_output_size = 768
additional_conv = nn.Conv2d(in_channels=768, out_channels=64, kernel_size=3, stride=1, padding=1)
additional_fc = nn.Linear(in_features=64, out_features=desired_output_channels)
class ExtendedViTModel(nn.Module):
def __init__(self, vit_model, additional_conv, additional_fc):
super(ExtendedViTModel, self).__init__()
self.vit_model = vit_model
self.additional_conv = additional_conv
self.additional_fc = additional_fc
def forward(self, x):
# ViT forward pass
vit_output = self.vit_model(x)
# Additional convolutional layer
vit_output = vit_output.unsqueeze(-1).unsqueeze(-1) # Add height and width dimensions
model.additional_conv.out_channels = 1000
conv_output = self.additional_conv(vit_output)
conv_output = nn.functional.adaptive_avg_pool2d(conv_output, (1, 1))
# Flatten
conv_output = conv_output.view(conv_output.size(0), -1)
# Additional fully connected layer
fc_output = self.additional_fc(conv_output)
return fc_output
pretrained_vit = ExtendedViTModel(pretrained_vit, additional_conv, additional_fc).to(device)
print(pretrained_vit)
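The error above says the ViT output has 1000 channels (its classification logits), while additional_conv was built with in_channels=768; also note that assigning model.additional_conv.out_channels inside forward does not change the existing conv weights. A hedged sketch of the two usual fixes, using torchvision's vit_b_16 as a stand-in for the unnamed pretrained_vit:

import torch
import torch.nn as nn
from torchvision.models import vit_b_16

# Option 1: keep the conv at in_channels=768 and make the ViT return its
# 768-dim embedding by dropping the classification head.
vit = vit_b_16(weights=None)
vit.heads = nn.Identity()
feat = vit(torch.randn(2, 3, 224, 224))          # (2, 768)
conv = nn.Conv2d(768, 64, kernel_size=3, padding=1)
out = conv(feat.unsqueeze(-1).unsqueeze(-1))     # (2, 64, 1, 1)

# Option 2: leave the classifier head in place and build the conv for
# 1000 input channels instead of mutating .out_channels in forward().
vit2 = vit_b_16(weights=None)
logits = vit2(torch.randn(2, 3, 224, 224))       # (2, 1000)
conv2 = nn.Conv2d(1000, 64, kernel_size=3, padding=1)
out2 = conv2(logits.unsqueeze(-1).unsqueeze(-1)) # (2, 64, 1, 1)
print(out.shape, out2.shape)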
for i in range(0, 2034, 16):
for j in range(0, 2003, 16):
for k in range(0, 8182, 16):
sliced_data = crystal[i:i+16, j:j+16, k:k+16,:]
tensor_data=torch.as_tensor(sliced_data) #convert to tensor
tensor_data=torch.permute(tensor_data, (3, 0, 1, 2))
Y=model(tensor_data.float()) # pass through model
Error:
RuntimeError Traceback (most recent call last)
/scratch/14940477/ipykernel_1076915/2486182164.py in
5 tensor_data=torch.as_tensor(sliced_data) #convert to tensor
6 tensor_data=torch.permute(tensor_data, (3, 0, 1, 2))
----> 7 Y=model(tensor_data.float()) # pass through model
8 #vac conc
9 #x=i*vxl_size_x
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/vacancy_reconstruction/__init__.py in forward(self, x)
127
128 elif self.reconstruction_mode == ReconstructionMode.COUNTS:
→ 129 return self.head(z[-1]) # Dense network applied to latent features
130
131 else:
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/container.py in forward(self, input)
213 def forward(self, input):
214 for module in self:
→ 215 input = module(input)
216 return input
217
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/torch/nn/modules/linear.py in forward(self, input)
112
113 def forward(self, input: Tensor) → Tensor:
→ 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) → str:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (6x512 and 3072x4)
Your code is not properly formatted and hard to read. However, it seems the shape mismatch is raised in:
/projects/academic/kreyes3/AtomicDL/YZTDL/model_training/my_env./lib/python3.9/site-packages/vacancy_reconstruction/__init__.py in forward(self, x)
127
128 elif self.reconstruction_mode == ReconstructionMode.COUNTS:
→ 129 return self.head(z[-1]) # Dense network applied to latent features
so you might want to check which input self.head expects and why z does not fit.
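A small sketch of that check with made-up stand-ins (the real head and z come from the vacancy_reconstruction package): compare the last dimension of z[-1] with head.in_features and reconcile the two.

import torch
import torch.nn as nn

z_last = torch.randn(6, 512)      # matches the "6x512" side of the error
head = nn.Linear(3072, 4)         # matches the "3072x4" side of the error

print(z_last.shape[-1], head.in_features)   # 512 vs 3072 -> mismatch

# Either rebuild the head for the features that actually arrive ...
head = nn.Linear(z_last.shape[-1], 4)
print(head(z_last).shape)         # torch.Size([6, 4])

# ... or keep the 3072-feature head and check upstream why the latent is
# smaller than expected (e.g. the 16x16x16 slices may not be the input size
# the pretrained network was built for).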
Here I am providing a few chunks of my code where it gives an error regarding matrix multiplication. I am new here; I am trying but unable to figure out a solution. If possible, please help me out.
BP Weight_model start
Weight_classifier(
(weight_layer): MaskedLinear(in_features=215, out_features=215, bias=True)
(outlayer): Linear(in_features=215, out_features=215, bias=True)
)
batch_size_8,learning_rate_0.01,epoch_times_1
Traceback (most recent call last):
File "/home/bvs/neelam/input_ourmodel/input/4valid.py", line 1012, in <module>
validation(Terms[0], 5)
File "/home/bvs/neelam/input_ourmodel/input/4valid.py", line 994, in validation
each_fold_scores = Main(train_set, test_set, func=func)
File "/home/bvs/neelam/input_ourmodel/input/4valid.py", line 884, in Main
out = weight_model(weight_features)
File "/home/bvs/miniconda3/envs/crisprcasfinder/envs/envML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/bvs/neelam/input_ourmodel/input/4valid.py", line 314, in forward
weight_out = self.weight_layer(weight_features)
File "/home/bvs/miniconda3/envs/crisprcasfinder/envs/envML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/bvs/neelam/input_ourmodel/input/4valid.py", line 331, in forward
return F.linear(input, masked_weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x645 and 215x215)
Here I am providing a link to all the input files that I am using in this model (4valid.py): GitHub - neelam19051/DLmodel
Thank you so much!
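Without running 4valid.py this is only a guess, but the error itself is readable: each sample carries 645 features while the weight_layer was built as (215, 215), and 645 is exactly 3 * 215. A hedged sketch with a plain nn.Linear standing in for MaskedLinear:

import torch
import torch.nn as nn

weight_features = torch.randn(8, 645)        # matches "mat1 ... 8x645"
weight_layer = nn.Linear(215, 215)           # stand-in for MaskedLinear(215, 215)

print(weight_features.shape[-1], weight_layer.in_features)  # 645 vs 215

# If the features really are three 215-dim blocks concatenated upstream,
# the layer needs in_features=645 ...
weight_layer = nn.Linear(weight_features.shape[-1], 215)
print(weight_layer(weight_features).shape)   # torch.Size([8, 215])

# ... otherwise the feature construction in 4valid.py should be checked so
# that a single 215-dim block reaches the layer.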
Double post from here.
Can you please help me solve this? I have been trying for the last three days.
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
class MFB(nn.Module):
def __init__(self,img_feat_size, ques_feat_size,is_first, MFB_K, MFB_O, DROPOUT_R):
super(MFB, self).__init__()
#self.__C = __C
self.MFB_K = MFB_K
self.MFB_O = MFB_O
self.DROPOUT_R = DROPOUT_R
self.is_first = is_first
self.proj_i = nn.Linear(img_feat_size, MFB_K * MFB_O)
self.proj_q = nn.Linear(ques_feat_size, MFB_K * MFB_O)
self.dropout = nn.Dropout(DROPOUT_R)
self.pool = nn.AvgPool1d(MFB_K, stride = MFB_K)
def forward(self, img_feat, ques_feat,exp_in=1):
batch_size = img_feat.shape[0]
img_feat = self.proj_i(img_feat) # (N, C, K*O)
ques_feat = self.proj_q(ques_feat) # (N, 1, K*O)
exp_out = img_feat * ques_feat # (N, C, K*O)
exp_out = self.dropout(exp_out) if self.is_first else self.dropout(exp_out * exp_in) # (N, C, K*O)
z = self.pool(exp_out) * self.MFB_K # (N, C, O)
z = torch.sqrt(F.relu(z)) - torch.sqrt(F.relu(-z))
z = F.normalize(z.view(batch_size, -1)) # (N, C*O)
z = z.view(batch_size, -1, self.MFB_O) # (N, C, O)
return z
class Classifier(pl.LightningModule):
def __init__(self):
super().__init__()
self.MFB = MFB(512,768,True,256,64,0.1)
self.loss_fn_emotion=torch.nn.KLDivLoss(reduction='batchmean',log_target=True)
self.fin_y_shape = torch.nn.Linear(768,512)
self.fin_old = torch.nn.Linear(64,2)
self.fin = torch.nn.Linear(16 * 768, 64)
self.fin_persuasive = torch.nn.Linear(16 * 768, 64)
self.fin_e1 = torch.nn.Linear(16 * 768, 64)
self.fin_e2 = torch.nn.Linear(16 * 768, 64)
self.fin_e3 = torch.nn.Linear(16 * 768, 64)
self.fin_e4 = torch.nn.Linear(16 * 768, 64)
self.fin_e5 = torch.nn.Linear(16 * 768, 64)
self.fin_e6 = torch.nn.Linear(16 * 768, 64)
self.fin_e7 = torch.nn.Linear(16 * 768, 64)
self.fin_e8 = torch.nn.Linear(16 * 768, 64)
self.fin_e9 = torch.nn.Linear(16 * 768, 64)
self.validation_step_outputs = []
self.test_step_outputs = []
def forward(self, x,y,rag):
x_,y_,rag_ = x,y,rag
print("x.shape", x.shape)
z = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(x, axis=1))
#cross_attention= (rag and x/y)
z_new = torch.squeeze(z, dim=1)
c = self.fin_old(z_new)
c_e1 = self.fin_e1(torch.squeeze(z,dim=1))
c_v = self.fin_persuasive(torch.squeeze(z,dim=1))
c_e2 = self.fin_e2(torch.squeeze(z,dim=1))
c_e3 = self.fin_e3(torch.squeeze(z,dim=1))
c_e4 = self.fin_e4(torch.squeeze(z,dim=1))
c_e5 = self.fin_e5(torch.squeeze(z,dim=1))
c_e6 = self.fin_e6(torch.squeeze(z,dim=1))
c_e7 = self.fin_e7(torch.squeeze(z,dim=1))
c_e8 = self.fin_e8(torch.squeeze(z,dim=1))
c_e9 = self.fin_e9(torch.squeeze(z,dim=1))
c = torch.log_softmax(c, dim=1)
c_v = torch.log_softmax(c_v, dim=1)
c_e1 = torch.log_softmax(c_e1, dim=1)
c_e2 = torch.log_softmax(c_e2, dim=1)
c_e3 = torch.log_softmax(c_e3, dim=1)
c_e4 = torch.log_softmax(c_e4, dim=1)
c_e5 = torch.log_softmax(c_e5, dim=1)
c_e6 = torch.log_softmax(c_e6, dim=1)
c_e7 = torch.log_softmax(c_e7, dim=1)
c_e8 = torch.log_softmax(c_e8, dim=1)
c_e9 = torch.log_softmax(c_e9, dim=1)
return z,c,c_v,c_e1,c_e2,c_e3,c_e4,c_e5,c_e6,c_e7,c_e8,c_e9
def cross_entropy_loss(self, logits, labels):
return F.nll_loss(logits, labels)
def training_step(self, train_batch, batch_idx):
lab,txt,rag,img,name,perin,per,iro,alli,ana,inv,meta,puns,sat,hyp= train_batch
lab = train_batch[lab]
#print(lab)
name= train_batch[name]
txt = train_batch[txt]
rag = train_batch[rag]
img = train_batch[img]
perin = train_batch[perin]
per = train_batch[per]
iro= train_batch[iro]
alli = train_batch[alli]
ana = train_batch[ana]
inv = train_batch[inv]
meta = train_batch[meta]
puns = train_batch[puns]
sat = train_batch[sat]
hyp = train_batch[hyp]
gt_emotion = torch.cat((torch.unsqueeze(per,1),torch.unsqueeze(iro,1),torch.unsqueeze(alli,1),\
torch.unsqueeze(ana,1),torch.unsqueeze(inv,1),torch.unsqueeze(meta,1),\
torch.unsqueeze(puns,1),torch.unsqueeze(sat,1),torch.unsqueeze(hyp,1)),1)
z,logit_offen,logit_perin,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag) # logit_target is logits of target
# logit_offen= self.forward(txt,img,rag)
loss23=self.cross_entropy_loss(logit_perin,perin)
loss1 = self.cross_entropy_loss(logit_offen, lab)
loss2 = self.cross_entropy_loss(a,per)
loss3 = self.cross_entropy_loss(b,iro)
loss4 = self.cross_entropy_loss(c, alli)
loss5 = self.cross_entropy_loss(d,ana)
loss6 = self.cross_entropy_loss(e,inv)
loss7 = self.cross_entropy_loss(f,meta)
loss8 = self.cross_entropy_loss(g,puns)
loss9 = self.cross_entropy_loss(h,sat)
loss10 = self.cross_entropy_loss(i,hyp)
# loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 +loss9 + loss10
loss_emo_mult = F.binary_cross_entropy_with_logits(gt_emotion.float())
loss=loss1+loss_emo_mult
self.log('train_loss', loss)
return loss
def validation_step(self, val_batch, batch_idx):
lab,txt,rag,img,name,perin,per,iro,alli,ana,inv,meta,puns,sat,hyp = val_batch
lab = val_batch[lab]
#print(lab)
txt = val_batch[txt]
rag = val_batch[rag]
img = val_batch[img]
name = val_batch[name]
perin = val_batch[perin]
per = val_batch[per]
iro = val_batch[iro]
alli = val_batch[alli]
ana = val_batch[ana]
inv = val_batch[inv]
meta = val_batch[meta]
puns = val_batch[puns]
sat = val_batch[sat]
hyp = val_batch[hyp]
gt_emotion = torch.cat((torch.unsqueeze(per,1),torch.unsqueeze(iro,1),torch.unsqueeze(alli,1),\
torch.unsqueeze(ana,1),torch.unsqueeze(inv,1),torch.unsqueeze(meta,1),\
torch.unsqueeze(puns,1),torch.unsqueeze(sat,1),torch.unsqueeze(hyp,1)),1)
logits,logit_perin,a,b,c,d,e,f,g,h,i = self.forward(txt,img,rag)
# logits= self.forward(txt,img,rag)
logits=logits.float()
tmp = np.argmax(logits.detach().cpu().numpy(),axis=-1)
loss = self.cross_entropy_loss(logits, lab)
lab = lab.detach().cpu().numpy()
self.log('val_acc', accuracy_score(lab,tmp))
self.log('val_roc_auc',roc_auc_score(lab,tmp))
self.log('val_loss', loss)
tqdm_dict = {'val_acc': accuracy_score(lab,tmp)}
self.validation_step_outputs.append({'progress_bar': tqdm_dict,'val_f1 offensive': f1_score(lab,tmp,average='macro')})
return {
'progress_bar': tqdm_dict,
'val_f1 offensive': f1_score(lab,tmp,average='macro'),
'val_f1 personification': f1_score(per,tmp,average='macro'),
'val_f1 irony': f1_score(iro,tmp,average='macro'),
'val_f1 alliteration': f1_score(alli,tmp,average='macro'),
'val_f1 analogies': f1_score(ana,tmp,average='macro'),
'val_f1 invective': f1_score(inv,tmp,average='macro'),
'val_f1 metaphor': f1_score(meta,tmp,average='macro'),
'val_f1 punsandplay': f1_score(puns,tmp,average='macro'),
'val_f1 satire': f1_score(sat,tmp,average='macro'),
'val_f1 hyperboles': f1_score(hyp,tmp,average='macro')
}
def on_validation_epoch_end(self):
outs = []
outs14=[]
for out in self.validation_step_outputs:
outs.append(out['progress_bar']['val_acc'])
outs14.append(out['val_f1 offensive'])
self.log('val_acc_all_offn', sum(outs)/len(outs))
self.log('val_f1 offensive', sum(outs14)/len(outs14))
print(f'***val_acc_all_offn at epoch end {sum(outs)/len(outs)}****')
print(f'***val_f1 offensive at epoch end {sum(outs14)/len(outs14)}****')
self.validation_step_outputs.clear()
def test_step(self, batch, batch_idx):
# lab,txt,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12, e13,e14, e15,e16,img,name= batch
lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp= batch
lab = batch[lab]
#print(lab)
rag = batch[rag]
txt = batch[txt]
img = batch[img]
name = batch[name]
per = batch[per]
iro = batch[iro]
alli = batch[alli]
ana = batch[ana]
inv = batch[inv]
meta = batch[meta]
puns = batch[puns]
sat = batch[sat]
hyp = batch[hyp]
gt_emotion = torch.cat((torch.unsqueeze(e1,1),torch.unsqueeze(e2,1),torch.unsqueeze(e3,1),torch.unsqueeze(e4,1),torch.unsqueeze(e5,1),torch.unsqueeze(e6,1),\
torch.unsqueeze(e7,1),torch.unsqueeze(e8,1),torch.unsqueeze(e9,1)),1)
_,logits,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)
logits = logits.float()
tmp = np.argmax(logits.detach().cpu().numpy(force=True),axis=-1)
loss = self.cross_entropy_loss(logits, lab)
lab = lab.detach().cpu().numpy()
self.log('test_acc', accuracy_score(lab,tmp))
self.log('test_roc_auc',roc_auc_score(lab,tmp))
self.log('test_loss', loss)
tqdm_dict = {'test_acc': accuracy_score(lab,tmp)}
self.test_step_outputs.append({'progress_bar': tqdm_dict,'test_acc': accuracy_score(lab,tmp), 'test_f1_score': f1_score(lab,tmp,average='macro')})
return {
'progress_bar': tqdm_dict,
'test_acc': accuracy_score(lab,tmp),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro'),
'test_f1_score': f1_score(lab,tmp,average='macro')
}
def on_test_epoch_end(self):
# OPTIONAL
outs = []
outs1,outs2,outs3,outs4,outs5,outs6,outs7,outs8,outs9,outs10,outs11,outs12,outs13,outs14 = \
[],[],[],[],[],[],[],[],[],[],[],[],[],[]
for out in self.test_step_outputs:
outs.append(out['test_acc'])
outs2.append(out['test_f1_score'])
self.log('test_acc', sum(outs)/len(outs))
self.log('test_f1_score', sum(outs2)/len(outs2))
self.test_step_outputs.clear()
def configure_optimizers(self):
# optimizer = torch.optim.Adam(self.parameters(), lr=3e-2)
optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
return optimizer
"""
Main Model:
Initialize
Forward Pass
Training Step
Validation Step
Testing Step
Pp
"""
class HmDataModule(pl.LightningDataModule):
def setup(self, stage):
self.hm_train = t_p
self.hm_val = v_p
# self.hm_test = test
self.hm_test = te_p
def train_dataloader(self):
return DataLoader(self.hm_train, batch_size=20, drop_last=True)
def val_dataloader(self):
return DataLoader(self.hm_val, batch_size=20, drop_last=True)
def test_dataloader(self):
return DataLoader(self.hm_test, batch_size=20, drop_last=True)
data_module = HmDataModule()
checkpoint_callback = ModelCheckpoint(
monitor='val_acc_all_offn',
dirpath='mrinal/',
filename='epoch{epoch:02d}-val_f1_all_offn{val_acc_all_offn:.2f}',
auto_insert_metric_name=False,
save_top_k=1,
mode="max",
)
all_callbacks = []
all_callbacks.append(checkpoint_callback)
# train
from pytorch_lightning import seed_everything
seed_everything(42, workers=True)
hm_model = Classifier()
gpus=1
#if torch.cuda.is_available():gpus=0
trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
trainer.fit(hm_model, data_module)
RuntimeError Traceback (most recent call last)
in <cell line: 285>()
283 #if torch.cuda.is_available():gpus=0
284 trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
→ 285 trainer.fit(hm_model, data_module)
14 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
112
113 def forward(self, input: Tensor) → Tensor:
→ 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) → str:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (20x64 and 12288x64)
I am not getting what I am doing wrong.
It seems z should have a shape of [batch_size, -1, 64] here:
z = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(x, axis=1))
based on:
z = z.view(batch_size, -1, self.MFB_O) # (N, C, O)
return z
and will thus use 64 input features (dim1 is squeezed later). The self.fin_eX layers however expect an activation input with 16 * 768 input features and will thus fail.
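A minimal sketch of that mismatch, reusing the sizes from the reply above (whether the heads should take 64 features or z should be flattened differently is a modelling choice the thread does not settle):

import torch
import torch.nn as nn

# MFB with MFB_O=64 returns (N, C, 64); after squeezing dim=1 the heads see
# a tensor whose last dimension is 64, not 16 * 768.
z = torch.randn(20, 1, 64)             # batch_size 20, as in the error (20x64)
z_new = torch.squeeze(z, dim=1)        # (20, 64)

fin_e1_old = nn.Linear(16 * 768, 64)   # what the post defines -> shape mismatch
fin_e1_new = nn.Linear(64, 64)         # matches the 64 features that actually arrive

print(fin_e1_new(z_new).shape)         # torch.Size([20, 64])
# fin_e1_old(z_new) would raise:
# mat1 and mat2 shapes cannot be multiplied (20x64 and 12288x64)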
Sir, how should I solve this? Can you please tell me a little bit more?
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from typing import Union
class SineReLU(nn.Module):
def forward(self, x):
return torch.sin(F.relu(x))
class HighwayNetwork(nn.Module):
def __init__(self, size):
super().__init__()
self.W1 = nn.Linear(size, size)
self.W2 = nn.Linear(size, size)
self.W1.bias.data.fill_(0.)
def forward(self, x):
x1 = self.W1(x)
x2 = self.W2(x)
g = torch.sigmoid(x2)
# Replace F.relu(x1) with SineReLU in the HighwayNetwork class
y = g * SineReLU()(x1) + (1. - g) * x
return y
class Encoder(nn.Module):
def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
super().__init__()
prenet_dims = (encoder_dims, encoder_dims)
cbhg_channels = encoder_dims
self.embedding = nn.Embedding(num_chars, embed_dims)
self.pre_net = PreNet(embed_dims, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
dropout=dropout)
self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
proj_channels=[cbhg_channels, cbhg_channels],
num_highways=num_highways)
def forward(self, x, speaker_embedding=None):
x = self.embedding(x)
x = self.pre_net(x)
x.transpose_(1, 2)
x = self.cbhg(x)
if speaker_embedding is not None:
x = self.add_speaker_embedding(x, speaker_embedding)
return x
def add_speaker_embedding(self, x, speaker_embedding):
# SV2TTS
# The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
# When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
# (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
# This concats the speaker embedding for each char in the encoder output
# Save the dimensions as human-readable names
batch_size = x.size()[0]
num_chars = x.size()[1]
if speaker_embedding.dim() == 1:
idx = 0
else:
idx = 1
# Start by making a copy of each speaker embedding to match the input text length
# The output of this has size (batch_size, num_chars * tts_embed_dims)
speaker_embedding_size = speaker_embedding.size()[idx]
e = speaker_embedding.repeat_interleave(num_chars, dim=idx)
# Reshape it and transpose
e = e.reshape(batch_size, speaker_embedding_size, num_chars)
e = e.transpose(1, 2)
# Concatenate the tiled speaker embedding with the encoder output
x = torch.cat((x, e), 2)
return x
class BatchNormConv(nn.Module):
def __init__(self, in_channels, out_channels, kernel, relu=True):
super().__init__()
self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
self.bnorm = nn.BatchNorm1d(out_channels)
self.relu = SineReLU()
def forward(self, x):
x = self.conv(x)
x = F.relu(x) if self.relu is True else x
return self.bnorm(x)
class CBHG(nn.Module):
def __init__(self, K, in_channels, channels, proj_channels, num_highways):
super().__init__()
# List of all rnns to call `flatten_parameters()` on
self._to_flatten = []
self.bank_kernels = [i for i in range(1, K + 1)]
self.conv1d_bank = nn.ModuleList()
for k in self.bank_kernels:
conv = BatchNormConv(in_channels, channels, k)
self.conv1d_bank.append(conv)
self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)
# Fix the highway input if necessary
if proj_channels[-1] != channels:
self.highway_mismatch = True
self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
else:
self.highway_mismatch = False
self.highways = nn.ModuleList()
for i in range(num_highways):
hn = HighwayNetwork(channels)
self.highways.append(hn)
self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
self._to_flatten.append(self.rnn)
# Avoid fragmentation of RNN parameters and associated warning
self._flatten_parameters()
def forward(self, x):
# Although we `_flatten_parameters()` on init, when using DataParallel
# the model gets replicated, making it no longer guaranteed that the
# weights are contiguous in GPU memory. Hence, we must call it again
self._flatten_parameters()
# Save these for later
residual = x
seq_len = x.size(-1)
conv_bank = []
# Convolution Bank
for conv in self.conv1d_bank:
c = conv(x) # Convolution
conv_bank.append(c[:, :, :seq_len])
# Stack along the channel axis
conv_bank = torch.cat(conv_bank, dim=1)
# dump the last padding to fit residual
x = self.maxpool(conv_bank)[:, :, :seq_len]
# Conv1d projections
x = self.conv_project1(x)
x = self.conv_project2(x)
# Residual Connect
x = x + residual
# Through the highways
x = x.transpose(1, 2)
if self.highway_mismatch is True:
x = self.pre_highway(x)
for h in self.highways: x = h(x)
# And then the RNN
x, _ = self.rnn(x)
return x
def _flatten_parameters(self):
"""Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
to improve efficiency and avoid PyTorch yelling at us."""
[m.flatten_parameters() for m in self._to_flatten]
class PreNet(nn.Module):
def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
super().__init__()
self.fc1 = nn.Linear(in_dims, fc1_dims)
self.fc2 = nn.Linear(fc1_dims, fc2_dims)
self.p = dropout
def forward(self, x):
x = self.fc1(x)
x = F.relu(x)
x = F.dropout(x, self.p, training=True)
x = self.fc2(x)
x = F.relu(x)
x = F.dropout(x, self.p, training=True)
return x
class Attention(nn.Module):
def __init__(self, attn_dims):
super().__init__()
self.W = nn.Linear(attn_dims, attn_dims, bias=False)
self.v = nn.Linear(attn_dims, 1, bias=False)
def forward(self, encoder_seq_proj, query, t):
# print(encoder_seq_proj.shape)
# Transform the query vector
query_proj = self.W(query).unsqueeze(1)
# Compute the scores
u = self.v(torch.tanh(encoder_seq_proj + query_proj))
scores = F.softmax(u, dim=1)
return scores.transpose(1, 2)
class LSA(nn.Module):
def __init__(self, attn_dim, kernel_size=31, filters=32):
super().__init__()
self.conv = nn.Conv1d(1, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=True)
self.L = nn.Linear(filters, attn_dim, bias=False)
self.W = nn.Linear(attn_dim, attn_dim, bias=True) # Include the attention bias in this term
self.v = nn.Linear(attn_dim, 1, bias=False)
self.cumulative = None
self.attention = None
def init_attention(self, encoder_seq_proj):
device = next(self.parameters()).device # use same device as parameters
b, t, c = encoder_seq_proj.size()
self.cumulative = torch.zeros(b, t, device=device)
self.attention = torch.zeros(b, t, device=device)
def forward(self, encoder_seq_proj, query, t, chars):
if t == 0: self.init_attention(encoder_seq_proj)
processed_query = self.W(query).unsqueeze(1)
location = self.cumulative.unsqueeze(1)
processed_loc = self.L(self.conv(location).transpose(1, 2))
u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
u = u.squeeze(-1)
# Mask zero padding chars
u = u * (chars != 0).float()
# Smooth Attention
# scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
scores = F.softmax(u, dim=1)
self.attention = scores
self.cumulative = self.cumulative + self.attention
return scores.unsqueeze(-1).transpose(1, 2)
class Decoder(nn.Module):
# Class variable because its value doesn't change between classes
# yet ought to be scoped by class because it's a property of a Decoder
max_r = 20
def __init__(self, n_mels, encoder_dims, decoder_dims, lstm_dims,
dropout, speaker_embedding_size):
super().__init__()
self.register_buffer("r", torch.tensor(1, dtype=torch.int))
self.n_mels = n_mels
prenet_dims = (decoder_dims * 2, decoder_dims * 2)
self.prenet = PreNet(n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
dropout=dropout)
self.attn_net = LSA(decoder_dims)
self.attn_rnn = nn.GRUCell(encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims)
self.rnn_input = nn.Linear(encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims)
self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)
def zoneout(self, prev, current, p=0.1):
device = next(self.parameters()).device # Use same device as parameters
mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
return prev * mask + current * (1 - mask)
def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
hidden_states, cell_states, context_vec, t, chars):
# Need this for reshaping mels
batch_size = encoder_seq.size(0)
# Unpack the hidden and cell states
attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
rnn1_cell, rnn2_cell = cell_states
# PreNet for the Attention RNN
prenet_out = self.prenet(prenet_in)
# Compute the Attention RNN hidden state
attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
# Compute the attention scores
scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)
# Dot product to create the context vector
context_vec = scores @ encoder_seq
context_vec = context_vec.squeeze(1)
# Concat Attention RNN output w. Context Vector & project
x = torch.cat([context_vec, attn_hidden], dim=1)
x = self.rnn_input(x)
# Compute first Residual RNN
rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
if self.training:
rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
else:
rnn1_hidden = rnn1_hidden_next
x = x + rnn1_hidden
# Compute second Residual RNN
rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
if self.training:
rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
else:
rnn2_hidden = rnn2_hidden_next
x = x + rnn2_hidden
# Project Mels
mels = self.mel_proj(x)
mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
cell_states = (rnn1_cell, rnn2_cell)
# Stop token prediction
s = torch.cat((x, context_vec), dim=1)
s = self.stop_proj(s)
stop_tokens = torch.sigmoid(s)
return mels, scores, hidden_states, cell_states, context_vec, stop_tokens
class Tacotron(nn.Module):
def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K, num_highways,
dropout, stop_threshold, speaker_embedding_size):
super().__init__()
self.n_mels = n_mels
self.lstm_dims = lstm_dims
self.encoder_dims = encoder_dims
self.decoder_dims = decoder_dims
self.speaker_embedding_size = speaker_embedding_size
self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
encoder_K, num_highways, dropout)
self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
dropout, speaker_embedding_size)
self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
[postnet_dims, fft_bins], num_highways)
self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)
self.init_model()
self.num_params()
self.register_buffer("step", torch.zeros(1, dtype=torch.long))
self.register_buffer("stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32))
@property
def r(self):
return self.decoder.r.item()
@r.setter
def r(self, value):
self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
def forward(self, x, m, speaker_embedding):
device = next(self.parameters()).device # use same device as parameters
self.step += 1
batch_size, _, steps = m.size()
# Initialise all hidden states and pack into tuple
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
# Initialise all lstm cell states and pack into tuple
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
cell_states = (rnn1_cell, rnn2_cell)
# <GO> Frame for start of decoder loop
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
# Need an initial context vector
context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
# SV2TTS: Run the encoder with the speaker embedding
# The projection avoids unnecessary matmuls in the decoder loop
encoder_seq = self.encoder(x, speaker_embedding)
print("Encoder sequence shape:", encoder_seq.shape)
encoder_seq_proj = self.encoder_proj(encoder_seq)
print(encoder_seq.shape, self.encoder_proj.weight.shape, self.encoder_proj.bias.shape)
# Need a couple of lists for outputs
mel_outputs, attn_scores, stop_outputs = [], [], []
# Run the decoder loop
for t in range(0, steps, self.r):
prenet_in = m[:, :, t - 1] if t > 0 else go_frame
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
hidden_states, cell_states, context_vec, t, x)
mel_outputs.append(mel_frames)
attn_scores.append(scores)
stop_outputs.extend([stop_tokens] * self.r)
# Concat the mel outputs into sequence
mel_outputs = torch.cat(mel_outputs, dim=2)
# Post-Process for Linear Spectrograms
postnet_out = self.postnet(mel_outputs)
linear = self.post_proj(postnet_out)
linear = linear.transpose(1, 2)
# For easy visualisation
attn_scores = torch.cat(attn_scores, 1)
# attn_scores = attn_scores.cpu().data.numpy()
stop_outputs = torch.cat(stop_outputs, 1)
return mel_outputs, linear, attn_scores, stop_outputs
def generate(self, x, speaker_embedding=None, steps=2000):
self.eval()
device = next(self.parameters()).device # use same device as parameters
batch_size, _ = x.size()
# Need to initialise all hidden states and pack into tuple for tidyness
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
# Need to initialise all lstm cell states and pack into tuple for tidyness
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
cell_states = (rnn1_cell, rnn2_cell)
# Need a <GO> Frame for start of decoder loop
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
# Need an initial context vector
context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
# SV2TTS: Run the encoder with the speaker embedding
# The projection avoids unnecessary matmuls in the decoder loop
encoder_seq = self.encoder(x, speaker_embedding)
encoder_seq_proj = self.encoder_proj(encoder_seq)
# Need a couple of lists for outputs
mel_outputs, attn_scores, stop_outputs = [], [], []
# Run the decoder loop
for t in range(0, steps, self.r):
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
hidden_states, cell_states, context_vec, t, x)
mel_outputs.append(mel_frames)
attn_scores.append(scores)
stop_outputs.extend([stop_tokens] * self.r)
# Stop the loop when all stop tokens in batch exceed threshold
if (stop_tokens > 0.5).all() and t > 10: break
# Concat the mel outputs into sequence
mel_outputs = torch.cat(mel_outputs, dim=2)
# Post-Process for Linear Spectrograms
postnet_out = self.postnet(mel_outputs)
linear = self.post_proj(postnet_out)
linear = linear.transpose(1, 2)
# For easy visualisation
attn_scores = torch.cat(attn_scores, 1)
stop_outputs = torch.cat(stop_outputs, 1)
self.train()
return mel_outputs, linear, attn_scores
def init_model(self):
for p in self.parameters():
if p.dim() > 1: nn.init.xavier_uniform_(p)
def get_step(self):
return self.step.data.item()
def reset_step(self):
# assignment to parameters or buffers is overloaded, updates internal dict entry
self.step = self.step.data.new_tensor(1)
def log(self, path, msg):
with open(path, "a") as f:
print(msg, file=f)
def load(self, path, optimizer=None):
# Use device of model params as location for loaded state
device = next(self.parameters()).device
checkpoint = torch.load(str(path), map_location=device)
self.load_state_dict(checkpoint["model_state"])
if "optimizer_state" in checkpoint and optimizer is not None:
optimizer.load_state_dict(checkpoint["optimizer_state"])
def save(self, path, optimizer=None):
if optimizer is not None:
torch.save({
"model_state": self.state_dict(),
"optimizer_state": optimizer.state_dict(),
}, str(path))
else:
torch.save({
"model_state": self.state_dict(),
}, str(path))
def num_params(self, print_out=True):
parameters = filter(lambda p: p.requires_grad, self.parameters())
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
if print_out:
print("Trainable Parameters: %.3fM" % parameters)
return parameters
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1788x320 and 512x128)
Can someone guide me with this error?
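It is hard to tell from the snippet alone which Linear produces the (1788x320 and 512x128) mismatch, so here is a generic, hedged debugging sketch: a forward pre-hook that prints what every nn.Linear receives, which makes the layer built for 512 features that only gets 320 easy to spot (a common cause in this Tacotron-style code is encoder_dims and speaker_embedding_size not adding up to what a projection layer was constructed with).

import torch
import torch.nn as nn

def watch_linear_inputs(model):
    """Debugging aid: print the input feature size of every nn.Linear right
    before it runs; remove the hooks afterwards via handle.remove()."""
    handles = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            def pre_hook(mod, inputs, _name=name):
                print(f"{_name}: built for {mod.in_features} features, "
                      f"receives {tuple(inputs[0].shape)}")
            handles.append(module.register_forward_pre_hook(pre_hook))
    return handles

# usage sketch (model/inputs are whatever you pass to Tacotron.forward):
# handles = watch_linear_inputs(model)
# model(x, m, speaker_embedding)   # the last line printed before the crash
#                                  # names the mismatched layer
# for h in handles: h.remove()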
RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2048 and 64x2)
I am trying to concatenate the x, y, and rag features, but the simple concat in the forward function is giving me an error. I just want to concatenate x, y, and rag in the forward function. Can anyone help me solve this problem?
How do I fix an error when concatenating x, y, and rag in the forward function using torch.cat, ensuring matching dimensions and device types?
import torch
import torch.nn as nn
import torch.nn.functional as F
class MFB(nn.Module):
def __init__(self,img_feat_size, ques_feat_size, is_first, MFB_K, MFB_O, DROPOUT_R):
super(MFB, self).__init__()
#self.__C = __C
self.MFB_K = MFB_K
self.MFB_O = MFB_O
self.DROPOUT_R = DROPOUT_R
self.is_first = is_first
self.proj_i = nn.Linear(img_feat_size, MFB_K * MFB_O)
self.proj_q = nn.Linear(ques_feat_size, MFB_K * MFB_O)
self.dropout = nn.Dropout(DROPOUT_R)
self.pool = nn.AvgPool1d(MFB_K, stride = MFB_K)
def forward(self, img_feat, ques_feat, exp_in=1):
batch_size = img_feat.shape[0]
img_feat = self.proj_i(img_feat) # (N, C, K*O)
ques_feat = self.proj_q(ques_feat) # (N, 1, K*O)
exp_out = img_feat * ques_feat # (N, C, K*O)
exp_out = self.dropout(exp_out) if self.is_first else self.dropout(exp_out * exp_in) # (N, C, K*O)
z = self.pool(exp_out) * self.MFB_K # (N, C, O)
z = torch.sqrt(F.relu(z)) - torch.sqrt(F.relu(-z))
z = F.normalize(z.view(batch_size, -1)) # (N, C*O)
z = z.view(batch_size, -1, self.MFB_O) # (N, C, O)
return z
#MFB -> Multimodal Factorized Bilinear Pooling
#used to model complex interactions between features like image and text
#MFB_K -> Number Of factors, MFB_O -> Output size,
#Init initializes linear projection layers for image and question features , dropout layer and average pooling layer
#Forward:
#exp_in = input expansion factor (default - 1)
#Linear projection of image and question features to factorized bilinear form
#Element-wise multiplication of image and question features
#APply Dropout
#Average pooling along the factorized dimension (MFB_K) to reduce the size of the output tensor
#Element-wise operations to compute the final output (z) using square root and normalization using Relu.
#The final output represents the fused representation of image and question features.
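As a small usage example of the MFB block above (assuming the MFB class from the snippet is in scope; the sizes 512, 768, 256 and 64 are the ones used later in the Classifier):

import torch

# MFB(img_feat_size=512, ques_feat_size=768, is_first=True, MFB_K=256, MFB_O=64, DROPOUT_R=0.1)
mfb = MFB(512, 768, True, 256, 64, 0.1)

img_feat = torch.randn(10, 1, 512)   # (N, C, img_feat_size)
ques_feat = torch.randn(10, 1, 768)  # (N, 1, ques_feat_size)

z = mfb(img_feat, ques_feat)
print(z.shape)                       # torch.Size([10, 1, 64]) -> (N, C, MFB_O)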
data = data[~data['Name'].isin(outliers)]
len(sample_dataset_new)
torch.manual_seed(123)
t_p,v_p = torch.utils.data.random_split(sample_dataset_new,[450,50])
# torch.manual_seed(123)
t_p,te_p = torch.utils.data.random_split(t_p,[340,110])
t_p[1]["processed_img"].shape
t_p[1]['processed_txt'].shape
t_p[1]['processed_rag'].shape
(768,)
class Classifier(pl.LightningModule):
def __init__(self):
super().__init__()
self.MFB = MFB(512,768,True,256,64,0.1)
self.fin_y_shape = torch.nn.Linear(768,512)
self.fin_old = torch.nn.Linear(64,2)
self.fin = torch.nn.Linear(16 * 768, 64)
self.fin_inten = torch.nn.Linear(2048,6)
self.fin_e1 = torch.nn.Linear(64,2)
self.fin_e2 = torch.nn.Linear(64,2)
self.fin_e3 = torch.nn.Linear(64,2)
self.fin_e4 = torch.nn.Linear(64,2)
self.fin_e5 = torch.nn.Linear(64,2)
self.fin_e6 = torch.nn.Linear(64,2)
self.fin_e7 = torch.nn.Linear(64,2)
self.fin_e8 = torch.nn.Linear(64,2)
self.fin_e9 = torch.nn.Linear(64,2)
# self.reduce_x = torch.nn.Linear(768, 512)
# self.reduce_rag = torch.nn.Linear(768, 512)
self.validation_step_outputs = []
self.test_step_outputs = []
def forward(self, x,y,rag):
x_,y_,rag_ = x,y,rag
print("x.shape", x.shape)
print("y.shape",y.shape)
print("rag.shape",rag.shape)
# x = self.reduce_x(x)
# rag = self.reduce_rag(rag)
# print("x.shape", x.shape)
# print("y.shape",y.shape)
# print("rag.shape",rag.shape)
# z = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(rag, axis=1))
# z_rag = self.MFB(torch.unsqueeze(y, axis=1),torch.unsqueeze(rag, axis=1))
# z_con = torch.cat((z, z_rag), dim=1)
# Concatenate x with y and then with rag
z= torch.cat((torch.cat((x, y), dim=1), rag), dim=1)
# Pass concatenated x with y and x with rag through your network
z_new = torch.squeeze(z,dim=1)
print("z_new shape",z_new)
c_inten = self.fin_inten(z_new)
c_e1 = self.fin_e1(z_new)
c_e2 = self.fin_e2(z_new)
c_e3 = self.fin_e3(z_new)
c_e4 = self.fin_e4(z_new)
c_e5 = self.fin_e5(z_new)
c_e6 = self.fin_e6(z_new)
c_e7 = self.fin_e7(z_new)
c_e8 = self.fin_e8(z_new)
c_e9 = self.fin_e9(z_new)
c = self.fin_old(z_new)
# print("z.shape",z.shape)
# print("z_new shape",z_new.shape)
# print("intensity error:", c_inten.shape)
# print("output:", c.shape)
# print("c_e1:", c_e1.shape)
# print("c_e2:", c_e2.shape)
# print("c_e3:", c_e3.shape)
# print("c_e4:", c_e4.shape)
# print("c_e5:", c_e5.shape)
# print("c_e6:", c_e6.shape)
# print("c_e7:", c_e7.shape)
# print("c_e8:", c_e8.shape)
# print("c_e9:", c_e9.shape)
# print("logits.shape",logits.shape)
output = torch.log_softmax(c, dim=1)
c_inten = torch.log_softmax(c_inten, dim=1)
c_e1 = torch.log_softmax(c_e1, dim=1)
c_e2 = torch.log_softmax(c_e2, dim=1)
c_e3 = torch.log_softmax(c_e3, dim=1)
c_e4 = torch.log_softmax(c_e4, dim=1)
c_e5 = torch.log_softmax(c_e5, dim=1)
c_e6 = torch.log_softmax(c_e6, dim=1)
c_e7 = torch.log_softmax(c_e7, dim=1)
c_e8 = torch.log_softmax(c_e8, dim=1)
c_e9 = torch.log_softmax(c_e9, dim=1)
return output,c_inten,c_e1,c_e2,c_e3,c_e4,c_e5,c_e6,c_e7,c_e8,c_e9
def cross_entropy_loss(self, logits, labels):
print("logits.shape",logits.shape)
return F.nll_loss(logits, labels)
def training_step(self, train_batch, batch_idx):
#lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp= train_batch
lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= train_batch
#logit_offen,a,b,c,d,e,f,g,h,i,logit_inten_target= self.forward(txt,img,rag)
lab = train_batch[lab].unsqueeze(1)
#print(lab)
txt = train_batch[txt]
rag = train_batch[rag]
img = train_batch[img]
name= train_batch[name]
intensity = train_batch[intensity].unsqueeze(1)
e1 = train_batch[e1].unsqueeze(1)
e2 = train_batch[e2].unsqueeze(1)
e3 = train_batch[e3].unsqueeze(1)
e4 = train_batch[e4].unsqueeze(1)
e5 = train_batch[e5].unsqueeze(1)
e6 = train_batch[e6].unsqueeze(1)
e7 = train_batch[e7].unsqueeze(1)
e8 = train_batch[e8].unsqueeze(1)
e9 = train_batch[e9].unsqueeze(1)
lab = F.one_hot(lab, num_classes=2)
intensity = torch.abs(intensity)
intensity = F.one_hot(intensity, num_classes=6) # Assuming you have 6 classes
e1 = F.one_hot(e1,num_classes = 2)
e2 = F.one_hot(e2,num_classes = 2)
e3 = F.one_hot(e3,num_classes = 2)
e4 = F.one_hot(e4,num_classes = 2)
e5 = F.one_hot(e5,num_classes = 2)
e6 = F.one_hot(e6,num_classes = 2)
e7 = F.one_hot(e7,num_classes = 2)
e8 = F.one_hot(e8,num_classes = 2)
e9 = F.one_hot(e9,num_classes = 2)
lab = lab.squeeze(dim=1)
intensity = intensity.squeeze(dim=1)
e1 = e1.squeeze(dim=1)
e2 = e2.squeeze(dim=1)
e3 = e3.squeeze(dim=1)
e4 = e4.squeeze(dim=1)
e5 = e5.squeeze(dim=1)
e6 = e6.squeeze(dim=1)
e7 = e7.squeeze(dim=1)
e8 = e8.squeeze(dim=1)
e9 = e9.squeeze(dim=1)
logit_offen,logit_inten_target,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)
loss1 = self.cross_entropy_loss(logit_offen, lab)
loss17 = self.cross_entropy_loss(logit_inten_target, intensity)
loss4 = self.cross_entropy_loss(a, e1)
loss5 = self.cross_entropy_loss(b, e2)
loss6 = self.cross_entropy_loss(c, e3)
loss7 = self.cross_entropy_loss(d, e4)
loss8 = self.cross_entropy_loss(e, e5)
loss9 = self.cross_entropy_loss(f, e6)
loss10 = self.cross_entropy_loss(g, e7)
loss11 = self.cross_entropy_loss(h, e8)
loss12 = self.cross_entropy_loss(i, e9)
loss = loss1 + loss4 + loss5 + loss6 + loss7 + loss8 +loss9 + loss10 +loss11 +loss12 + loss17
self.log('train_loss', loss)
return loss
def validation_step(self, val_batch, batch_idx):
#lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp = val_batch
lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= val_batch
lab = val_batch[lab].unsqueeze(1)
#print(lab)
txt = val_batch[txt]
rag = val_batch[rag]
img = val_batch[img]
name = val_batch[name]
intensity = val_batch[intensity].unsqueeze(1)
e1 = val_batch[e1].unsqueeze(1)
e2 = val_batch[e2].unsqueeze(1)
e3 = val_batch[e3].unsqueeze(1)
e4 = val_batch[e4].unsqueeze(1)
e5 = val_batch[e5].unsqueeze(1)
e6 = val_batch[e6].unsqueeze(1)
e7 = val_batch[e7].unsqueeze(1)
e8 = val_batch[e8].unsqueeze(1)
e9 = val_batch[e9].unsqueeze(1)
lab = F.one_hot(lab, num_classes=2)
intensity = torch.abs(intensity)
intensity = F.one_hot(intensity, num_classes=6)
e1 = F.one_hot(e1,num_classes = 2)
e2 = F.one_hot(e2,num_classes = 2)
e3 = F.one_hot(e3,num_classes = 2)
e4 = F.one_hot(e4,num_classes = 2)
e5 = F.one_hot(e5,num_classes = 2)
e6 = F.one_hot(e6,num_classes = 2)
e7 = F.one_hot(e7,num_classes = 2)
e8 = F.one_hot(e8,num_classes = 2)
e9 = F.one_hot(e9,num_classes = 2)
lab = lab.squeeze(dim=1)
intensity = intensity.squeeze(dim = 1)
e1 = e1.squeeze(dim=1)
e2 = e2.squeeze(dim=1)
e3 = e3.squeeze(dim=1)
e4 = e4.squeeze(dim=1)
e5 = e5.squeeze(dim=1)
e6 = e6.squeeze(dim=1)
e7 = e7.squeeze(dim=1)
e8 = e8.squeeze(dim=1)
e9 = e9.squeeze(dim=1)
logits,inten,a,b,c,d,e,f,g,h,i = self.forward(txt,img,rag)
logits=logits.float()
tmp = np.argmax(logits.detach().cpu().numpy(),axis=1)
loss = self.cross_entropy_loss(logits, lab)
lab = lab.detach().cpu().numpy()
self.log('val_acc', accuracy_score(lab,tmp))
self.log('val_roc_auc',roc_auc_score(lab,tmp))
self.log('val_loss', loss)
tqdm_dict = {'val_acc': accuracy_score(lab,tmp)}
self.validation_step_outputs.append({'progress_bar': tqdm_dict,'val_f1 offensive': f1_score(lab,tmp,average='macro')})
return {
'progress_bar': tqdm_dict,
'val_f1 offensive': f1_score(lab,tmp,average='macro')
}
def on_validation_epoch_end(self):
outs = []
outs14=[]
for out in self.validation_step_outputs:
outs.append(out['progress_bar']['val_acc'])
outs14.append(out['val_f1 offensive'])
self.log('val_acc_all_offn', sum(outs)/len(outs))
self.log('val_f1 offensive', sum(outs14)/len(outs14))
print(f'***val_acc_all_offn at epoch end {sum(outs)/len(outs)}****')
print(f'***val_f1 offensive at epoch end {sum(outs14)/len(outs14)}****')
self.validation_step_outputs.clear()
def test_step(self, batch, batch_idx):
lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= batch
lab = batch[lab].unsqueeze(1)
#print(lab)
txt = batch[txt]
rag = batch[rag]
img = batch[img]
name = batch[name]
intensity = batch[intensity].unsqueeze(1)
e1 = batch[e1].unsqueeze(1)
e2 = batch[e2].unsqueeze(1)
e3 = batch[e3].unsqueeze(1)
e4 = batch[e4].unsqueeze(1)
e5 = batch[e5].unsqueeze(1)
e6 = batch[e6].unsqueeze(1)
e7 = batch[e7].unsqueeze(1)
e8 = batch[e8].unsqueeze(1)
e9 = batch[e9].unsqueeze(1)
lab = F.one_hot(lab, num_classes=2)
intensity = F.one_hot(intensity, num_classes=6)
e1 = F.one_hot(e1,num_classes = 2)
e2 = F.one_hot(e2,num_classes = 2)
e3 = F.one_hot(e3,num_classes = 2)
e4 = F.one_hot(e4,num_classes = 2)
e5 = F.one_hot(e5,num_classes = 2)
e6 = F.one_hot(e6,num_classes = 2)
e7 = F.one_hot(e7,num_classes = 2)
e8 = F.one_hot(e8,num_classes = 2)
e9 = F.one_hot(e9,num_classes = 2)
lab = lab.squeeze(dim=1)
intensity = intensity.squeeze(dim=1)
e1 = e1.squeeze(dim=1)
e2 = e2.squeeze(dim=1)
e3 = e3.squeeze(dim=1)
e4 = e4.squeeze(dim=1)
e5 = e5.squeeze(dim=1)
e6 = e6.squeeze(dim=1)
e7 = e7.squeeze(dim=1)
e8 = e8.squeeze(dim=1)
e9 = e9.squeeze(dim=1)
logits,inten,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)
logits = logits.float()
tmp = np.argmax(logits.detach().cpu().numpy(force=True),axis=-1)
loss = self.cross_entropy_loss(logits, lab)
lab = lab.detach().cpu().numpy()
self.log('test_acc', accuracy_score(lab,tmp))
self.log('test_roc_auc',roc_auc_score(lab,tmp))
self.log('test_loss', loss)
tqdm_dict = {'test_acc': accuracy_score(lab,tmp)}
self.test_step_outputs.append({'progress_bar': tqdm_dict,'test_acc': accuracy_score(lab,tmp), 'test_f1_score': f1_score(lab,tmp,average='macro')})
return {
'progress_bar': tqdm_dict,
'test_acc': accuracy_score(lab,tmp),
'test_f1_score': f1_score(lab,tmp,average='macro')
}
def on_test_epoch_end(self):
# OPTIONAL
outs = []
outs1,outs2,outs3,outs4,outs5,outs6,outs7,outs8,outs9,outs10,outs11,outs12,outs13,outs14 = \
[],[],[],[],[],[],[],[],[],[],[],[],[],[]
for out in self.test_step_outputs:
outs.append(out['test_acc'])
outs2.append(out['test_f1_score'])
self.log('test_acc', sum(outs)/len(outs))
self.log('test_f1_score', sum(outs2)/len(outs2))
self.test_step_outputs.clear()
def configure_optimizers(self):
# optimizer = torch.optim.Adam(self.parameters(), lr=3e-2)
optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
return optimizer
"""
Main Model:
Initialize
Forward Pass
Training Step
Validation Step
Testing Step
Pp
"""
class HmDataModule(pl.LightningDataModule):
def setup(self, stage):
self.hm_train = t_p
self.hm_val = v_p
# self.hm_test = test
self.hm_test = te_p
def train_dataloader(self):
return DataLoader(self.hm_train, batch_size=10, drop_last=True)
def val_dataloader(self):
return DataLoader(self.hm_val, batch_size=10, drop_last=True)
def test_dataloader(self):
return DataLoader(self.hm_test, batch_size=10, drop_last=True)
data_module = HmDataModule()
checkpoint_callback = ModelCheckpoint(
monitor='val_acc_all_offn',
dirpath='mrinal/',
filename='epoch{epoch:02d}-val_f1_all_offn{val_acc_all_offn:.2f}',
auto_insert_metric_name=False,
save_top_k=1,
mode="max",
)
all_callbacks = []
all_callbacks.append(checkpoint_callback)
# train
from pytorch_lightning import seed_everything
seed_everything(42, workers=True)
hm_model = Classifier()
gpus=1
#if torch.cuda.is_available():gpus=0
trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
trainer.fit(hm_model, data_module)
INFO:lightning_fabric.utilities.seed:Seed set to 42
/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:556: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
WARNING:pytorch_lightning.loggers.tensorboard:Missing logger folder: /content/LLaVA/lightning_logs
INFO:pytorch_lightning.callbacks.model_summary:
| Name | Type | Params
----------------------------------------
0 | MFB | MFB | 21.0 M
1 | fin_y_shape | Linear | 393 K
2 | fin_old | Linear | 130
3 | fin | Linear | 786 K
4 | fin_inten | Linear | 12.3 K
5 | fin_e1 | Linear | 130
6 | fin_e2 | Linear | 130
7 | fin_e3 | Linear | 130
8 | fin_e4 | Linear | 130
9 | fin_e5 | Linear | 130
10 | fin_e6 | Linear | 130
11 | fin_e7 | Linear | 130
12 | fin_e8 | Linear | 130
13 | fin_e9 | Linear | 130
----------------------------------------
22.2 M Trainable params
0 Non-trainable params
22.2 M Total params
88.792 Total estimated model params size (MB)
Sanity Checking DataLoader 0: 0%
0/2 [00:00<?, ?it/s]
x.shape torch.Size([10, 768])
y.shape torch.Size([10, 512])
rag.shape torch.Size([10, 768])
z_new shape tensor([[ 0.0144, -0.1677, 0.1100, ..., -0.1818, 0.4250, -0.2985],
[-0.2105, -0.1002, -0.0113, ..., -0.0639, 0.3789, -0.0553],
[-0.1221, -0.1026, -0.3277, ..., -0.3724, 0.1562, 0.0286],
...,
[-0.0950, 0.3957, 0.3603, ..., -0.2121, 0.6465, -0.1983],
[ 0.0080, 0.2380, -0.0409, ..., -0.2565, 0.0946, -0.1098],
[ 0.1351, -0.3463, 0.3371, ..., -0.2283, 0.4667, 0.0087]])
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-29-279b4c8e1163> in <cell line: 369>()
367 #if torch.cuda.is_available():gpus=0
368 trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
--> 369 trainer.fit(hm_model, data_module)
14 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
112
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) -> str:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2048 and 64x2)
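Based on the shapes printed above (x: 10x768, y: 10x512, rag: 10x768), the concatenation gives z_new 768 + 512 + 768 = 2048 features, so fin_inten (built for 2048) works but the first 64-feature head, fin_e1, raises the (10x2048 and 64x2) error. A hedged sketch of two possible fixes, keeping the concat:

import torch
import torch.nn as nn

x, y, rag = torch.randn(10, 768), torch.randn(10, 512), torch.randn(10, 768)
z_new = torch.cat((torch.cat((x, y), dim=1), rag), dim=1)   # (10, 2048)

# Fix 1: build the heads for the concatenated width instead of 64
fin_old = nn.Linear(2048, 2)
fin_e1 = nn.Linear(2048, 2)        # ... likewise for fin_e2..fin_e9
fin_inten = nn.Linear(2048, 6)     # already matches in the posted code
print(fin_old(z_new).shape, fin_e1(z_new).shape, fin_inten(z_new).shape)

# Fix 2: keep the 64-wide heads and add one projection layer in between
proj = nn.Linear(2048, 64)
z64 = proj(z_new)                  # (10, 64), now compatible with Linear(64, 2)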
Double post from here.
Hi, I'm getting the following error. I believe it's from my input and output shapes mismatching, like the others in this thread, but I am confused about what to change them to. My x_train_tensor.shape is [1117157, 8], my train_dataloader batches are [32, 8], and net.fc1.weight.shape is [50, 1117157].
RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x8 and 1117157x50)
class NetFc(nn.Module):
def __init__(self):
super(NetFc, self).__init__()
self.fc1 = nn.Linear(x_train_tensor.shape[0], 50)
self.fc2 = nn.Linear(50, 50)
self.fc3 = nn.Linear(50, 50)
self.fc4 = nn.Linear(50, 50)
self.fc5 = nn.Linear(50, 50)
self.fc6 = nn.Linear(50, 50)
self.fc7 = nn.Linear(50, 50)
self.fc8 = nn.Linear(50, y_train_tensor.shape[0])
def forward(self, x):
x = torch.flatten(x, 1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = self.fc8(x)
#Softmax layer should always be last
output = F.log_softmax(x, dim=1)
# Return the output of the network
return output
def trainMyModel(net,lr,train_dataloader,n_epochs):
# define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=lr)
for epoch in range(n_epochs): # loop over number of epochs
running_loss = 0.0
for data, target in train_dataloader:
optimizer.zero_grad() # zero gradient buffers
outputs = net(data.float()) # forward prop
loss = criterion(outputs, target) # calculate loss
loss.backward() # backward prop
optimizer.step() # optimize
# print statistics
running_loss += loss.item()
if i % 100 == 99: # print every 100 mini-batches
print(f'[{epoch + 1}, {i +1:5d}] loss: {running_loss / 100:.3f}')
running_loss = 0.0
print('Finished Training')
return net
# Train your model.
net = NetFc();
lr = 1e-2;
n_epochs = 2;
trainedNet = trainMyModel(net,lr,train_dataloader,n_epochs);
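The in_features of the first layer should be the number of features per sample (x_train_tensor.shape[1], i.e. 8), not the number of samples, and the last layer should output one logit per class rather than y_train_tensor.shape[0]. A hedged sketch (num_classes is a placeholder, since the post does not say how many classes there are):

import torch
import torch.nn as nn
import torch.nn.functional as F

class NetFc(nn.Module):
    def __init__(self, in_features=8, num_classes=10):  # num_classes is a placeholder
        super().__init__()
        self.fc1 = nn.Linear(in_features, 50)   # features per sample, not len(x_train_tensor)
        self.fc2 = nn.Linear(50, 50)
        self.fc8 = nn.Linear(50, num_classes)   # one logit per class

    def forward(self, x):
        x = torch.flatten(x, 1)                 # a (32, 8) batch stays (32, 8)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # nn.CrossEntropyLoss expects raw logits, so either return self.fc8(x)
        # directly or keep log_softmax and switch the criterion to nn.NLLLoss.
        return F.log_softmax(self.fc8(x), dim=1)

net = NetFc(in_features=8, num_classes=10)
print(net(torch.randn(32, 8)).shape)            # torch.Size([32, 10])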