RuntimeError: Expected target size [32, 1], got [32]

Hello,
I am new to PyTorch as well as ML. Can someone help me fix this?

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(32,32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.MaxPool1d(2,stride=3))
          
        self.conv3 = nn.Sequential(
            nn.Conv1d(32, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        #fully connected layers
        self.fc1 = nn.Linear(32*47,32)
        #self.fc2 = nn.Linear(32,2)
        self.fc2 = nn.Linear(32,1)
        self.activation = nn.Softmax()
        
    def forward(self, x):
        # input x : 
        #expected conv1d input = minibatch_size * num_channel * width
        batch_size=x.size(0)
        y = self.conv1(x.view(batch_size,1,-1))
        y = self.conv2(y)
        y = self.conv3(y)
        print(y.size())
        batch_size= y.size(0)
        y = y.flatten(start_dim=1)
        print(y.size())
        y = self.fc1(y.view(batch_size,1,-1))
        y = self.fc2(y.view(batch_size,1,-1))

        return y

import numpy as np
batch_size = 32
epochs = 3
min_valid_loss = np.inf

for e in range(epochs):
    train_loss = 0.0
    model.train()     # Optional when not using Model Specific layer
    for data, labels in train_loader:
         # Transfer Data to GPU if available
        if torch.cuda.is_available():
            data, labels = data.cuda(), labels.cuda()

        
         # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = model(data)
        # Find the Loss
        loss = criterion(target,labels)
        # Calculate gradients 
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()
    
    valid_loss = 0.0
    model.eval()     # Optional when not using Model Specific layer
    for data, labels in validloader:
        if torch.cuda.is_available():
            data, labels = data.cuda(), labels.cuda()
        
        target = model(data)
        target = target.long()
        loss = criterion(target,labels)
        valid_loss = loss.item() * data.size(0)

Hello!

I think you need to make sure your labels object (also known as your target) is shaped like (32, 1). Your terminology is a bit confusing because you gave the name target to your model output; typically, target refers to the ground truth / labels.

Try this:

loss = criterion(target,labels.view(len(labels), 1))

If this fails, please paste the full stack trace, as it will make debugging easier.

I changed it as per your suggestion and now I get this error:


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-...> in <module>
     18         # Find the Loss
     19         #loss = criterion(target,labels)
---> 20         loss = criterion(target,labels.view(len(labels), 1))
     21         # Calculate gradients
     22         loss.backward()

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
   1161 
   1162     def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1163         return F.cross_entropy(input, target, weight=self.weight,
   1164                                ignore_index=self.ignore_index, reduction=self.reduction,
   1165                                label_smoothing=self.label_smoothing)

~\anaconda3\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   2994     if size_average is not None or reduce is not None:
   2995         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2996     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   2997 
   2998 

RuntimeError: expected scalar type Long but found Float

(in the future you can wrap blocks of code in 3 backquotes to make them easier to read, like this: ```)

It looks like you’re using Cross Entropy Loss as your criterion. Per the example in the docs, the target (in your case that’s named label) ought to be of type Long.

So try this:

labels = labels.long()  # add this before you call criterion()
loss = criterion(target, labels.view(len(labels), 1))
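
(For reference, a minimal sketch, not part of the original reply, of the usage pattern from the nn.CrossEntropyLoss docs: raw logits of shape (N, C) and a LongTensor of class indices of shape (N); the tensors below are made-up stand-ins.)

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(32, 2)              # (batch, num_classes), raw model outputs
labels = torch.randint(0, 2, (32,))      # class indices in [0, num_classes), dtype long
loss = criterion(logits, labels)
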
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-73-75bc6e93bf9b> in <module>
     19         #loss = criterion(target,labels)
     20         labels = labels.long()
---> 21         loss = criterion(target, labels.view(len(labels), 1))
     22         # Calculate gradients
     23         loss.backward()

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
   1161 
   1162     def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1163         return F.cross_entropy(input, target, weight=self.weight,
   1164                                ignore_index=self.ignore_index, reduction=self.reduction,
   1165                                label_smoothing=self.label_smoothing)

~\anaconda3\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   2994     if size_average is not None or reduce is not None:
   2995         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2996     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   2997 
   2998 

IndexError: Target 1 is out of bounds.

I have now wrapped the code in blocks.

Now I get an index error.
Also, is there a better way to name the target without calling it labels?

My other question is: what I am trying to do is binary classification (whether the label is 1 or 0). Am I using the wrong loss function?

We’re getting closer!

For the variable naming question, I would just call it “output” instead of “target”.

For a binary classification you should try BCELoss. For BCELoss, you do need to run the model output through the sigmoid layer yourself before passing it to the loss function, because it expects output in the (0, 1) range.

So try this:

m = nn.Sigmoid()
...

output = m(model(data))
labels = labels.view(len(labels), 1).float()  # BCELoss expects a float target with the same shape as the output
loss = criterion(output, labels)
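
(A self-contained sketch of the same idea; dummy_model and the random tensors here are made-up stand-ins, with the ConvNet reduced to a single-output linear layer for brevity.)

import torch
import torch.nn as nn

dummy_model = nn.Linear(145, 1)   # stand-in for the ConvNet: one output per sample
m = nn.Sigmoid()
criterion = nn.BCELoss()

data = torch.randn(32, 145)
labels = torch.randint(0, 2, (32,))

output = m(dummy_model(data))                          # shape (32, 1), values in (0, 1)
loss = criterion(output, labels.view(-1, 1).float())   # target must match that shape and be float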

For better clarity, I changed "target" to output and "labels" to target.

I changed the loss function to BCELoss and tried what you suggested. It throws a different error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-96-a9e48fd0e0c5> in <module>
     20         # Find the Loss
     21         target = target.view(len(labels), 1).long()
---> 22         loss = criterion(output,target)
     23 
     24         # Calculate gradients

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
    610 
    611     def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 612         return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
    613 
    614 

~\anaconda3\lib\site-packages\torch\nn\functional.py in binary_cross_entropy(input, target, weight, size_average, reduce, reduction)
   3054         reduction_enum = _Reduction.get_enum(reduction)
   3055     if target.size() != input.size():
-> 3056         raise ValueError(
   3057             "Using a target size ({}) that is different to the input size ({}) is deprecated. "
   3058             "Please ensure they have the same size.".format(target.size(), input.size())

ValueError: Using a target size (torch.Size([32, 1])) that is different to the input size (torch.Size([32, 1, 2])) is deprecated. Please ensure they have the same size.

The error is telling you that your model output is shaped like (32, 1, 2) but your target (labels) is shaped like (32, 1).

Did you swap out the (32, 1) Linear layer for the (32, 2) one here by any chance?

#fully connected layers
self.fc1 = nn.Linear(32*47,32)
#self.fc2 = nn.Linear(32,2)
self.fc2 = nn.Linear(32,1)

I swapped but moved back to (32, 1) and still get the same error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-135-822ecf5e7f2c> in <module>
     19         # Find the Loss
     20         target = target.view(len(labels), 1).long()
---> 21         loss = criterion(output,target)
     22 
     23         # Calculate gradients

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
    610 
    611     def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 612         return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
    613 
    614 

~\anaconda3\lib\site-packages\torch\nn\functional.py in binary_cross_entropy(input, target, weight, size_average, reduce, reduction)
   3054         reduction_enum = _Reduction.get_enum(reduction)
   3055     if target.size() != input.size():
-> 3056         raise ValueError(
   3057             "Using a target size ({}) that is different to the input size ({}) is deprecated. "
   3058             "Please ensure they have the same size.".format(target.size(), input.size())

ValueError: Using a target size (torch.Size([32, 1])) that is different to the input size (torch.Size([32, 1, 1])) is deprecated. Please ensure they have the same size.

OK, looking through your ConvNet model, I see that batch_size shows up in your forward method, which it shouldn't. The idea is that PyTorch separates model design from training: the minibatch size is a training choice, not a model design choice, so it shouldn't show up anywhere in the model architecture. Passing a minibatch through the model really just passes each of its elements through it separately, and no single pass is aware of being part of a broader minibatch.

So you need to rewrite your forward method to something like this:

    def forward(self, x):
        # input x : 
        #expected conv1d input = num_channel * width
        y = self.conv1(x)
        y = self.conv2(y)
        y = self.conv3(y)
        print(y.size())
        y = y.flatten(start_dim=1)
        print(y.size())
        y = self.fc1(y)
        y = self.fc2(y)

        return y

Then if I run a minibatch of 10 samples of shape (1, 145) through it, it works fine:

x = ConvNet()
x(torch.randn(10, 1, 145))

Output:

torch.Size([10, 32, 47])
torch.Size([10, 1504])
tensor([[-0.00054],
        [ 0.27069],
        [ 0.14760],
        [ 0.02364],
        [ 0.13513],
        [ 0.14203],
        [ 0.08971],
        [ 0.22606],
        [ 0.15584],
        [ 0.04441]], grad_fn=<AddmmBackward>)

The shape (1, 145) means, for your purposes, 1 channel and width 145. The number of channels needs to match the number of input channels of conv1, which in your case is 1 (the first "1" in nn.Conv1d(1, 32, kernel_size=2, stride=1)). The width needs to be such that, by the time your activation hits the first Linear layer, its flattened shape matches the input size of that Linear layer (in your case 1504, written in your code as 32 * 47).
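
(If it helps, here is a small sketch, not from the original reply, that discovers that flattened size for a given input width by pushing a dummy tensor through conv layers mirroring the ones in your model.)

import torch
import torch.nn as nn

convs = nn.Sequential(
    nn.Conv1d(1, 32, kernel_size=2, stride=1),   # width 145 -> 144
    nn.Conv1d(32, 32, kernel_size=2, stride=1),  # 144 -> 143
    nn.MaxPool1d(2, stride=3),                   # 143 -> 48
    nn.Conv1d(32, 32, kernel_size=2, stride=1),  # 48 -> 47
)

dummy = torch.randn(1, 1, 145)                   # (batch, channels, width)
flat = convs(dummy).flatten(start_dim=1)
print(flat.shape)                                # torch.Size([1, 1504]) -> nn.Linear(1504, ...)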

I think I am kind of lost with how PyTorch and the CNN work. This is the first time I am using PyTorch.
Even though I use the batch_size, I still get the same output as yours.

This is my entire class

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(32,32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.MaxPool1d(2,stride=3))
          
        self.conv3 = nn.Sequential(
            nn.Conv1d(32, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        #fully connected layers
        self.fc1 = nn.Linear(32*47,32)
        self.fc2 = nn.Linear(32,1)
        #self.fc2 = nn.Linear(32,2)
        self.activation = nn.Softmax()
    
    
    def forward(self, x):
        # input x : 
        #expected conv1d input = minibatch_size * num_channel * width
        batch_size=x.size(0)
        y = self.conv1(x.view(batch_size,1,-1))
        y = self.conv2(y)
        y = self.conv3(y)
        
        print(y.size())
        batch_size= y.size(0)
        y = y.flatten(start_dim=1)
        print(y.size())
        y = self.fc1(y.view(y.size(0), -1))
        #y = self.fc1(y.view(batch_size,1,-1))
        y = self.fc2(y.view(batch_size,1,-1))

        return y


model = ConvNet()
batch_size = 32
segment_size = 145
X = torch.ones(batch_size, segment_size)
output = model(X)
print(output.size())

#The output when I run this : 
torch.Size([32, 32, 47])
torch.Size([32, 1504])
torch.Size([32, 1, 1])
# define training hyperparameters
learning_rate = 0.001
batch_size = 32
epochs= 10
number_classess = 2

#criterion = nn.CrossEntropyLoss()
m = nn.Sigmoid()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
import numpy as np
batch_size = 32
epochs = 3
min_valid_loss = np.inf

for e in range(epochs):
    train_loss = 0.0
    model.train()     # Optional when not using Model Specific layer
    for data, target in train_loader:
         # Transfer Data to GPU if available
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
        
        
         # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        output = m(model(data))
        # Find the Loss
        target = target.view(len(labels), 1).long()
        loss = criterion(output,target)
        
        # Calculate gradients 
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()
    
    valid_loss = 0.0
    model.eval()     # Optional when not using Model Specific layer
    for data, target in validloader:
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
        
        output = model(data)
        output = output.long()
        print("=====LOSS===========",target)
        loss = criterion(output,target)
        valid_loss = loss.item() * data.size(0)
        

    print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(train_loader)} \t\t Validation Loss: {valid_loss / len(validloader)}')
    if min_valid_loss > valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
        min_valid_loss = valid_loss
        


        # Saving State Dict
        torch.save(model.state_dict(), 'saved_model.pth')

I think I missed the most important part to share: I am trying to implement this architecture. I didn't use exactly the same values as mentioned.

I tried changing the input channels, kernel size, etc. according to the paper, but it didn't work. Maybe I didn't implement it in the correct way.

Suggestions:

  • your input should be shaped like (batch_size, num_channels, segment_size)
  • your ConvNet shouldn’t have batch size showing up anywhere in its code
  • after looking at their architecture, and noticing that they have Softmax as their final step, I think you want the loss to be CrossEntropyLoss as you originally had, and you want your fc2 linear layer to have an output dimension of 2 (not 1). You don't need to apply Sigmoid() anywhere.

Note that there are some differences between your architecture and the one in the paper, e.g. they use 2 conv layers, you use 3; they use kernel size 5 in the conv layers, you use 2; their second fully connected layer has input size 512, yours has input size 32. I'll ignore those differences.

This code runs fine for me:

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(32,32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.MaxPool1d(2,stride=3))
          
        self.conv3 = nn.Sequential(
            nn.Conv1d(32, 32, kernel_size=2, stride=1),
            nn.Dropout(0.5),
            nn.ReLU())
        
        #fully connected layers
        self.fc1 = nn.Linear(32*47,512)
        self.fc2 = nn.Linear(512,2)
        # self.fc2 = nn.Linear(32,1)
        # self.activation = nn.Softmax()
        
    def forward(self, x):
        # input x : 
        #expected conv1d input = num_channel * width
        y = self.conv1(x)
        y = self.conv2(y)
        y = self.conv3(y)
        print(y.size())
        y = y.flatten(start_dim=1)
        print(y.size())
        y = self.fc1(y)
        y = self.fc2(y)

        return y

x = ConvNet()
loss = nn.CrossEntropyLoss()
batch_size = 32

data = torch.randn(batch_size, 1, 145)
target = torch.empty(batch_size, dtype=torch.long).random_(2)

output = x(data)
loss(output, target)

Output:
tensor(0.69431, grad_fn=<NllLossBackward>)

For the suggestions:

  • when you say input, do you mean the inputs to the layers?
  • why shouldn't we have batch_size?
  • what I am trying to classify is whether the signal has a peak or not (peak = 1, no peak = 0). Isn't this a binary classification?

I removed the Sigmoid() and reverted back to CrossEntropyLoss. However, when I run the loop I get a different error now:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-206-d6853bf8f764> in <module>
     16         optimizer.zero_grad()
     17         # Forward Pass
---> 18         output = (model(data))
     19         # Find the Loss
     20         target = target.view(len(labels), 1).long()

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-200-7704075c7552> in forward(self, x)
     28         # input x :
     29         #expected conv1d input = num_channel * width
---> 30         y = self.conv1(x)
     31         y = self.conv2(y)
     32         y = self.conv3(y)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\container.py in forward(self, input)
    139     def forward(self, input):
    140         for module in self:
--> 141             input = module(input)
    142         return input
    143 

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\anaconda3\lib\site-packages\torch\nn\modules\conv.py in forward(self, input)
    300 
    301     def forward(self, input: Tensor) -> Tensor:
--> 302         return self._conv_forward(input, self.weight, self.bias)
    303 
    304 

~\anaconda3\lib\site-packages\torch\nn\modules\conv.py in _conv_forward(self, input, weight, bias)
    296                             weight, bias, self.stride,
    297                             _single(0), self.dilation, self.groups)
--> 298         return F.conv1d(input, weight, bias, self.stride,
    299                         self.padding, self.dilation, self.groups)
    300 

RuntimeError: Given groups=1, weight of size [32, 1, 2], expected input[1, 32, 145] to have 1 channels, but got 32 channels instead

That's the input to the model; you have it named data.

As I mentioned in a previous post, think of the batch size as representing how many inputs you are processing in parallel at the same time. Each input is processed according to the model architecture, which is the same whether you are processing one input at a time (batch size of 1) or 100 inputs at a time (batch size of 100). So, in PyTorch, the model itself is not built with a particular batch size in mind; it works with any batch size. Therefore, there's no reason why any piece of the model architecture should ever reference, or be related to, the batch size.

As an analogy, think of any vectorized operation, such as summing two numpy arrays. The (element-wise, not array-wise) sum is the model, and the length of the arrays is the batch size. When the code for "sum" is implemented, it's written strictly in terms of the two elements being added; whether those elements are standalone floats, or part of an array of size 1, 32, or 100, does not show up in the "sum" code.
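
(A quick sketch of that point, assuming the corrected ConvNet defined earlier in this thread, the one whose fc2 outputs 2 values: the very same instance accepts any batch size.)

import torch

net = ConvNet()                            # the corrected ConvNet defined earlier
for bs in (1, 10, 32):
    out = net(torch.randn(bs, 1, 145))     # only the batch dimension changes
    print(bs, out.shape)                   # torch.Size([bs, 2]) in every case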

Typically, yes. However, you can think of the model as implementing this classification in two ways (see the sketch after this list):

  • a single output in the (0, 1) range, with a value closer to 1 meaning more likely to be peak - this is how BCELoss works
  • two outputs, each in the (0, 1) range, which sum to 1 and can roughly be thought of as probabilities* of the two states (peak or not peak). Here the index whose value is closer to 1 is your prediction, e.g. (0.98, 0.02) would be interpreted as "most likely a 0".

In the Table you posted, they have the “Output size” for the last layer as 2, and that layer is a Softmax layer. This suggests the 2nd bullet point above is their implementation.

*nb: the probability interpretation is not strictly true, but it is useful to keep in mind as you're trying to differentiate CrossEntropyLoss from BCELoss.
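
(A minimal sketch, not from the original replies, contrasting the two framings for a batch of 32 samples with 0/1 labels; all tensors are random stand-ins.)

import torch
import torch.nn as nn

labels = torch.randint(0, 2, (32,))                 # ground-truth 0/1 class indices

# (1) one output per sample: BCEWithLogitsLoss applies the sigmoid internally
logits_1 = torch.randn(32, 1, requires_grad=True)
loss_bce = nn.BCEWithLogitsLoss()(logits_1, labels.view(-1, 1).float())

# (2) two outputs per sample: CrossEntropyLoss takes raw logits and long class indices
logits_2 = torch.randn(32, 2, requires_grad=True)
loss_ce = nn.CrossEntropyLoss()(logits_2, labels)

print(loss_bce.item(), loss_ce.item())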

I squeezed the output and it worked. I also used BCEWithLogitsLoss as the loss function, and it worked. Thanks for your clarity. I am still in the process of learning how different loss functions work. Your explanations helped me a lot.


class Classifier(pl.LightningModule):

  def __init__(self):
    super().__init__()
    self.MFB = MFB(512,768,True,256,64,0.1)
    self.fin_y_shape = torch.nn.Linear(768,512)
    self.fin_old = torch.nn.Linear(64,2)
    self.fin = torch.nn.Linear(16 * 768, 64)
    self.fin_inten = torch.nn.Linear(64,6)
    self.fin_e1 = torch.nn.Linear(64,2)
    self.fin_e2 = torch.nn.Linear(64,2)
    self.fin_e3 = torch.nn.Linear(64,2)
    self.fin_e4 = torch.nn.Linear(64,2)
    self.fin_e5 = torch.nn.Linear(64,2)
    self.fin_e6 = torch.nn.Linear(64,2)
    self.fin_e7 = torch.nn.Linear(64,2)
    self.fin_e8 = torch.nn.Linear(64,2)
    self.fin_e9 = torch.nn.Linear(64,2)



    self.validation_step_outputs = []
    self.test_step_outputs = []

  def forward(self, x,y,rag):
      x_,y_,rag_ = x,y,rag
      print("x.shape", x.shape)
      print("y.shape",y.shape)
      print("rag.shape",rag.shape)
      z = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(x, axis=1))
      z_rag = self.MFB(torch.unsqueeze(y, axis=1),torch.unsqueeze(rag, axis=1))
      z_concatenated = torch.cat((z, z_rag), dim=1)
      
      # z_rag = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(rag, axis=1))
        
      #   # Concatenate the two tensors
      # concatenated_output = torch.cat((z, z_rag),dim=1)
        
      # z_new = torch.squeeze(concatenated_output, dim=1)
      # z1 = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(rag, axis=1))
      # z_ = torch.cat((z,z1),dim=1)
      # #cross_attention= (rag and  x/y)
       
      #z_new = torch.squeeze(z, dim=1)
      # z_rag = self.MFB(torch.unsqueeze(y, axis=1), torch.unsqueeze(rag, axis=1))

      # z_newe = torch.cat((z, z_rag), dim=1)

      z_new = torch.squeeze(z_concatenated, dim=1)
      c_inten = self.fin_inten(z_new)
      c_e1 = self.fin_e1(z_new)
      c_e2 = self.fin_e2(z_new)
      c_e3 = self.fin_e3(z_new)
      c_e4 = self.fin_e4(z_new)
      c_e5 = self.fin_e5(z_new)
      c_e6 = self.fin_e6(z_new)
      c_e7 = self.fin_e7(z_new)
      c_e8 = self.fin_e8(z_new)
      c_e9 = self.fin_e9(z_new)
      c = self.fin_old(z_new)

      print("intensity error:", c_inten.shape)
      print("output:", c.shape)
      print("c_e1:", c_e1.shape)
      print("c_e2:", c_e2.shape)
      print("c_e3:", c_e3.shape)
      print("c_e4:", c_e4.shape)
      print("c_e5:", c_e5.shape)
      print("c_e6:", c_e6.shape)
      print("c_e7:", c_e7.shape)
      print("c_e8:", c_e8.shape)
      print("c_e9:", c_e9.shape)


      output = torch.log_softmax(c, dim=1)
      c_inten = torch.log_softmax(c_inten, dim=1)
      c_e1 = torch.log_softmax(c_e1, dim=1)
      c_e2 = torch.log_softmax(c_e2, dim=1)
      c_e3 = torch.log_softmax(c_e3, dim=1)
      c_e4 = torch.log_softmax(c_e4, dim=1)
      c_e5 = torch.log_softmax(c_e5, dim=1)
      c_e6 = torch.log_softmax(c_e6, dim=1)
      c_e7 = torch.log_softmax(c_e7, dim=1)
      c_e8 = torch.log_softmax(c_e8, dim=1)
      c_e9 = torch.log_softmax(c_e9, dim=1)
      # z, z1 = z.chunk(2, dim=0)  # Split representations
      # contrastive_loss = torch.nn.functional.cosine_embedding_loss(z, z1, torch.ones_like(z1))  # Contrastive loss
      


      return output,c_inten,c_e1,c_e2,c_e3,c_e4,c_e5,c_e6,c_e7,c_e8,c_e9
  
  # def contrastive_loss(self, z, z1, label, margin=1.0):
  #       # Compute contrastive loss
  #       euclidean_distance = F.pairwise_distance(z, z1)
  #       loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
  #                                      (label) * torch.pow(torch.clamp(margin - euclidean_distance, min=0.0), 2))
  #       return loss_contrastive

  def cross_entropy_loss(self, logits, labels):
    return F.nll_loss(logits, labels)

  def training_step(self, train_batch, batch_idx):
      #lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp= train_batch
      lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= train_batch
      #logit_offen,a,b,c,d,e,f,g,h,i,logit_inten_target= self.forward(txt,img,rag)
      
      lab = train_batch[lab]
      #print(lab)
      txt = train_batch[txt]
      rag = train_batch[rag]
      img = train_batch[img]
      name= train_batch[name]
      intensity = train_batch[intensity]
      e1 = train_batch[e1]
      e2 = train_batch[e2]
      e3 = train_batch[e3]
      e4 = train_batch[e4]
      e5 = train_batch[e5]
      e6 = train_batch[e6]
      e7 = train_batch[e7]
      e8 = train_batch[e8]
      e9 = train_batch[e9]
      # per = train_batch[per]
      # iro= train_batch[iro]
      # alli = train_batch[alli]
      # ana = train_batch[ana]
      # inv = train_batch[inv]
      # meta = train_batch[meta]
      # puns = train_batch[puns]
      # sat = train_batch[sat]
      # hyp = train_batch[hyp]

      logit_offen,logit_inten_target,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)

      loss1 = self.cross_entropy_loss(logit_offen, lab)
      loss17 = self.cross_entropy_loss(logit_inten_target, intensity)
      loss4 = self.cross_entropy_loss(a, e1)
      loss5 = self.cross_entropy_loss(b, e2)
      loss6 = self.cross_entropy_loss(c, e3)
      loss7 = self.cross_entropy_loss(d, e4)
      loss8 = self.cross_entropy_loss(e, e5)
      loss9 = self.cross_entropy_loss(f, e6)
      loss10 = self.cross_entropy_loss(g, e7)
      loss11 = self.cross_entropy_loss(h, e8)
      loss12 = self.cross_entropy_loss(i, e9)
      
      # loss2 = self.cross_entropy_loss(a,per)
      # loss3 = self.cross_entropy_loss(b,iro)
      # loss4 = self.cross_entropy_loss(c, alli)
      # loss5 = self.cross_entropy_loss(d,ana)
      # loss6 = self.cross_entropy_loss(e,inv)
      # loss7 = self.cross_entropy_loss(f,meta)
      # loss8 = self.cross_entropy_loss(g,puns)
      # loss9 = self.cross_entropy_loss(h,sat)
      # loss10 = self.cross_entropy_loss(i,hyp)

      loss = loss1 + loss4 + loss5 + loss6 + loss7 + loss8 +loss9 + loss10 +loss11 +loss12 + loss17

      self.log('train_loss', loss)
      return loss


  def validation_step(self, val_batch, batch_idx):
      #lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp = val_batch
      lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= val_batch
      lab = val_batch[lab]
      #print(lab)
      txt = val_batch[txt]
      rag = val_batch[rag]
      img = val_batch[img]
      name = val_batch[name]
      intensity = val_batch[intensity]
      e1 = val_batch[e1]
      e2 = val_batch[e2]
      e3 = val_batch[e3]
      e4 = val_batch[e4]
      e5 = val_batch[e5]
      e6 = val_batch[e6]
      e7 = val_batch[e7]
      e8 = val_batch[e8]
      e9 = val_batch[e9]
      
      # per = val_batch[per]
      # iro = val_batch[iro]
      # alli = val_batch[alli]
      # ana = val_batch[ana]
      # inv = val_batch[inv]
      # meta = val_batch[meta]
      # puns = val_batch[puns]
      # sat = val_batch[sat]
      # hyp = val_batch[hyp]


      logits,inten,a,b,c,d,e,f,g,h,i = self.forward(txt,img,rag)


      logits=logits.float()


      tmp = np.argmax(logits.detach().cpu().numpy(),axis=1)
      loss = self.cross_entropy_loss(logits, lab)
      lab = lab.detach().cpu().numpy()
      self.log('val_acc', accuracy_score(lab,tmp))
      self.log('val_roc_auc',roc_auc_score(lab,tmp))
      self.log('val_loss', loss)
      tqdm_dict = {'val_acc': accuracy_score(lab,tmp)}
      self.validation_step_outputs.append({'progress_bar': tqdm_dict,'val_f1 offensive': f1_score(lab,tmp,average='macro')})

      return {
                'progress_bar': tqdm_dict,
      'val_f1 offensive': f1_score(lab,tmp,average='macro')
      }

  def on_validation_epoch_end(self):
    outs = []
    outs14=[]
    for out in self.validation_step_outputs:
       outs.append(out['progress_bar']['val_acc'])
       outs14.append(out['val_f1 offensive'])
    self.log('val_acc_all_offn', sum(outs)/len(outs))
    self.log('val_f1 offensive', sum(outs14)/len(outs14))
    print(f'***val_acc_all_offn at epoch end {sum(outs)/len(outs)}****')
    print(f'***val_f1 offensive at epoch end {sum(outs14)/len(outs14)}****')
    self.validation_step_outputs.clear()

  def test_step(self, batch, batch_idx):
      lab,txt,rag,img,name,intensity,e1,e2,e3,e4,e5,e6,e7,e8,e9= batch
      #lab,txt,rag,img,name,per,iro,alli,ana,inv,meta,puns,sat,hyp= batch
      lab = batch[lab]
      #print(lab)
      txt = batch[txt]
      rag = batch[rag]
      img = batch[img]
      name = batch[name]
      intensity = batch[intensity]
      e1 = batch[e1]
      e2 = batch[e2]
      e3 = batch[e3]
      e4 = batch[e4]
      e5 = batch[e5]
      e6 = batch[e6]
      e7 = batch[e7]
      e8 = batch[e8]
      e9 = batch[e9]
      
      # per = batch[per]
      # iro = batch[iro]
      # alli = batch[alli]
      # ana = batch[ana]
      # inv = batch[inv]
      # meta = batch[meta]
      # puns = batch[puns]
      # sat = batch[sat]
      # hyp = batch[hyp]

      logits,inten,a,b,c,d,e,f,g,h,i= self.forward(txt,img,rag)

      logits = logits.float()
      tmp = np.argmax(logits.detach().cpu().numpy(force=True),axis=-1)
      loss = self.cross_entropy_loss(logits, lab)
      lab = lab.detach().cpu().numpy()
      self.log('test_acc', accuracy_score(lab,tmp))
      self.log('test_roc_auc',roc_auc_score(lab,tmp))
      self.log('test_loss', loss)
      tqdm_dict = {'test_acc': accuracy_score(lab,tmp)}
      self.test_step_outputs.append({'progress_bar': tqdm_dict,'test_acc': accuracy_score(lab,tmp), 'test_f1_score': f1_score(lab,tmp,average='macro')})
      return {
                'progress_bar': tqdm_dict,
                'test_acc': accuracy_score(lab,tmp),
                'test_f1_score': f1_score(lab,tmp,average='macro')
      }
  def on_test_epoch_end(self):
      # OPTIONAL
      outs = []
      outs1,outs2,outs3,outs4,outs5,outs6,outs7,outs8,outs9,outs10,outs11,outs12,outs13,outs14 = \
      [],[],[],[],[],[],[],[],[],[],[],[],[],[]
      for out in self.test_step_outputs:
        outs.append(out['test_acc'])
        outs2.append(out['test_f1_score'])
      self.log('test_acc', sum(outs)/len(outs))
      self.log('test_f1_score', sum(outs2)/len(outs2))
      self.test_step_outputs.clear()

  def configure_optimizers(self):
    # optimizer = torch.optim.Adam(self.parameters(), lr=3e-2)
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)

    return optimizer


"""
Main Model:
Initialize
Forward Pass
Training Step
Validation Step
Testing Step

"""

class HmDataModule(pl.LightningDataModule):

  def setup(self, stage):
    self.hm_train = t_p
    self.hm_val = v_p
    # self.hm_test = test
    self.hm_test = te_p

  def train_dataloader(self):
    return DataLoader(self.hm_train, batch_size=20, drop_last=True)

  def val_dataloader(self):
    return DataLoader(self.hm_val, batch_size=20, drop_last=True)

  def test_dataloader(self):
    return DataLoader(self.hm_test, batch_size=20, drop_last=True)

data_module = HmDataModule()
checkpoint_callback = ModelCheckpoint(
     monitor='val_acc_all_offn',
     dirpath='mrinal/',
     filename='epoch{epoch:02d}-val_f1_all_offn{val_acc_all_offn:.2f}',
     auto_insert_metric_name=False,
     save_top_k=1,
    mode="max",
 )
all_callbacks = []
all_callbacks.append(checkpoint_callback)
# train
from pytorch_lightning import seed_everything
seed_everything(42, workers=True)
hm_model = Classifier()
gpus=1
#if torch.cuda.is_available():gpus=0
trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
trainer.fit(hm_model, data_module)

INFO:lightning_fabric.utilities.seed:Seed set to 42
/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:552: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /content/LLaVA/mrinal exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
   | Name        | Type   | Params
----------------------------------------
0  | MFB         | MFB    | 21.0 M
1  | fin_y_shape | Linear | 393 K 
2  | fin_old     | Linear | 130   
3  | fin         | Linear | 786 K 
4  | fin_inten   | Linear | 390   
5  | fin_e1      | Linear | 130   
6  | fin_e2      | Linear | 130   
7  | fin_e3      | Linear | 130   
8  | fin_e4      | Linear | 130   
9  | fin_e5      | Linear | 130   
10 | fin_e6      | Linear | 130   
11 | fin_e7      | Linear | 130   
12 | fin_e8      | Linear | 130   
13 | fin_e9      | Linear | 130   
----------------------------------------
22.2 M    Trainable params
0         Non-trainable params
22.2 M    Total params
88.745    Total estimated model params size (MB)
Sanity Checking DataLoader 0:   0%
 0/2 [00:00<?, ?it/s]
x.shape torch.Size([20, 768])
y.shape torch.Size([20, 512])
rag.shape torch.Size([20, 768])
intensity error: torch.Size([20, 2, 6])
output: torch.Size([20, 2, 2])
c_e1: torch.Size([20, 2, 2])
c_e2: torch.Size([20, 2, 2])
c_e3: torch.Size([20, 2, 2])
c_e4: torch.Size([20, 2, 2])
c_e5: torch.Size([20, 2, 2])
c_e6: torch.Size([20, 2, 2])
c_e7: torch.Size([20, 2, 2])
c_e8: torch.Size([20, 2, 2])
c_e9: torch.Size([20, 2, 2])
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-76-84cfd97e3d73> in <cell line: 338>()
    336 #if torch.cuda.is_available():gpus=0
    337 trainer = pl.Trainer(deterministic=True,max_epochs=10,precision=16,callbacks=all_callbacks)
--> 338 trainer.fit(hm_model, data_module)

13 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2702     if size_average is not None or reduce is not None:
   2703         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2704     return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2705 
   2706 

RuntimeError: Expected target size [20, 2], got [20]

I am also facing the same error, can you help me with this?

RuntimeError: Expected target size [20, 2], got [20]

I am new to this, can anyone help me with it?

Could you post the logits shape? I assume you are using nn.NLLLoss and the target contains indices?
If so, these shapes would cause the shape mismatch:

criterion = nn.NLLLoss()
output = torch.randn(20, 2, 2, requires_grad=True)
target = torch.randint(0, 2, (20,))

loss = criterion(output, target)
# RuntimeError: Expected target size [20, 2], got [20]

since the output has a temporal dimension.
Either reduce it or provide a target containing class indices for each time step.
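
(A hedged sketch of those two options; the [20, 2, 2] output here is a random stand-in for whatever the model actually produced.)

import torch
import torch.nn as nn

criterion = nn.NLLLoss()
output = torch.randn(20, 2, 2, requires_grad=True)   # [batch, classes, extra dim]
target = torch.randint(0, 2, (20,))                  # one class index per sample

# Option 1: drop the extra dimension so the output is [20, 2] and the [20] target fits
loss_1 = criterion(output[:, :, 0], target)

# Option 2: keep the [20, 2, 2] output and supply one class index per step, i.e. a [20, 2] target
target_per_step = torch.randint(0, 2, (20, 2))
loss_2 = criterion(output, target_per_step)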

logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 6])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])
logits.shape torch.Size([20, 2, 2])

My dataset has 12 label columns: the 1st contains 0/1 labels, the 2nd contains labels 0-5, and the remaining 10 contain the same 0/1 labels.