ValueError: Expected input batch_size (324) to match target batch_size (4)

Hi @ptrblck, I have a problem related to the batch size in my fusion model. I followed your suggestion of flattening the feature vector before the linear layer, but I still get the same error.

I have a fusion model and it works fine when I set the batch size to 1, but when I change it to 2 or any other number, it gives this error:

File ~\anaconda3\lib\site-packages\torch\nn\functional.py:2996 in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

ValueError: Expected input batch_size (4) to match target batch_size (2).

Here is my fusion model:



from typing import Union, List, Dict, Any, cast

import torch
import torch.nn as nn

from torch.utils.model_zoo import load_url as load_state_dict_from_url
from torchinfo import summary
from snn_classification import model_SNN
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"



################################# VGG model ##################################################

__all__ = [
    "VGG",   
    "vgg11_bn",

]


model_urls = {
    "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth",
    "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth",
    "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth",
    "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth",

}


class VGG(nn.Module):
    def __init__(
        self, features: nn.Module, num_classes: int = 32, init_weights: bool = True, dropout: float = 0.5
    ) -> None:
        super().__init__()
        #_log_api_usage_once(self)
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(1024, 512),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(512, num_classes),
        )
        if init_weights:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.Linear):
                    nn.init.normal_(m.weight, 0, 0.01)
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1) 
        #x = x.view(x.size(0), -1)
        #print('the feature aftr flatten',x.shape)
        x = self.classifier(x)
        #print('the classifier ',x.shape)

        return x


def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    layers: List[nn.Module] = []
    in_channels = 3
    for v in cfg:
        if v == "M":
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            v = cast(int, v)
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfgs: Dict[str, List[Union[str, int]]] = {
    "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
    "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"],
}


def _vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, **kwargs: Any) -> VGG:
    if pretrained:
        kwargs["init_weights"] = False
    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
        model.load_state_dict(state_dict)
    return model




def vgg11_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG:
    r"""VGG 11-layer model (configuration "A") with batch normalization
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    The required minimum input size of the model is 32x32.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    return _vgg("vgg11_bn", "A", True, pretrained, progress, **kwargs)






##################### MLP model#####################################

class MulticlassClassification(nn.Module):
    def __init__(self, num_class=3):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(81, 80)
        self.layer_2 = nn.Linear(80, 30)
        self.layer_3 = nn.Linear(30, 30)      
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        x = self.layer_1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_3(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        
        return x
    
def model_gens() -> MulticlassClassification:
    model = MulticlassClassification()
    return model
################################################################


net_CNN = vgg11_bn()
net_MLP = model_gens()


############################## fusion model #######################


class MyEnsemble(nn.Module):
    def __init__(self, nb_classes=3):
        super(MyEnsemble, self).__init__()
        self.model_image =  net_CNN
        self.model_EHR = net_MLP     

        # Create new classifier
        self.layer_out = nn.Linear(960, nb_classes)

    
    def forward(self, x1, x3):
        x1 = self.model_image(x1)

        x3 = self.model_EHR(x3)
        x3 = x3.view(x3.size(0), -1)

        x = torch.kron(x1, x3)
        x = self.layer_out(x)

        return x
    
    
def model_snn_vgg() -> MyEnsemble:
    model = MyEnsemble()
    return model



# model = model_snn_vgg()

# model.to(device=DEVICE,dtype=torch.float)
# print(summary(model,[(1,3, 224, 224),(1,81)]))

# print(model)

torch.kron(x1, x3) is changing the batch size as described in the docs.
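For example, with two 2-D inputs torch.kron multiplies both dimensions, so the batch dimension grows as well (a quick check using the feature sizes from the model above):

a = torch.randn(2, 32)   # VGG classifier output: [batch_size, 32]
b = torch.randn(2, 30)   # MLP output: [batch_size, 30]
print(torch.kron(a, b).shape)  # torch.Size([4, 960]) -- the batch dimension became 2 * 2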

Oh no :frowning:, that’s correct, it changes the batch size when I print the shape of the Kronecker product.

Thanks a lot for your answer. Do you think there is a way to keep the batch size and maintain the nature of the Kronecker product operation?
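A minimal sketch of one possible workaround: take the Kronecker product per sample (an outer product per row) instead of over the whole batch, assuming x1 and x3 are 2-D [batch_size, features] tensors as in the forward above (batched_kron is a hypothetical helper, not part of the original model):

def batched_kron(x1: torch.Tensor, x3: torch.Tensor) -> torch.Tensor:
    # Per-sample outer product, then flatten: [B, M] and [B, N] -> [B, M * N]
    return torch.einsum('bi,bj->bij', x1, x3).reshape(x1.size(0), -1)

out = batched_kron(torch.randn(2, 32), torch.randn(2, 30))
print(out.shape)  # torch.Size([2, 960]) -- batch size is preserved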

import os
import cv2
import imghdr
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from keras.initializers import RandomNormal, GlorotUniform

# Set the path to the ISIC dataset
dataset_path = 'data/train'

# Load the images and labels
image_paths = []
labels = []
label_df = pd.read_csv('data/ISIC_2019_Training_GroundTruth.csv')
for label in os.listdir(dataset_path):
    label_path = os.path.join(dataset_path, label)
    for image_name in os.listdir(label_path):
        if imghdr.what(os.path.join(label_path, image_name)) is not None:
            image_paths.append(os.path.join(label_path, image_name))
            labels.append(label_df[label_df.image == image_name])

# Convert the labels to integers using LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(labels)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(image_paths, labels, test_size=0.2)

# Create a data generator for data augmentation
datagen = ImageDataGenerator(rotation_range=40, width_shift_range=0.2, height_shift_range=0.2, zoom_range=0.2, horizontal_flip=True)

# Read and preprocess the images
X_train_array = []
for img_path in X_train:
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (128, 128))
    X_train_array.append(img)

X_test_array = []
for img_path in X_test:
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (128, 128))
    X_test_array.append(img)

# Convert the data to numpy arrays and normalize it
X_train = np.array(X_train_array) / 255.0
X_test = np.array(X_test_array) / 255.0

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_10236\3795273597.py in <module>
     27 # Convert the labels to integers using LabelEncoder
     28 le = LabelEncoder()
---> 29 labels = le.fit_transform(labels)
     30
     31 # Split the data into train and test sets

~\anaconda3\lib\site-packages\sklearn\preprocessing\_label.py in fit_transform(self, y)
    113         Encoded labels.
    114         """
--> 115         y = column_or_1d(y, warn=True)
    116         self.classes_, y = _unique(y, return_inverse=True)
    117         return y

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
   1036         return np.ravel(y)
   1037
-> 1038     raise ValueError(
   1039         "y should be a 1d array, got an array of shape {} instead.".format(shape)
   1040     )

ValueError: y should be a 1d array, got an array of shape (2239, 0, 10) instead.

Hi,
I have the same problem, but when I use print(x.shape), the code shows nothing!

This shouldn’t be the case so could you post a minimal and executable code snippet to reproduce the issue?

Posting screenshots is unfortunately not really helpful so could you post a minimal and executable code snippet by wrapping it into three backticks ```?

Thanks. Based on this explanation, I fixed my error.

Hi @ptrblck, I got a similar error:

/python3.8/site-packages/torch/nn/functional.py", line 3029, in cross_entropy

return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

ValueError: Expected input batch_size (1022) to match target batch_size (1)
I found that the input tensor is right, but somehow in the cross-entropy calculation the input batch_size and target batch_size are mismatched. Thanks a lot if you could help.

The log is quite long:


Installed CUDA version 11.4 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Installed CUDA version 11.4 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000020, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000020, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1
[2023-07-09 16:45:23,230] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000020, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000020, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1
[2023-07-09 16:45:23,538] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2023-07-09 16:45:23,541] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer
[2023-07-09 16:45:23,541] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2023-07-09 16:45:23,575] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[2023-07-09 16:45:23,575] [INFO] [utils.py:54:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2023-07-09 16:45:23,575] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
[2023-07-09 16:45:23,576] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
[2023-07-09 16:45:23,702] [INFO] [utils.py:785:see_memory_usage] Stage 3 initialize beginning
[2023-07-09 16:45:23,703] [INFO] [utils.py:786:see_memory_usage] MA 0.04 GB         Max_MA 0.8 GB         CA 0.81 GB         Max_CA 1 GB 
[2023-07-09 16:45:23,703] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 38.2 GB, percent = 15.2%
[2023-07-09 16:45:23,706] [INFO] [stage3.py:113:__init__] Reduce bucket size 500,000,000
[2023-07-09 16:45:23,706] [INFO] [stage3.py:114:__init__] Prefetch bucket size 50,000,000
[2023-07-09 16:45:23,808] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[2023-07-09 16:45:24,673] [INFO] [stage3.py:387:_setup_for_real_optimizer] optimizer state initialized
labels shape: torch.Size([2])labels shape: torch.Size([2])

input_ids shape:input_ids shape:  {torch.Size([2, 512])}
{torch.Size([2, 512])}
attention_mask shape:attention_mask shape:  {torch.Size([2, 512])}{torch.Size([2, 512])}

labels shape: torch.Size([2])
input_ids shape: {torch.Size([2, 512])}
attention_mask shape: {torch.Size([2, 512])}
[2023-07-09 16:45:24,851] [INFO] [utils.py:785:see_memory_usage] After initializing ZeRO optimizer
[2023-07-09 16:45:24,852] [INFO] [utils.py:786:see_memory_usage] MA 0.96 GB         Max_MA 0.96 GB         CA 1.74 GB         Max_CA 2 GB 
[2023-07-09 16:45:24,853] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 38.32 GB, percent = 15.2%
[2023-07-09 16:45:24,853] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedCPUAdam
[2023-07-09 16:45:24,853] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2023-07-09 16:45:24,853] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2023-07-09 16:45:24,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05], mom=[(0.9, 0.999)]
[2023-07-09 16:45:24,855] [INFO] [config.py:960:print] DeepSpeedEngine configuration:
[2023-07-09 16:45:24,855] [INFO] [config.py:964:print]   activation_checkpointing_config  {
    "partition_activations": false, 
    "contiguous_memory_optimization": false, 
    "cpu_checkpointing": false, 
    "number_checkpoints": null, 
    "synchronize_checkpoint_boundary": false, 
    "profile": false
}
[2023-07-09 16:45:24,855] [INFO] [config.py:964:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2023-07-09 16:45:24,855] [INFO] [config.py:964:print]   amp_enabled .................. False
[2023-07-09 16:45:24,855] [INFO] [config.py:964:print]   amp_params ................... False
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   autotuning_config ............ {
    "enabled": false, 
    "start_step": null, 
    "end_step": null, 
    "metric_path": null, 
    "arg_mappings": null, 
    "metric": "throughput", 
    "model_info": null, 
    "results_dir": "autotuning_results", 
    "exps_dir": "autotuning_exps", 
    "overwrite": true, 
    "fast": true, 
    "start_profile_step": 3, 
    "end_profile_step": 5, 
    "tuner_type": "gridsearch", 
    "tuner_early_stopping": 5, 
    "tuner_num_trials": 50, 
    "model_info_path": null, 
    "mp_size": 1, 
    "max_train_batch_size": null, 
    "min_train_batch_size": 1, 
    "max_train_micro_batch_size_per_gpu": 1.024000e+03, 
    "min_train_micro_batch_size_per_gpu": 1, 
    "num_tuning_micro_batch_sizes": 3
}
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   bfloat16_enabled ............. True
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   checkpoint_parallel_write_pipeline  False
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   checkpoint_tag_validation_enabled  True
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   checkpoint_tag_validation_fail  False
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x14ebe59c75e0>
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   communication_data_type ...... None
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   curriculum_enabled_legacy .... False
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   curriculum_params_legacy ..... False
[2023-07-09 16:45:24,856] [INFO] [config.py:964:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   data_efficiency_enabled ...... False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   dataloader_drop_last ......... False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   disable_allgather ............ False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   dump_state ................... False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   dynamic_loss_scale_args ...... None
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_enabled ........... False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_gas_boundary_resolution  1
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_layer_name ........ bert.encoder.layer
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_layer_num ......... 0
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_max_iter .......... 100
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_stability ......... 1e-06
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_tol ............... 0.01
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   eigenvalue_verbose ........... False
[2023-07-09 16:45:24,857] [INFO] [config.py:964:print]   elasticity_enabled ........... False
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   flops_profiler_config ........ {
    "enabled": false, 
    "recompute_fwd_factor": 0.0, 
    "profile_step": 1, 
    "module_depth": -1, 
    "top_modules": 1, 
    "detailed": true, 
    "output_file": null
}
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   fp16_auto_cast ............... None
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   fp16_enabled ................. False
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   fp16_master_weights_and_gradients  False
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   global_rank .................. 0
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   grad_accum_dtype ............. None
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   gradient_accumulation_steps .. 4
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   gradient_clipping ............ 0.0
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   gradient_predivide_factor .... 1.0
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   initial_dynamic_scale ........ 1
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   load_universal_checkpoint .... False
[2023-07-09 16:45:24,858] [INFO] [config.py:964:print]   loss_scale ................... 1.0
[2023-07-09 16:45:24,859] [INFO] [config.py:964:print]   memory_breakdown ............. False
[2023-07-09 16:45:24,859] [INFO] [config.py:964:print]   mics_hierarchial_params_gather  False
[2023-07-09 16:45:24,859] [INFO] [config.py:964:print]   mics_shard_size .............. -1
[2023-07-09 16:45:24,859] [INFO] [config.py:964:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True
[2023-07-09 16:45:24,860] [INFO] [config.py:964:print]   zero_enabled ................. True
[2023-07-09 16:45:24,860] [INFO] [config.py:964:print]   zero_force_ds_cpu_optimizer .. True
[2023-07-09 16:45:24,860] [INFO] [config.py:964:print]   zero_optimization_stage ...... 3
[2023-07-09 16:45:24,861] [INFO] [config.py:950:print_user_config]   json = {
    "train_batch_size": 32, 
    "train_micro_batch_size_per_gpu": 2, 
    "gradient_accumulation_steps": 4, 
    "zero_optimization": {
        "stage": 3, 
        "offload_optimizer": {
            "device": "cpu", 
            "nvme_path": null
        }, 
        "offload_param": {
            "device": "cpu", 
            "nvme_path": null
        }, 
        "stage3_gather_16bit_weights_on_model_save": true
    }, 
    "steps_per_print": inf, 
    "bf16": {
        "enabled": true
    }, 
    "fp16": {
        "enabled": false
    }, 
    "zero_allow_untested_optimizer": true
}
***** Running training *****
labels shape: torch.Size([2])
input_ids shape: {torch.Size([2, 512])}
attention_mask shape: {torch.Size([2, 512])}
Traceback (most recent call last):
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 363, in <module>
Traceback (most recent call last):
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 363, in <module>
Traceback (most recent call last):
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 363, in <module>
    main()
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 316, in main
    main()
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 316, in main
    outputs = model(**batch, use_cache=False)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    main()
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 316, in main
    outputs = model(**batch, use_cache=False)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    outputs = model(**batch, use_cache=False)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1735, in forward
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
        return forward_call(*args, **kwargs)ret_val = func(*args, **kwargs)

  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1735, in forward
    ret_val = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1735, in forward
    loss = self.module(*inputs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    loss = self.module(*inputs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    loss = self.module(*inputs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/peft/peft_model.py", line 575, in forward
    result = forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/peft/peft_model.py", line 575, in forward
    return self.base_model(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/peft/peft_model.py", line 575, in forward
    return self.base_model(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    return self.base_model(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 714, in forward
    result = forward_call(*args, **kwargs)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 714, in forward
    loss = loss_fct(shift_logits, shift_labels)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 714, in forward
    loss = loss_fct(shift_logits, shift_labels)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    loss = loss_fct(shift_logits, shift_labels)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1174, in forward
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1174, in forward
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 363, in <module>
    main()
    return F.cross_entropy(input, target, weight=self.weight,    
return forward_call(*args, **kwargs)  File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 3029, in cross_entropy

  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1174, in forward
  File "/workspace/work00/sue-xie/llama/kyano_lora/src/main_paws.py", line 316, in main
    outputs = model(**batch, use_cache=False)
    return F.cross_entropy(input, target, weight=self.weight,
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 3029, in cross_entropy
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
    return F.cross_entropy(input, target, weight=self.weight,
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 3029, in cross_entropy
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1735, in forward
    loss = self.module(*inputs, **kwargs)
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
ValueError: Expected input batch_size (1022) to match target batch_size (1).
  File "/opt/conda/lib/python3.8/site-packages/peft/peft_model.py", line 575, in forward
    return self.base_model(
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
ValueError: Expected input batch_size (1022) to match target batch_size (1).
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 714, in forward
    loss = loss_fct(shift_logits, shift_labels)
ValueError: Expected input batch_size (1022) to match target batch_size (1).
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1174, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 3029, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
ValueError: Expected input batch_size (1022) to match target batch_size (1).

And my code is below:

import evaluate
metric = evaluate.load("accuracy")

def evaluate(args, model, eval_dataloader, accelerator, eval_dataset):
    model.eval()
    losses = []

    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    losses = torch.cat(losses)
    metric.compute()
    try:
        eval_loss = torch.mean(losses)
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")
    return perplexity, eval_loss, metric.compute()

from transformers import AutoTokenizer, DataCollatorWithPadding

#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,pad_to_multiple_of=8,mlm=False)

#metric = evaluate.load("accuracy")

def main():
    args = parse_args()

    accelerator = Accelerator(log_with="wandb")

    hps = {"learning_rate": args.learning_rate}
    accelerator.init_trackers(args.wandb_name)

    set_random_seed(args.seed)

    tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path,
                                               fast_tokenizer=True)
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)
    # tokenizer.pad_token_id = (
    #     0
    # )
    # tokenizer.padding_side = "left"

    # tokenizer.pad_token = tokenizer.eos_token

    model = create_hf_model(LlamaForCausalLM, args.model_name_or_path,
                            tokenizer)

    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=args.lora_dim, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout)

    model = get_peft_model(model, peft_config)
    with accelerator.main_process_first():
        train_dataset = create_dataset(
            args.local_rank,  # invalid
            args.data_output_path,
            args.seed,
            tokenizer,
            args.max_seq_len,
            True,
            #imitation_model=args.imitation_model  # invalid
        )
        #print(train_dataset[0])
        eval_dataset = create_dataset(
            args.local_rank,
            args.data_output_path,
            args.seed,
            tokenizer,
            args.max_seq_len,
            False,
            #imitation_model=args.imitation_model
        )

    accelerator.wait_for_everyone()

    # DataLoaders creation:
    train_dataloader = DataLoader(
        train_dataset, collate_fn=data_collator,
        batch_size=args.per_device_train_batch_size
    )

    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=data_collator,
        batch_size=args.per_device_eval_batch_size
    )

    # train_dataloader = DataLoader(train_dataset,
    #                             #   collate_fn=DataCollatorForSeq2Seq(
    #                             #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    #                             #   ),
    #                             #   data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    #                             batch_size=args.per_device_train_batch_size)
    # eval_dataloader = DataLoader(eval_dataset,
    #                             #  collate_fn=DataCollatorForSeq2Seq(
    #                             #     tokenizer, pad_to_multiple_of=8,
    #                             #     return_tensors="pt", padding=True
    #                             #  ),
    #                              batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    # New Code #
    # Creates Dummy Optimizer if `optimizer` was specified in the config file else creates Adam Optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )

    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
    )

    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, optimizer, lr_scheduler)

    # Train!
    print_rank_0("***** Running training *****", accelerator.process_index)

    for epoch in range(args.num_train_epochs):
        current_step = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            #sue
            if step == 0:
                print(f'labels shape: {batch["labels"].shape}')
                print("input_ids shape:", {batch["input_ids"].shape})
                print("attention_mask shape:", {batch["attention_mask"].shape})

            outputs = model(**batch, use_cache=False)
            train_loss = outputs.loss
            accelerator.backward(train_loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.log({"train_loss": train_loss})
            accelerator.log({"lr": lr_scheduler.get_lr()[0]})
            if step % 300 == 0:
                print_rank_0(f"Epoch is {epoch}, Step is {step}, train_loss is {train_loss.item()}", accelerator.process_index)

        ppl, eval_loss, eval_metric = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
        if accelerator.is_main_process:
            print_rank_0(f"eval_loss: {eval_loss}, ppl: {ppl}", accelerator.process_index)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)

        # New Code #
        # Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
        # `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
        # `zero3_save_16bit_model` is True in DeepSpeed Plugin.
        # For Zero Stages 1 and 2, models are saved as usual in the output directory.
        # The model name saved is `pytorch_model.bin`
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
            state_dict=accelerator.get_state_dict(model),
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)

    accelerator.end_training()


if __name__ == "__main__":
    main()

It seems you are using a higher-level API, which hides the implementation of the loss calculation, so it’s unclear to me which operation creates the unexpected shape.
In any case, in a multi-class classification use case nn.CrossEntropyLoss expects the model outputs to contain logits and have the shape [batch_size, nb_classes] while the targets should have the shape [batch_size] containing class indices in the range [0, nb_classes-1].
Based on the error message the batch size changed between the model output (1022) and the target (1).
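For reference, a minimal sketch of the shapes nn.CrossEntropyLoss expects (with made-up sizes):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
batch_size, nb_classes = 4, 3
output = torch.randn(batch_size, nb_classes)          # logits: [batch_size, nb_classes]
target = torch.randint(0, nb_classes, (batch_size,))  # class indices: [batch_size]
loss = criterion(output, target)                      # works: both batch sizes are 4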

Thanks for your confirmation! Yes, I’m using Accelerate to speed up my training; the code is cloned from others, but I need to adapt it to my data. The labels have 2 categories. The cross-entropy layer is connected with lr_scheduler_type, which was previously set to cosine; would setting it to linear change anything?

I don’t think the learning rate scheduler should have any influence on the used criterion (or vice versa). It also won’t change the shape of any tensors so you would still need to check which shape is expected and if the tensor is flattened (by mistake) e.g. in your forward method.

Hi, I encountered the same problem, can you help me? Thanks a lot!

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the CNN architecture
class TimeSeriesCNN(nn.Module):
    def __init__(self, input_channels, output_size):
        super(TimeSeriesCNN, self).__init__()
        self.conv_layer = nn.Sequential(
            nn.Conv1d(in_channels=input_channels, out_channels=16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.fc_layer = nn.Sequential(
            #nn.Linear(32 * final_sequence_length, 64),
            nn.Linear(288, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)
        )

    def forward(self, x):
        x = self.conv_layer(x)
        print(x.shape)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc_layer(x)
        return x
# Parameters
input_channels = 1  # Number of input channels (features)
output_size = 4    # Number of output classes or regression values
#output_size = 1    # Number of output classes or regression values
batch_size = 32
learning_rate = 0.001
num_epochs = 10
final_sequence_length = 10  # Adjust this according to your time series sequence length
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Initialize the model, loss function, and optimizer
model = TimeSeriesCNN(input_channels, output_size)
#criterion = nn.MSELoss()  # You can change the loss function as needed
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#target_shape = (19138, 32, 4)
#target_shape = (19138, 4)
#target = torch.zeros(*target_shape)
#target[:, 3] = 1
#target[:, :, 3] = 1
#target = torch.ones(19138, dtype=int)
target = torch.randn(32, 4).softmax(dim=1)
print(target.shape)
print(target[:3])


# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(batch.unsqueeze(1))  # Add the channel dimension
        print(outputs.shape)
        loss = criterion(outputs, target)  # Replace 'target' with your target values
        #loss = criterion(outputs, target.view(1, -1))  # Replace 'target' with your target values
        #loss = criterion(outputs, target.view(-1, 1))  # Replace 'target' with your target values
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
torch.Size([32, 32, 9])
torch.Size([32, 4])
torch.Size([32, 32, 9])
torch.Size([32, 4])
torch.Size([32, 32, 9])
torch.Size([32, 4])
torch.Size([2, 32, 9])
torch.Size([2, 4])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[34], line 23
     21 outputs = model(batch.unsqueeze(1))  # Add the channel dimension
     22 print(outputs.shape)
---> 23 loss = criterion(outputs, target)  # Replace 'target' with your target values
     24 #loss = criterion(outputs, target.view(1, -1))  # Replace 'target' with your target values
     25 #loss = criterion(outputs, target.view(-1, 1))  # Replace 'target' with your target values
     26 loss.backward()

File ~/anaconda3/envs/python310_pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/anaconda3/envs/python310_pytorch/lib/python3.10/site-packages/torch/nn/modules/loss.py:1174, in CrossEntropyLoss.forward(self, input, target)
   1173 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1174     return F.cross_entropy(input, target, weight=self.weight,
   1175                            ignore_index=self.ignore_index, reduction=self.reduction,
   1176                            label_smoothing=self.label_smoothing)

File ~/anaconda3/envs/python310_pytorch/lib/python3.10/site-packages/torch/nn/functional.py:3029, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3027 if size_average is not None or reduce is not None:
   3028     reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3029 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

ValueError: Expected input batch_size (2) to match target batch_size (32).

I don’t know where the print statements were added, but you can see that your activation has a reduced batch size of 2, which looks valid assuming it could be the last batch containing fewer samples. Check why the target contains more values, as it should also have the same batch size.

EDIT: based on your code you are creating the target once instead of loading it from the DataLoader. I’m not familiar with your use case and don’t know why you are using this approach, but either load the target from the dataset or create it based on the shape of the input inside the data loader loop.

The reason I’m not reading the target from the dataloader is that I haven’t preprocessed the labels in a proper way yet. But anyhow, the label for each data point of this time-series snippet is supposed to be the same. I have a multiclass classification problem here: 0-3, i.e. 4 classes. The current dataset I’m playing with is, let’s say, 10 minutes of time-series data from one designated subject or participant, and I just need to predict the level (4 tiers) of one attribute of this subject. I supposed that creating a target tensor myself and throwing it into the criterion function would be fine. Am I wrong?

Yes, you are missing the issue of the variable batch size as your print statements show.
Not all your batches have 32 samples, so either use drop_last=True, assuming only the last batch is smaller, or create the target based on the current batch size instead of a static value.
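For example, a minimal sketch of both options, reusing the names from the snippet above (dataset, model, criterion, output_size, and batch_size are assumed to be defined as before):

# Option 1: drop the smaller final batch so every batch has exactly batch_size samples
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Option 2: build the placeholder target from the current batch size inside the loop
for batch in dataloader:
    outputs = model(batch.unsqueeze(1))                               # [current_batch, output_size]
    target = torch.randn(batch.size(0), output_size).softmax(dim=1)   # [current_batch, output_size]
    loss = criterion(outputs, target)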


Thanks! That helped fix it.