Dear all,
I’m using a Feature-Tokenizer Transformer model written in PyTorch. I’m running several experiments on a tabular dataset, and until now everything has worked fine. But during an ablation study (removing some features to check the model’s performance), my jobs started crashing when I removed the first group of features (I have around 10 633 features; the first 10 481 form one group, which leaves only 152 features). If I remove other features instead (for example the last 152, while keeping the first 10 481), everything works, but not if I remove the first group. After some tests, I realized it only fails on multiple GPUs (I’m using torch.nn.DataParallel, and for reasons related to the framework I use I can’t move to torch.nn.parallel.DistributedDataParallel).
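For context, the feature removal itself is just column slicing on the input matrix before the dataloaders are built; roughly like this (a simplified, self-contained sketch with made-up variable names, not the actual helper from my framework):

import numpy as np

# Hypothetical sketch of the ablation (names and data are made up, not my framework's code)
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10633))   # stand-in for my real feature matrix
N_FIRST_GROUP = 10481                # size of the first feature group

X_without_first_group = X[:, N_FIRST_GROUP:]   # keeps the last 152 columns -> this variant crashes on multi-GPU
X_without_last_152 = X[:, :N_FIRST_GROUP]      # keeps the first 10481 columns -> this variant works fine
print(X_without_first_group.shape, X_without_last_152.shape)   # (1000, 152) (1000, 10481)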
Here is the error I receive:
Traceback (most recent call last):
File "/home/my_user_name/my-framework/main.py", line 38, in <module>
run(args) # Run the framework with the parsed arguments
^^^^^^^^^
File "/home/my_user_name/my-framework/main.py", line 32, in run
pipeline.run(args) # Run the pipeline with the provided arguments
^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/my-framework/pipelines/evaluation.py", line 38, in run
model.run(args, X, y, le_species, le_header) # Run the model's evaluation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/my-framework/models/ftt.py", line 444, in run
self.evaluate_model(args, X, y, le_species, le_header) # Run evaluation pipeline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/my-framework/models/ftt.py", line 100, in evaluate_model
best_fold_accuracy = self.evaluate_fold(args, X, y, le_species, le_header, fold, split_assignments)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/my-framework/models/ftt.py", line 316, in evaluate_fold
accuracy, model = self.evaluate_epoch(args, model, train_dataloader, test_dataloader, epoch, device, optimizer, criterion, le_header)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/my-framework/models/ftt.py", line 408, in evaluate_epoch
outputs = model(x_num=batch, x_cat=None) # Forward pass to get output/logits
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
output.reraise()
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
output = module(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/rtdl/modules.py", line 1487, in forward
x = self.transformer(x)
^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/rtdl/modules.py", line 1150, in forward
x_residual, _ = layer['attention'](
^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/rtdl/modules.py", line 893, in forward
k = key_compression(k.transpose(1, 2)).transpose(1, 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/my_user_name/.conda/envs/my-env/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: size mismatch, got 4096, 4096x153,0
And here is my code:
model_parameters = get_model_parameters(args, X_train.shape[1], len(le_header.classes_)) # Get model parameters
model = FTT(**model_parameters) # Create an instance of the FTT model
device = get_device(args) # Set the device to run the model on (GPU)
model = torch.nn.DataParallel(model) # Wrap the model to run it on several GPUs in parallel
model.to(device) # Move the model to the specified device (GPU or CPU)
criterion = get_criterion(args, y_train) # Instantiate the loss class
criterion.to(device) # Move the loss function to the specified device (GPU or CPU)
optimizer = get_optimizer(args, model) # Instantiate the optimizer class
scheduler = get_scheduler(args, optimizer) # Instantiate the learning rate scheduler class
for epoch in range(args.num_epochs):
    for batch, labels in train_dataloader: # Iterate through the train dataset
        batch = batch.requires_grad_().to(device) # Enable gradients on the batch and move it to the device
        labels = labels.to(device) # Use GPU for tensors
        optimizer.zero_grad() # Clear gradients w.r.t. parameters
        outputs = model(x_num=batch, x_cat=None) # Forward pass to get outputs/logits
        loss = criterion(outputs, labels) # Calculate loss: softmax --> cross-entropy loss
        loss.backward() # Compute gradients w.r.t. parameters
        optimizer.step() # Update parameters
# The code continues, but the crash happens before this point
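Since the failure only shows up with multiple GPUs, here is how I understand what DataParallel does with the batch: it replicates the model on every GPU and splits tensor inputs, including keyword arguments like x_num, along dimension 0, so each replica only sees a slice of the batch. A simplified illustration of that splitting (not my actual code; the number of GPUs is just an example):

import torch

full_batch = torch.randn(512, 152)        # same shape as one of my batches
n_gpus = 2                                # example value; my cluster has several GPUs
chunks = full_batch.chunk(n_gpus, dim=0)  # roughly what DataParallel's scatter does before calling each replica
for i, chunk in enumerate(chunks):
    print(f"replica {i} receives x_num of shape {tuple(chunk.shape)}")  # e.g. (256, 152)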
The code fails the first time it enters the loop and computes the forward pass. Once again, I want to stress the following facts:
- it only happens when I remove the first group of features, i.e., the first 10 481 features. If I remove other features (or if I don’t remove any feature), the code works fine.
- it only happens if I’m on a cluster with several GPUs. If I run it on my local machine (on which I have only one GPU), the code works fine.
- the shape of one batch is [batch_size, num_input_features], i.e., [512, 152]. I don’t know why the error mentions the number 4096 (153 is normal: it might seem strange, but the attribute n_tokens of the model is actually the number of input features + 1; see also the small extra check after the model printout below).
- this is what I get if I print the devices of the parameters of my model:
for i in model.named_parameters():
print(f"{i[0]} -> {i[1].device}")
__________________________________________________
module.feature_tokenizer.num_tokenizer.weight -> cuda:0
module.feature_tokenizer.num_tokenizer.bias -> cuda:0
module.cls_token.weight -> cuda:0
module.transformer.blocks.0.attention.W_q.weight -> cuda:0
module.transformer.blocks.0.attention.W_q.bias -> cuda:0
module.transformer.blocks.0.attention.W_k.weight -> cuda:0
module.transformer.blocks.0.attention.W_k.bias -> cuda:0
module.transformer.blocks.0.attention.W_v.weight -> cuda:0
module.transformer.blocks.0.attention.W_v.bias -> cuda:0
module.transformer.blocks.0.attention.W_out.weight -> cuda:0
module.transformer.blocks.0.attention.W_out.bias -> cuda:0
module.transformer.blocks.0.ffn.linear_first.weight -> cuda:0
module.transformer.blocks.0.ffn.linear_first.bias -> cuda:0
module.transformer.blocks.0.ffn.linear_second.weight -> cuda:0
module.transformer.blocks.0.ffn.linear_second.bias -> cuda:0
module.transformer.blocks.0.ffn_normalization.weight -> cuda:0
module.transformer.blocks.0.ffn_normalization.bias -> cuda:0
module.transformer.blocks.0.key_compression.weight -> cuda:0
module.transformer.blocks.0.value_compression.weight -> cuda:0
module.transformer.head.normalization.weight -> cuda:0
module.transformer.head.normalization.bias -> cuda:0
module.transformer.head.linear.weight -> cuda:0
module.transformer.head.linear.bias -> cuda:0
- if I print the shape and device of the batch/labels, I get this:
print(batch.shape)
print(batch.device)
print(labels.shape)
print(labels.device)
__________________________________________________
torch.Size([512, 152])
cuda:0
torch.Size([512])
cuda:0
- if I print the model, I get this:
print(model)
__________________________________________________
DataParallel(
  (module): FTT(
    (feature_tokenizer): FeatureTokenizer(
      (num_tokenizer): NumericalFeatureTokenizer()
    )
    (cls_token): CLSToken()
    (transformer): Transformer(
      (blocks): ModuleList(
        (0): ModuleDict(
          (attention): MultiheadAttention(
            (W_q): Linear(in_features=16, out_features=16, bias=True)
            (W_k): Linear(in_features=16, out_features=16, bias=True)
            (W_v): Linear(in_features=16, out_features=16, bias=True)
            (W_out): Linear(in_features=16, out_features=16, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
          (ffn): FFN(
            (linear_first): Linear(in_features=16, out_features=32, bias=True)
            (activation): ReGLU()
            (dropout): Dropout(p=0.1, inplace=False)
            (linear_second): Linear(in_features=16, out_features=16, bias=True)
          )
          (attention_residual_dropout): Dropout(p=0.0, inplace=False)
          (ffn_residual_dropout): Dropout(p=0.0, inplace=False)
          (output): Identity()
          (ffn_normalization): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (key_compression): Linear(in_features=153, out_features=0, bias=False)
          (value_compression): Linear(in_features=153, out_features=0, bias=False)
        )
      )
      (head): Head(
        (normalization): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (activation): ReLU()
        (linear): Linear(in_features=16, out_features=228, bias=True)
      )
    )
  )
)
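For completeness, the extra check mentioned above: a quick way to cross-check the compression layers that the traceback points at is to print parameter shapes instead of devices (an ad-hoc snippet, not part of my framework). Given 152 input features plus the CLS token, these layers have in_features=153, so their weights have shape (0, 153), which matches the Linear(in_features=153, out_features=0, bias=False) entries in the printout above.

# Ad-hoc check: print the shapes (and devices) of the tokenizer and compression parameters
for name, param in model.named_parameters():
    if "compression" in name or "tokenizer" in name:
        print(f"{name}: shape={tuple(param.shape)}, device={param.device}")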
Thanks for your help!