Distributed parallel error: TypeError: unsupported operand type(s) for +: 'int' and 'CrossEntropyLoss'

the error message comes from:
return next(self._sampler_iter) # may raise StopIteration
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/utils/data/sampler.py”, line 229, in iter
for idx in self.sampler:
python3.7/site-packages/torch/utils/data/distributed.py", line 98, in iter
g.manual_seed(self.seed + self.epoch)
TypeError: unsupported operand type(s) for +: ‘int’ and ‘CrossEntropyLoss’
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 6341) of binary: /home/usrname/.conda/envs/branch-env/bin/python…

Can anybody help me with that?

Based on the error message you are trying to add an int to an object, which will fail, as seen here:

criterion = nn.CrossEntropyLoss()
1 + criterion
# TypeError: unsupported operand type(s) for +: 'int' and 'CrossEntropyLoss'

so check where this line of code is executed and make sure to call the criterion with valid tensor inputs instead.

Hi, thanks for the information; that problem is solved now. I have another question. When I launch the script with:
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 file.py,
and executing this line of code:
if args.local_rank == 0:
torch.save()
I get an error message saying: ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0, pytorch ddp.py’ not found.
Do you have any ideas about how to fix it?

I haven’t seen this error message before so could you show the content of file.py?

import torch
import gc
gc.collect()
torch.cuda.empty_cache()

import os
import numpy as np
from PIL import Image
from torchvision import transforms,models, utils
from sklearn import preprocessing
import argparse
import random
from torch.utils.tensorboard import SummaryWriter
from collections import defaultdict,OrderedDict
from copy import deepcopy
from torch.optim import Adam as Adam
from torch.nn import CrossEntropyLoss as CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
import torch
import random
from torch.utils.data.distributed import DistributedSampler
from apex.parallel import DistributedDataParallel as DDP
from apex.parallel import convert_syncbn_model
from apex import amp

# reproducibility: seed every RNG the script draws from
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

# Command-line options. The option names use plain ASCII "--" and quotes
# (the pasted version had typographic characters that Python cannot parse).
parser = argparse.ArgumentParser()
parser.add_argument('--sub_idx', default=3, type=int)
parser.add_argument('--categories', default=51, type=int)
parser.add_argument('--writerpath', default='/home/nikiguo93/python_examples/DataLoader/places365/runs/', type=str)
parser.add_argument('--modelname', default='resnet50', type=str)
parser.add_argument('--arch', default='resnet', type=str)
parser.add_argument('--lr', default=5e-6, type=float)
parser.add_argument('--MaxEpoch', default=150, type=int)
# Kept as a string flag: the script compares it against the literal 'True'.
parser.add_argument('--mtl', default='True', type=str)
# torch.distributed.launch passes the rank as --local_rank (older releases)
# or --local-rank (newer releases); accept both and fall back to the
# LOCAL_RANK environment variable.
parser.add_argument('--local_rank', '--local-rank', dest='local_rank',
                    default=os.getenv('LOCAL_RANK', 0), type=int)

parser.add_argument('--dirpath', default='/scratch/nikiguo93/results_relu_fc_bk3_bk4/', type=str)

args = parser.parse_args()

print(args.mtl)

writer = SummaryWriter(args.writerpath + 'train_places365_mtl' + str(args.mtl))

# Disabled alternative initialisation, kept for reference as an inert string.
'''
num_of_gpu = torch.cuda.device_count()
print(num_of_gpu)
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['NCCL_DEBUG']='INFO'
os.environ['NCCL_DEBUG_SUBSYS']='ALL'
os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'

# set up distributed device

rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(rank % torch.cuda.device_count())
torch.distributed.init_process_group(backend='nccl' if torch.distributed.is_nccl_available() else 'gloo')
device = torch.device('cuda',local_rank)

print(f"[init] == local rank: {local_rank}, global rank: {rank} ==")
'''

# Bind this process to its GPU before creating the process group.
args.gpu = args.local_rank
torch.cuda.set_device(args.gpu)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

class Places365Dataset(Dataset):
    """Subsampled Places365 image dataset restricted to selected categories.

    Both the ``train`` and ``val`` sub-directories of ``image_path`` are
    scanned; only category folders named in ``file_list`` are kept. The
    sorted file lists are then thinned out by index residue (400 residues
    out of every 1000 for train, 50 out of every 100 for val), drawn with
    the ``random`` module, so the selection depends on the RNG state at
    construction time.

    Args:
        image_path: Root directory containing ``train`` and ``val``.
        file_list: Category folder names to include.
        transform: Callable applied to each RGB ``PIL.Image``.
        train: If True, ``__len__``/``__getitem__`` serve the train split,
            otherwise the val split.

    Raises:
        ValueError: (from ``LabelEncoder.transform``) if the val split
            contains a category that is absent from the train split.
    """

    def __init__(self, image_path, file_list, transform, train=True):
        self.train = train
        self.transform = transform

        wanted = set(file_list)  # O(1) membership instead of scanning a list
        train_samples = self._collect(image_path, 'train', wanted)
        test_samples = self._collect(image_path, 'val', wanted)

        # The two random.sample calls are made in the original order
        # (train first, then val) so a seeded run picks the same residues.
        # NOTE(review): range(0, 999) / range(0, 99) exclude the last
        # residue (999 / 99), so those indices are never selected. Kept
        # as-is to preserve the existing subsample.
        train_samples = self._subsample(train_samples, period=1000, keep=400)
        test_samples = self._subsample(test_samples, period=100, keep=50)

        self.train_img_names = [path for path, _ in train_samples]
        self.test_img_names = [path for path, _ in test_samples]
        train_labels = [label for _, label in train_samples]
        test_labels = [label for _, label in test_samples]

        # Fit the encoder once on the train labels and reuse it for val so
        # that a category always maps to the same integer id in both splits.
        le = preprocessing.LabelEncoder()
        le.fit(train_labels)
        self.train_lbl = torch.as_tensor(le.transform(train_labels))
        self.test_lbl = torch.as_tensor(le.transform(test_labels))

    @staticmethod
    def _collect(image_path, split, wanted):
        """Return sorted (file path, category folder) pairs for one split."""
        split_dir = os.path.join(image_path, split)
        samples = []
        for folder in os.listdir(split_dir):
            if folder not in wanted:
                continue
            folder_dir = os.path.join(split_dir, folder)
            # The label is the folder name itself, independent of how deep
            # image_path is in the filesystem.
            samples.extend(
                (os.path.join(folder_dir, img_name), folder)
                for img_name in os.listdir(folder_dir)
            )
        samples.sort()  # paths are unique, so this orders by path
        return samples

    @staticmethod
    def _subsample(samples, period, keep):
        """Keep samples whose index modulo ``period`` is one of ``keep`` random residues."""
        chosen = set(random.sample(range(0, period - 1), keep))
        return [s for i, s in enumerate(samples) if i % period in chosen]

    def __len__(self):
        if self.train:
            return len(self.train_img_names)
        return len(self.test_img_names)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.train:
            names, labels = self.train_img_names, self.train_lbl
        else:
            names, labels = self.test_img_names, self.test_lbl

        # Close the file handle as soon as the pixels have been decoded.
        with Image.open(names[idx]) as img:
            TheImage = self.transform(img.convert('RGB'))

        return {'input': TheImage, 'label': labels[idx]}

image_path = '/scratch/nikiguo93/places365/'

# The 51 Places365 category folders used in this experiment.
file_list = ['mountain_snowy', 'bakery-shop', 'lighthouse', 'beach', 'ball_pit', 'dining_room', 'highway', 'bridge', 'creek',
             'waterfall', 'forest-broadleaf', 'nursery', 'campsite', 'car_interior', 'toyshop', 'shoe_shop', 'bookstore', 'coast', 'ocean', 'corridor',
             'skyscraper', 'water_tower', 'closet', 'landing_deck', 'florist_shop-indoor', 'park', 'pasture', 'pantry', 'swimming_pool-outdoor',
             'supermarket', 'conference_room', 'corn_field', 'movie_theater-indoor', 'laundromat', 'staircase', 'street', 'home_theater', 'utility_room',
             'auditorium', 'river', 'castle', 'butchers_shop', 'windmill', 'desert-sand', 'tower', 'cockpit', 'office', 'kitchen', 'carrousel',
             'playground', 'wheat_field']

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# NOTE(review): Resize(224) with an int only fixes the shorter edge; this
# presumably relies on the source images being square - confirm, otherwise
# batches cannot be collated.
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    normalize,
])

PLACES_Traindataset = Places365Dataset(image_path, file_list, transform, train=True)
PLACES_Testdataset = Places365Dataset(image_path, file_list, transform, train=False)

# Each process gets its own shard of the data. The validation sampler does
# not shuffle (DistributedSampler shuffles by default).
train_sampler = torch.utils.data.distributed.DistributedSampler(PLACES_Traindataset, shuffle=True)
val_sampler = torch.utils.data.distributed.DistributedSampler(PLACES_Testdataset, shuffle=False)

TraindataLoader = DataLoader(PLACES_Traindataset, batch_size=64, num_workers=8, pin_memory=True, sampler=train_sampler)
TestdataLoader = DataLoader(PLACES_Testdataset, batch_size=64, num_workers=8, shuffle=False, sampler=val_sampler)

# Drop the module-level names; the loaders keep their own references.
del PLACES_Traindataset
del PLACES_Testdataset

def load_model(model, modelpath):
    """Load the ``'best_model'`` weights of a checkpoint into ``model`` in place.

    Checkpoint keys have a leading ``'module.'`` stripped. A tensor is
    copied only when the model has a parameter/buffer of the same name and
    size; everything else is skipped and reported. Afterwards the model is
    switched to eval mode and moved to the GPU when one is available.

    Args:
        model: ``torch.nn.Module`` to receive the weights.
        modelpath: Path of a checkpoint file containing a ``'best_model'``
            state dict.

    Raises:
        FileNotFoundError: If ``modelpath`` is not an existing file.
        KeyError: If the checkpoint has no ``'best_model'`` entry.
    """
    if not os.path.isfile(modelpath):
        # Fail with a clear message instead of a NameError further down.
        raise FileNotFoundError("=> no checkpoint found at '{}'".format(modelpath))

    print("=> loading checkpoint '{}'".format(modelpath))
    # Deserialize on the CPU so every rank does not first materialise the
    # tensors on the device they were saved from.
    checkpoint = torch.load(modelpath, map_location='cpu')

    state_dict = checkpoint['best_model']
    model_dict = model.state_dict()
    new_state_dict = OrderedDict()
    matched_layers, discarded_layers = [], []

    for k, v in state_dict.items():
        if k.startswith('module.'):
            k = k[7:]  # discard module.

        if k in model_dict and model_dict[k].size() == v.size():
            new_state_dict[k] = v
            matched_layers.append(k)
        else:
            # Recorded per key (the original ``else`` was attached to the
            # ``for`` and therefore ran once, after the loop).
            discarded_layers.append(k)

    if discarded_layers:
        print("=> {} checkpoint entries skipped (missing or size mismatch): {}".format(
            len(discarded_layers), discarded_layers))

    model_dict.update(new_state_dict)
    model.load_state_dict(model_dict)
    model.eval()
    if torch.cuda.is_available():
        model.cuda()

class CustomResNet(torch.nn.Module):
    """Two-headed network built from truncated copies of a torchvision backbone.

    ``forward`` returns ``(x_eeg, x_img)``: a 1700-dimensional regression
    output computed from two truncated feature stacks, and a
    1654-way classification output computed from the deepest stack.

    Args:
        modelname: Name of a model constructor in ``torchvision.models``
            (looked up through ``models.__dict__``).
    """

    def __init__(self, modelname):
        super(CustomResNet, self).__init__()
        # NOTE(review): the constructor arguments were lost when the code
        # was pasted; ``pretrained=True`` is assumed to match the plain
        # ResNet branch of this script - confirm.
        resnet50 = models.__dict__[modelname](pretrained=True)
        # Progressively shorter prefixes of the backbone's child modules.
        # features_3 / features_4 are not used in forward() but are kept so
        # the module's state-dict keys stay unchanged.
        self.features_1 = torch.nn.ModuleList(resnet50.children())[:-2]
        self.features_2 = torch.nn.ModuleList(resnet50.children())[:-3]
        self.features_3 = torch.nn.ModuleList(resnet50.children())[:-4]
        self.features_4 = torch.nn.ModuleList(resnet50.children())[:-5]

        # The Linear input sizes fix the flattened feature size each stack
        # must produce (100352 and 200704 values per sample).
        self.part_1 = torch.nn.Sequential(
            *self.features_1,
            torch.nn.ReLU(),
            torch.nn.Flatten(),
            torch.nn.Linear(100352, 256)
        )

        self.part_2 = torch.nn.Sequential(
            *self.features_2,
            torch.nn.ReLU(),
            torch.nn.Flatten(),
            torch.nn.Linear(200704, 256)
        )

        # Consumes the concatenation of part_1 and part_2 (256 + 256).
        self.regression = torch.nn.Sequential(
            torch.nn.ReLU(),
            torch.nn.Linear(512, 1700))

        self.classification = torch.nn.Sequential(
            *self.features_1,
            torch.nn.AdaptiveAvgPool2d((1, 1)),
            torch.nn.Flatten(),
            torch.nn.Linear(2048, 1654))

    def forward(self, x):
        x1 = self.part_1(x)
        x2 = self.part_2(x)
        x_eeg = torch.cat([x1, x2], dim=1)

        x_eeg = self.regression(x_eeg)
        x_img = self.classification(x)

        return x_eeg, x_img

# Build the network to fine-tune. --mtl is a string flag, so compare with
# the literal 'True'.
if args.mtl == 'True':
    print(args.mtl)
    mtl = CustomResNet(args.modelname)
    modelpath = args.dirpath + 'real_zeeg_sub' + str(args.sub_idx) + '_resnet50.pth'
    load_model(model=mtl, modelpath=modelpath)
    # Keep only the classification branch and replace its final Linear
    # (index 10 of the Sequential) with one sized for this task.
    model_ft = mtl.classification
    del mtl
    num_ftrs = model_ft[10].in_features
    model_ft[10] = torch.nn.Linear(num_ftrs, args.categories, bias=True)
else:
    model_ft = models.resnet50(pretrained=True)
    # Give the model a 1654-way head first so the checkpoint's fc weights
    # match in size and are loaded, then swap in the task-specific head.
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = torch.nn.Linear(num_ftrs, 1654, bias=True)
    modelpath = args.dirpath + 'finetune_classification_resnet50_all.pth'
    load_model(model_ft, modelpath)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = torch.nn.Linear(num_ftrs, args.categories, bias=True)

# freeze part of the resnet: the first 8 child modules stay fixed
count = 0
for child in model_ft.children():
    count += 1
    if count < 9:
        for param in child.parameters():
            param.requires_grad = False

model_ft.cuda()

# parameters to optimize
params_to_update = model_ft.parameters()
print("Params to learn:")
for name, param in model_ft.named_parameters():
    if param.requires_grad == True:
        print("\t", name)

# define the optimizer
#optimizer_ft = SGD(params_to_update, lr = 0.01,momentum=0.09,weight_decay=0.0005)
optimizer_ft = Adam(params_to_update, lr=args.lr)

# Mixed precision (apex amp) must be initialised before wrapping in DDP.
model_ft, optimizer_ft = amp.initialize(model_ft, optimizer_ft, opt_level='O2')
model_ft = DDP(model_ft, delay_allreduce=True)

def reduce_value(value, average=True):
    """Sum (and optionally average) a tensor across all distributed processes.

    Args:
        value: Tensor to reduce. With two or more processes it is summed
            in place by ``all_reduce``.
        average: If True, divide the summed tensor by the world size.

    Returns:
        ``value`` unchanged when no process group is initialised or only
        one process is running; otherwise the reduced tensor. Floating
        tensors are divided in place; integer tensors yield a new floating
        tensor, because in-place true division is not defined for them.
    """
    dist = torch.distributed
    # Allow single-process runs (e.g. debugging without the launcher):
    # get_world_size() raises when no process group exists.
    if not (dist.is_available() and dist.is_initialized()):
        return value

    world_size = dist.get_world_size()
    if world_size < 2:
        return value

    with torch.no_grad():
        dist.all_reduce(value)
        if average:
            if value.is_floating_point():
                value /= world_size
            else:
                value = value / world_size
    return value

def train(TraindataLoader, optimizer, model, criterion, ep):
    """Run one training epoch and return ``(avg_trainloss, acc)``.

    Args:
        TraindataLoader: Loader yielding dicts with ``'input'`` and
            ``'label'``; its sampler must provide ``set_epoch``.
        optimizer: Optimizer initialised through apex ``amp``.
        model: Network to train.
        criterion: Loss callable taking ``(output, target)``.
        ep: Epoch index, passed to the sampler's ``set_epoch``.

    Returns:
        Tuple of the sample-weighted mean of the cross-process averaged
        batch losses, and the cross-process averaged number of correct
        predictions divided by the number of samples seen by this process.
    """
    model.train()
    train_loss = 0
    num_correct = 0.0
    batch_size = 0  # running count of samples processed by this rank
    # set sampler: the epoch is passed on so the sampler can vary its order
    TraindataLoader.sampler.set_epoch(ep)

    for sample_batched in TraindataLoader:
        # Tensors are used directly; the deprecated autograd.Variable
        # wrapper is a no-op and has been dropped.
        local_X = sample_batched['input'].cuda()
        local_Y = sample_batched['label'].cuda()

        optimizer.zero_grad()

        # compute the model output and the loss
        yhat = model(local_X)
        loss = criterion(yhat, local_Y)

        # amp scales the loss before backward
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        # Average the loss over all ranks for reporting only; backward has
        # already run on the local loss.
        loss = reduce_value(loss, average=True)
        train_loss += loss.item() * yhat.size()[0]
        batch_size += yhat.size()[0]

        # update model weights
        optimizer.step()

        # measure accuracy
        indx_pred = torch.max(yhat, dim=1)[1]
        num_correct += torch.eq(indx_pred, local_Y).sum()

    #scheduler.step()
    num_correct = reduce_value(num_correct, average=True).item()
    acc = num_correct / batch_size
    avg_trainloss = train_loss / batch_size

    torch.cuda.synchronize()

    return avg_trainloss, acc

def test(TestdataLoader, model, criterion):
    """Evaluate ``model`` on ``TestdataLoader`` and return ``(avg_testloss, acc)``.

    Args:
        TestdataLoader: Loader yielding dicts with ``'input'`` and
            ``'label'``.
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss callable taking ``(output, target)``.

    Returns:
        Tuple of the sample-weighted mean of the cross-process averaged
        batch losses, and the cross-process averaged number of correct
        predictions divided by the number of samples seen by this process.
    """
    test_loss = 0
    num_correct = 0.0
    batch_size = 0  # running count of samples processed by this rank

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for sample_batched in TestdataLoader:
            # Tensors are used directly; the deprecated autograd.Variable
            # wrapper is a no-op and has been dropped.
            local_X = sample_batched['input'].cuda()
            local_Y = sample_batched['label'].cuda()

            # compute the model output
            yhat = model(local_X)

            # calculate loss, averaged over all ranks
            loss = criterion(yhat, local_Y)
            loss = reduce_value(loss, average=True)
            batch_size += yhat.size()[0]
            test_loss += loss.item() * yhat.size()[0]

            # measure accuracy
            indx_pred = torch.max(yhat, dim=1)[1]
            num_correct += torch.eq(indx_pred, local_Y).sum()

        num_correct = reduce_value(num_correct, average=True).item()
        acc = num_correct / batch_size
        avg_testloss = test_loss / batch_size

    torch.cuda.synchronize()

    return avg_testloss, acc

# Bookkeeping for the epoch loop.
MaxEpoch = args.MaxEpoch
min_loss = 50000  # initial "best" validation loss; any real loss should beat it
best_model = None
best_epoch = -1
train_batches = 64
test_batches = 64

# define the optimization

criterion = CrossEntropyLoss().cuda()
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=1000, gamma=0.1)

def adjust_learning_rate(optimizer, epoch, init_lr):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts; the
            ``'lr'`` entry of every group is overwritten.
        epoch: Zero-based epoch index.
        init_lr: Learning rate used for epochs 0-29.
    """
    lr = init_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Epoch loop: train, validate, log to TensorBoard and remember the weights
# with the lowest validation loss.
for epoch in range(MaxEpoch):
    avg_trainloss, acc_train = train(TraindataLoader, optimizer_ft, model_ft, criterion, epoch)
    writer.add_scalar("Loss/train", avg_trainloss, epoch+1)
    writer.add_scalar("acc_train/train", acc_train, epoch+1)
    avg_testloss, acc_test = test(TestdataLoader, model_ft, criterion)
    #scheduler.step()

    if avg_testloss <= min_loss:
        min_loss = avg_testloss
        # Copy the weights: state_dict() returns references that later
        # optimizer steps would otherwise overwrite.
        best_model = deepcopy(model_ft.state_dict())
        best_epoch = epoch+1
    writer.add_scalar("Loss/test", avg_testloss, epoch+1)
    writer.add_scalar("acc_test/test", acc_test, epoch+1)
    writer.flush()

    print("Best Epoch is: ", best_epoch , " with test loss: ", min_loss)

model_state_dict = model_ft.state_dict()
results_dir = '/scratch/nikiguo93/places365/results/'

# save model

# Only the first local process writes the checkpoint, so the ranks do not
# overwrite each other's file.
if args.local_rank == 0:
    # torch.save does not create missing directories.
    os.makedirs(results_dir, exist_ok=True)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model_state_dict,
        'best_model': best_model,
        'optimizer_state_dict': optimizer_ft.state_dict(),
        'loss': avg_trainloss,
    }, results_dir + 'places365_mtl_' + str(args.mtl) + '.pth')

writer.close()

FutureWarning,
WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/home/nikiguo93/.conda/envs/branch-env/bin/python: can’t open file ‘torch_ddp.py’: [Errno 2] No such file or directory
/home/nikiguo93/.conda/envs/branch-env/bin/python: can’t open file ‘torch_ddp.py’: [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 120134) of binary: /home/nikiguo93/.conda/envs/branch-env/bin/python
Traceback (most recent call last):
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/runpy.py”, line 193, in _run_module_as_main
main”, mod_spec)
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/runpy.py”, line 85, in _run_code
exec(code, run_globals)
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/launch.py”, line 193, in
main()
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/launch.py”, line 189, in main
launch(args)
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/launch.py”, line 174, in launch
run(args)
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/run.py”, line 713, in run
)(*cmd_args)
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/launcher/api.py”, line 131, in call
return launch_agent(self._config, self._entrypoint, list(args))
File “/home/nikiguo93/.conda/envs/branch-env/lib/python3.7/site-packages/torch/distributed/launcher/api.py”, line 261, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

torch_ddp.py FAILED

Failures:
[1]:
time : 2023-06-09_02:28:20
host : g010.curta.zedat.fu-berlin.de
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 120135)
error_file: <N/A>
traceback : To enable traceback see: Error Propagation — PyTorch 2.0 documentation

Root Cause (first observed failure):
[0]:
time : 2023-06-09_02:28:20
host : g010.curta.zedat.fu-berlin.de
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 120134)
error_file: <N/A>
traceback : To enable traceback see: Error Propagation — PyTorch 2.0 documentation

Hi, this is what my code looks like; the error message it produces is included after the code.

Your code is not properly formatted so quite hard to read.
However, it also doesn’t contain any ddp.py mention so it’s unclear where the error is being raised from.