I am training a model on four GPUs with a two-stage strategy. In the first stage, the model is trained normally; in the second stage, I load the best checkpoint from the first stage and continue training. However, the second stage fails with a CUDA out of memory error.
This is the error:
/root/anaconda3/envs/python367/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 1 leaked semaphores to clean up at shutdown
len(cache))
/root/anaconda3/envs/python367/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 1 leaked semaphores to clean up at shutdown
len(cache))
/root/anaconda3/envs/python367/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 1 leaked semaphores to clean up at shutdown
len(cache))
Traceback (most recent call last):
File "dogs_test3.py", line 573, in <module>
my_launch(args)
File "dogs_test3.py", line 563, in my_launch
mp.spawn(train,nprocs=world_size,args=(args,))
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 171, in spawn
while not spawn_context.join():
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/root/dogs_test/dogs_test3.py", line 538, in train
global_feat, local_feat, cls_score = model(image)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/dogs_test/dogs_test3.py", line 213, in forward
x = self.backbone(x)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/geffnet/efficientnet_builder.py", line 237, in forward
x = self.conv_pwl(x)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File "/root/anaconda3/envs/python367/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: CUDA out of memory. Tried to allocate 126.00 MiB (GPU 1; 10.76 GiB total capacity; 6.98 GiB already allocated; 129.69 MiB free; 7.17 GiB reserved in total by PyTorch)
This is my code:
def my_launch(args):
    world_size = args['num_machines'] * args['num_gpus_per_machine']
    args['world_size'] = world_size
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '27925'
    mp.spawn(train, nprocs=world_size, args=(args,))
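For completeness, my_launch is called from the main module roughly like this; the values in the args dict below are placeholders for what my argument parsing actually produces (one machine, four GPUs):

if __name__ == '__main__':
    # placeholder args; the real values come from my argument parsing
    args = {'num_machines': 1, 'num_gpus_per_machine': 4}
    my_launch(args)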
For the second-stage run, I commented out the stage-1 training code and load the checkpoint directly:
def train(gpu, args):
    rank = gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args['world_size'],
        rank=rank
    )
    torch.manual_seed(0)
    torch.cuda.set_device(gpu)

    train_info, valid_info = stratification_kfold(names, image_label, 5)
    train_names, valid_names = train_info[0], valid_info[0]
    train_ds = TrainDataset(train_names, image_label, label_map_image, transform_train)
    valid_ds = TestDataset(valid_names, image_label, transform_valid)
    valid_dl = Data.DataLoader(valid_ds, batch_size=8, drop_last=True)
    train_sampler = Data.distributed.DistributedSampler(train_ds, num_replicas=args['world_size'], rank=0)
    train_dl = Data.DataLoader(train_ds, batch_size=8, collate_fn=train_collate, shuffle=False, sampler=train_sampler, drop_last=True)

    step1_epochs = 30
    step2_epochs = 30
    criterion = Criterion()
    early_stop = EarlyStopping()

    model = myNet()
    model.cuda(gpu)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    dist.barrier()
    # remap checkpoint storages saved from cuda:0 onto this process's GPU
    map_location = {'cuda:%d' % 0: 'cuda:%d' % gpu}

    # ----- stage 1 training (commented out when resuming for stage 2) -----
    # step1_optimizer = torch.optim.SGD(model.parameters(), lr=0.9, weight_decay=0.0001)
    # for epoch in range(step1_epochs):
    #     with tqdm(total=len(train_dl)) as pbar:
    #         train_loss = 0
    #         steps = len(train_dl)
    #         for image, labels in train_dl:
    #             model.train()
    #             step1_optimizer.zero_grad()
    #
    #             image = image.cuda(gpu).float()
    #             labels = labels.cuda(gpu)
    #             global_feat, local_feat, cls_score = model(image)
    #             loss = criterion(global_feat, local_feat, cls_score, labels, gpu)
    #             train_loss += loss
    #             loss.backward()
    #             step1_optimizer.step()
    #             pbar.update(1)
    #         print('train_loss:{}'.format(train_loss / steps))
    #     model.eval()
    #     metric = evaluate(model, valid_dl)
    #     early_stop(metric, model)
    #     if early_stop.early_stop:
    #         break

    # ----- stage 2: load the best stage-1 checkpoint and continue training -----
    checkpoint_path = '/root/dogs/step2.pt'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state'])
    dist.barrier()

    step2_optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(step2_optimizer, T_0=5, T_mult=2)
    early_stop.counter = 0
    early_stop.early_stop = False
    early_stop.best_score = 0
    early_stop.patience = 8

    for epoch in range(step2_epochs):
        with tqdm(total=len(train_dl)) as pbar:
            train_loss = 0
            steps = len(train_dl)
            for image, labels in train_dl:
                model.train()
                step2_optimizer.zero_grad()
                image = image.cuda(gpu).float()
                labels = labels.cuda(gpu)
                global_feat, local_feat, cls_score = model(image)
                loss = criterion(global_feat, local_feat, cls_score, labels, gpu)
                train_loss += loss
                loss.backward()
                step2_optimizer.step()
                pbar.update(1)
            print('train_loss:{}'.format(train_loss / steps))
        model.eval()
        metric = evaluate(model, criterion)
        scheduler.step()
        early_stop(metric, model)
        if early_stop.early_stop:
            break
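For reference, my understanding of the map_location argument used above (based on the torch.load documentation): a dict remaps only the device tags recorded in the checkpoint, while a plain device string remaps every storage. Assuming all tensors in step2.pt were saved from cuda:0, the two calls below should be equivalent; checkpoint_path and gpu are the same variables as in train():

# two ways to pull the checkpoint onto this worker's GPU (equivalent under the
# assumption that every storage in the file is tagged cuda:0)
ckpt_a = torch.load(checkpoint_path, map_location={'cuda:0': 'cuda:%d' % gpu})  # remap cuda:0 tags only
ckpt_b = torch.load(checkpoint_path, map_location='cuda:%d' % gpu)              # remap all storages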
The model checkpoint is saved inside the EarlyStopping class:
class EarlyStopping:
    """Stops training early if the validation metric doesn't improve for a given patience."""
    def __init__(self, patience=4, best_score=None, delta=0):
        self.patience = patience
        self.counter = 0
        self.best_score = best_score
        self.early_stop = False
        self.delta = delta

    def __call__(self, val_metric, model):
        score = val_metric
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_metric, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_metric, model)
            self.counter = 0

    def save_checkpoint(self, metric, model):
        state = {'best_metric': metric, 'state': model.state_dict()}
        torch.save(state, '/root/dogs/step2.pt')
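If it helps, the saved file can be inspected outside of training with a small diagnostic sketch like the one below (not part of my pipeline; it maps everything to the CPU so no GPU memory is touched):

import torch

ckpt = torch.load('/root/dogs/step2.pt', map_location='cpu')
print('best_metric:', ckpt['best_metric'])
print('entries in state dict:', len(ckpt['state']))
print('example keys:', list(ckpt['state'])[:3])  # DDP state dicts carry a 'module.' prefix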
Why does the CUDA out of memory error appear after loading the checkpoint?