Hi,
I'm trying to evaluate my model after each epoch on the main rank only. Somehow it freezes after iterating through the complete validation set: the GPUs stop doing any work, but their memory is not released either.
Here is a sketch of what my training routine looks like:
import logging
import os

import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from torchmetrics import MeanAbsoluteError, MeanSquaredError, MetricCollection, MetricTracker


class Experiment():
    def __init__(self, config):
        self.config = config
        self.current_iteration = 0  # incremented by train() on rank 0

    def getModel(self):
        (...)
        return model

    def getDataClass(self):
        (...)
        return dataclass

    def getLossfunction(self):
        (...)
        return lossfunction

    def getOptimizer(self, model):
        (...)
        return optimizer, scheduler

    def run(self):
        """
        The main operator
        :return:
        """
        world_size = self.config.world_size
        mp.spawn(self.mainTrainLoop,
                 args=(world_size,),
                 nprocs=world_size,
                 join=True)
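        # note: mp.spawn prepends the process index to args, so each worker
        # process effectively calls mainTrainLoop(rank, world_size)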
    def train(self, dl_train, optimizer, rank, model, lossfunction, logger, epoch):
        model.train()
        for batch_idx, batch in enumerate(dl_train):
            optimizer.zero_grad(set_to_none=True)
            x_i, y_i = batch
            x_i = x_i.to(rank)
            y_i = y_i.to(rank)  # targets have to live on the same device as the outputs
            z_i = model(x_i)
            # calc loss and do step
            loss = lossfunction(z_i, y_i)
            loss.backward()
            optimizer.step()
            if rank == 0:
                # logging and eval
                if batch_idx % self.config.terminal_interval == 0:
                    # print to commandline here
                    pass
                if batch_idx % self.config.tensorboard_interval == 0:
                    # Some tensorboard stuff here
                    pass
            if rank == 0:
                self.current_iteration += 1
    def validate(self, rank, val_dataloader, tracker, model, logger):
        tracker.increment()
        model.eval()
        current_eval_cycle = tracker.n_steps - 1  # n_steps starts at one, so shift it to zero
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_dataloader):
                x_i, y_i = batch
                x_i = x_i.to(rank)
                y_i = y_i.to(rank)  # the tracker's metrics live on the GPU as well
                z_i = model(x_i)
                tracker.update(z_i, y_i)
        for key, val in tracker.compute_all().items():
            # write to tensorboard here;
            # use current_eval_cycle instead of tracker.n_steps to index the array
            pass
        # if we have a new best metric, save the model
        best_res, which_epoch = tracker.best_metric(return_step=True)
        for key, epoch in which_epoch.items():
            if epoch == current_eval_cycle:
                # save model here if best
                pass
    def mainTrainLoop(self, rank, world_size):
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        # initialize the process group
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
        # --------------------------------------------------
        # get datasets, samplers for each rank
        dataclass = self.getDataClass()
        ds_train = dataclass.get_dataset("train")
        ds_val = dataclass.get_dataset("val")
        sampler_train = DistributedSampler(ds_train,
                                           num_replicas=world_size,
                                           rank=rank,
                                           shuffle=False,
                                           drop_last=self.config.dataloader.drop_last_batch)
        dl_train = DataLoader(ds_train,
                              batch_size=self.config.dataloader.batchsize,
                              pin_memory=False,
                              num_workers=0,
                              drop_last=self.config.dataloader.drop_last_batch,
                              shuffle=False,
                              sampler=sampler_train)
        # the val set works without a sampler
        # since it is only executed on rank 0
        dl_val = DataLoader(ds_val,
                            batch_size=self.config.dataloader.batchsize,
                            pin_memory=False,
                            num_workers=0,
                            drop_last=self.config.dataloader.drop_last_batch,
                            shuffle=False)
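        # note: dl_val is built on every rank, but only rank 0 ever iterates it
        # (see the validate() call in the epoch loop below)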
        # --------------------------------------------------
        # get model, optimizer and loss
        model = self.getModel().to(rank)
        # resnet 18 has batch norm layers so:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = DDP(model, device_ids=[rank])
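        # as far as I understand, SyncBatchNorm only synchronizes batch statistics
        # across ranks in train() mode; in eval() it falls back to the running
        # stats, so the rank-0-only validation below should not need the other ranks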
        optimizer, scheduler = self.getOptimizer(model)
        lossfunction = self.getLossfunction().to(rank)
        # --------------------------------------------------
        # seeding
        torch.cuda.manual_seed(self.config.torchseed)
        np.random.seed(self.config.numpyseed)
        # --------------------------------------------------
        # model summary
        logger = None
        if rank == 0:
            logger = logging.getLogger("Experiment")
        # --------------------------------------------------
        # Metrics and Tensorboard init
        tracker = None
        if rank == 0:
            list_of_metrics = [MeanSquaredError(), MeanAbsoluteError()]
            metric_coll = MetricCollection([m.to(rank) for m in list_of_metrics])
            # one maximize flag per tracked metric (both are errors, so minimize)
            tracker = MetricTracker(metric_coll, maximize=[False, False])
            # setup tensorboard writer on rank 0
            # somehow it is not working if I set it up in the __init__ function and
            # then only use it from within rank 0.
            self.summary_writer = SummaryWriter(log_dir="/path/to/sth")
        # --------------------------------------------------
        # start training
        for epoch in range(1, self.config.max_epochs + 1):
            # when using a DistributedSampler, we have to tell it which epoch this is
            dl_train.sampler.set_epoch(epoch)
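            # as far as I understand, set_epoch only re-seeds the shuffling,
            # so with shuffle=False on the sampler it should be a no-op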
            self.train(dl_train,
                       optimizer,
                       rank,
                       model,
                       lossfunction,
                       logger,
                       epoch)
            # on rank 0, evaluate with
            # model.module instead of model
            if rank == 0:
                self.validate(rank,
                              dl_val,
                              tracker,
                              model.module,
                              logger)
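            # note: there is no barrier here, so all ranks except 0 already move
            # on to the next training epoch while rank 0 is still validating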
        if rank == 0:
            self.summary_writer.close()
        # clean up the process group on every rank
        dist.destroy_process_group()

if __name__ == '__main__':
    import hydra
    from omegaconf import DictConfig, OmegaConf

    @hydra.main(version_base=None, config_path="./configs/", config_name="config")
    def main(config: DictConfig) -> None:
        exp = Experiment(config)
        exp.run()

    main()
Does anyone have a clue what is going on here?
Also: I couldn't find a good, complete example of multi-GPU training that covers the whole pipeline (training, testing, saving checkpoints, and ideally logging). I would be super grateful for any tips here!
Thanks a lot!