Source: https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
I am successfully training my models, and all is well there… but when I go to log to TensorBoard I am getting jumps in the X-axis, which leads me to believe I should look into this metric-logging piece…
Has anyone removed or adapted it to push loss data to TensorBoard?
An example follows:
def train_one_epoch(model_conf, model, optimizer, data_loader, device, epoch, tfb_logger):
    """Train ``model`` for one pass over ``data_loader`` and log its losses.

    Args:
        model_conf: Nested mapping; ``["hyperParameters"]["display_interval"]``
            is read as the console print frequency (in batches).
        model: Called as ``model(images, targets)``; must return a dict of
            loss tensors.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        data_loader: Sized iterable yielding ``(images, targets)`` batches.
        device: Device that images and target tensors are moved to.
        epoch: Epoch index. Epoch ``0`` enables the warm-up LR scheduler, and
            the index is also used to offset the TensorBoard global step.
        tfb_logger: Object exposing ``add_scalars(main_tag=..., tag_scalar_dict=...,
            global_step=...)``, or ``None`` to disable that logging.

    Returns:
        The ``utils.MetricLogger`` that accumulated this epoch's metrics.

    Note:
        The process exits with status 1 if the reduced total loss is not
        finite.
    """
    print_freq = model_conf["hyperParameters"]["display_interval"]
    # len(data_loader) is already the number of batches in the epoch.
    batches_per_epoch = len(data_loader)

    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        # Ramp the learning rate up during the first epoch only.
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, batches_per_epoch - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    batches = metric_logger.log_every(data_loader, print_freq, header)
    # `iterations` is 1-based so the global step below never repeats the last
    # step of the previous epoch.
    for iterations, (images, targets) in enumerate(batches, start=1):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if tfb_logger is not None:
            # Log the reduced values for every component so the curves agree
            # with the reduced total, and pass plain floats rather than
            # tensors. Iterating the dict avoids a KeyError for models that
            # report a different set of losses.
            info = {'loss': loss_value}
            info.update(
                (name, value.item()) for name, value in loss_dict_reduced.items()
            )
            tfb_logger.add_scalars(main_tag='logs_s_{}/losses'.format("1"),
                                   tag_scalar_dict=info,
                                   global_step=(epoch * batches_per_epoch) + iterations)

    return metric_logger
This area in particular:
`for images, targets in metric_logger.log_every(data_loader, print_freq, header):`
Has anyone tried to modify the metric logger? I am thinking I should, but I am unsure.