Hi folks,
Recently I tried the PyTorch profiler to profile resnet18 during training, following this tutorial:
https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
I simply copied, pasted, and ran the provided code. The log folder was created and I could see the GPU and CPU details through TensorBoard. However, the DataLoader and CPU Exec times showed zero; has anyone run into the same issue?
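For reference, the profiling part of the tutorial code I ran looks roughly like this (paraphrased from the linked page; train_loader and train() come from the tutorial's resnet18/CIFAR-10 setup and are not repeated here):

with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
) as prof:
    for step, batch_data in enumerate(train_loader):
        if step >= (1 + 1 + 3) * 2:
            break
        train(batch_data)
        prof.step()  # tell the profiler that the next step starts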
On the other hand, I also wanted to profile a customized PyTorch model. After I ran the training script, the log folder was not created, and no error appeared in the terminal. Are there any hints about this problem?
The training script is shown below:
import csv

import torch
import torch.optim as optim

def train_model(train_dl, val_dl, model):
    bs = 512
    epochs = 1
    criterion = torch.nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.00001)
    training_loss = []
    # print training details
    print("Batch size = {}, epoch = {}".format(bs, epochs))
    print("Training starts")
    # generate header for LossTrend.csv (Loss_path is defined elsewhere in the script)
    with open(Loss_path, "a", newline="") as csvfile:
        columns = ["Train Loss gain", "Train Loss nv", "Train Loss fv",
                   "Validation Loss gain", "Validation Loss nv", "Validation Loss fv"]
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for epoch in range(epochs):  # loop over the dataset multiple times
            # training
            model.train(True)
            running_loss = 0.0
            for i, data in enumerate(train_dl, 0):
                input_features1, input_features2, input_features3, input_features4, target = data
                input_features1 = input_features1.to(torch.float32)
                input_features2 = input_features2.to(torch.float32)
                input_features3 = input_features3.to(torch.float32)
                input_features4 = input_features4.to(torch.float32)
                target = target.to(torch.float32)
                optimizer.zero_grad()  # zero the parameter gradients
                # forward + backward + optimize
                output = model(input_features1.to(device), input_features2.to(device))
                loss = criterion(output, target.to(device))
                running_loss += loss.item()
                # optimize every batch
                loss.backward()
                optimizer.step()
            # save weights every epoch
            torch.save(model.state_dict(), "./weight_v1/v1_epoch{}.pt".format(epoch + 1))
            training_loss.append(running_loss)
            # validation
            """some validation code (not shown for clarity)"""
    return
prof = torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/model_name'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True)
prof.start()
train_model(train_dl, val_dl, model)
prof.stop()
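For completeness, after the script finishes I launch TensorBoard the same way as in the tutorial (torch-tb-profiler 0.4.1 is already installed):

tensorboard --logdir=./log

But the ./log/model_name folder, and the *.pt.trace.json trace files I would expect the trace handler to write, never appear.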
Environment:
OS: Linux, Ubuntu 20.04
GPU: NVIDIA GeForce RTX 3090
PyTorch version: 2.0.1+cu118
PyTorch TensorBoard profiler (torch-tb-profiler) version: 0.4.1
Thx in advance~