I used the following code in both repos to measure the running time of each component of the model (ResNet18).
def measure_time(self, model, input):
    # Record the start time
    self.start_event.record()
    # Forward pass
    output = model(input)
    # Record the end time
    self.end_event.record()
    # Wait for the events to complete
    torch.cuda.synchronize()
    # Compute the elapsed time (in milliseconds)
    elapsed_time = self.start_event.elapsed_time(self.end_event)
    return elapsed_time, output
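For context, start_event and end_event are torch.cuda.Event objects created with enable_timing=True, which elapsed_time() requires; a minimal sketch of the assumed setup (the actual __init__ is not shown here, and LayerTimer is a hypothetical stand-in for the class that owns measure_time):

import torch

class LayerTimer:
    """Hypothetical stand-in for the class that owns measure_time."""
    def __init__(self):
        # enable_timing=True is required, otherwise elapsed_time() raises an error
        self.start_event = torch.cuda.Event(enable_timing=True)
        self.end_event = torch.cuda.Event(enable_timing=True)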
and then:
def _train(
    self, train_loader, memory_loader, optimizer, criterion
):
    total_loss, correct, num_data = 0.0, 0.0, 0.0
    self.model.train()
    if memory_loader is not None and train_loader is not None:
        # itertools.cycle keeps the (smaller) memory loader repeating
        data_iterator = zip(train_loader, cycle(memory_loader))
    elif memory_loader is not None:
        data_iterator = memory_loader
    elif train_loader is not None:
        data_iterator = train_loader
    else:
        raise ValueError("No valid dataloader was provided")
    for i, data in enumerate(data_iterator):
        # data is a (stream batch, memory batch) pair when both loaders are zipped
        if len(data) == 2:
            stream_data, mem_data = data
            x = torch.cat([stream_data["image"], mem_data["image"]])
            y = torch.cat([stream_data["label"], mem_data["label"]])
        else:
            x = data["image"]
            y = data["label"]
        x = x.to(self.device)
        y = y.to(self.device)
        # measure each operation's time in the forward pass for one batch
        # ---------------------------------------------------
        if i == 0:
            # Conv1
            conv1_time, output = self.measure_time(self.model.conv1, x)
            print('conv1_time', conv1_time)
            # BatchNorm1
            bn1_time, output = self.measure_time(self.model.bn1, output)
            print('bn1_time', bn1_time)
            # ReLU
            relu_time, output = self.measure_time(self.model.relu, output)
            print('relu_time', relu_time)
            # MaxPool
            maxpool_time, output = self.measure_time(self.model.maxpool, output)
            print('maxpool_time', maxpool_time)
            # Layer1
            layer1_time, output = self.measure_time(self.model.layer1, output)
            print('layer1_time', layer1_time)
            # Layer2
            layer2_time, output = self.measure_time(self.model.layer2, output)
            print('layer2_time', layer2_time)
            # Layer3
            layer3_time, output = self.measure_time(self.model.layer3, output)
            print('layer3_time', layer3_time)
            # Layer4
            layer4_time, output = self.measure_time(self.model.layer4, output)
            print('layer4_time', layer4_time)
            # AvgPool
            avgpool_time, output = self.measure_time(self.model.avgpool, output)
            print('avgpool_time', avgpool_time)
            # Flatten is not a submodule in torchvision's ResNet, so it is applied inline
            output = torch.flatten(output, 1)
            # Fc
            fc_time, _ = self.measure_time(self.model.fc, output)
            print('fc-time', fc_time)
        # ------------------------------------------------------
        # this is equivalent to the step code in the test repo
        l, c, d = self.update_model(x, y, criterion, optimizer)
        # Compute the moving averages - equivalent to MovingAverage in the test repo
        total_loss += l
        correct += c
        num_data += d
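For readability, the repeated per-layer block above could also be written as a loop over torchvision's ResNet18 submodule names. A sketch of an equivalent helper (profile_resnet18_layers is hypothetical; it assumes the standard torchvision attribute names):

import torch

def profile_resnet18_layers(measure_time, model, x):
    # Attribute names follow torchvision's ResNet18; flatten is not a
    # submodule there, so it is applied inline between avgpool and fc.
    names = ["conv1", "bn1", "relu", "maxpool",
             "layer1", "layer2", "layer3", "layer4", "avgpool"]
    out = x
    for name in names:
        t, out = measure_time(getattr(model, name), out)
        print(f"{name}_time", t)
    out = torch.flatten(out, 1)
    fc_time, _ = measure_time(model.fc, out)
    print("fc_time", fc_time)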
I get the following values (all times in milliseconds):
In the fast repo I have:
conv1_time 919.8919677734375
bn1_time 65.71724700927734
relu_time 0.2734079957008362
maxpool_time 0.3357119858264923
layer1_time 1.3231040239334106
layer2_time 1.3721599578857422
layer3_time 1.2472319602966309
layer4_time 1.7541120052337646
avgpool_time 0.18943999707698822
fc-time 0.4814079999923706
and in the slow one:
conv1_time 974.81005859375
bn1_time 112.70028686523438
relu_time 17.454816818237305
maxpool_time 16.718528747558594
layer1_time 266.0933532714844
layer2_time 317.3957824707031
layer3_time 310.3487854003906
layer4_time 317.3232727050781
avgpool_time 9.390975952148438
fc-time 30.7589111328125
The difference is huge. These measurements are from the first iteration of the first epoch, and the values stay in this range in later iterations. I checked the dtype of the model parameters, and in both cases they are float32.
Is there anything that could cause this? Can I still rule out dataloader performance? I am running my code in the same environment and on the same GPU model.
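For the dataloader question, this is the kind of isolated check I have in mind (a rough sketch; average_fetch_time is a hypothetical helper that times only the host-side batch fetching, with no GPU work involved):

import time

def average_fetch_time(loader, num_batches=20):
    # Wall-clock time per batch spent purely in the dataloader
    it = iter(loader)
    start = time.perf_counter()
    for _ in range(num_batches):
        next(it)  # assumes the loader yields at least num_batches batches
    return (time.perf_counter() - start) / num_batches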