Hi everybody, I have been working with PyTorch for a few months, and this is the first time I have tried to use torch.profiler, since I want to get information about CUDA time. However, while running a small CPU-only test, I see a very large overhead when running the model inside the profiler context.
I would like to understand whether this is typical behavior or whether I am doing something wrong. Thanks in advance; I copy the code below:
# Profile 200 iterations of two forward passes on CPU, then compare the
# wall-clock time against the profiler's own accounting. NOTE(review): the
# profiler records every dispatched op, so wall-clock time inside `profile`
# is expected to be noticeably larger than an unprofiled run.
start = time.time()
with profile(activities=[ProfilerActivity.CPU]) as prof:
    for _ in range(200):
        with record_function("forward_a"):
            y = model(a)
        # BUG FIX: this region runs a second *forward* pass (model(b)) but was
        # labeled "backward", which mislabels the profiler report. If a true
        # backward pass was intended, compute a scalar loss from y and call
        # loss.backward() inside this region instead.
        with record_function("forward_b"):
            y = model(b)
print(f"True time {time.time() - start} seconds")
# self_cpu_time_total is reported in microseconds; divide by 1e6 for seconds.
print(f"Time calculated by profiler:{prof.key_averages().self_cpu_time_total / 1e6} seconds")
The model CNN1 is defined as:
class CNN1(Module):
    """Small CNN for per-timestep sequence classification.

    A convolutional feature extractor collapses the input height to 1,
    producing a (B, 512, 1, L) feature map; a linear head then classifies
    each of the L steps, returning log-probabilities of shape (B, L, C).
    """

    def __init__(self, nclasses: int) -> None:
        """Build the conv stack and the linear classification head.

        Args:
            nclasses: number of output classes C.
        """
        super().__init__()
        # NOTE(review): forward() moves the *input* to this device, but the
        # module's parameters are never moved — on a CUDA machine this would
        # raise a device-mismatch error. Call model.to(model.device) after
        # construction, or move both consistently.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.cnn = Sequential(
            Conv2d(1, 32, (2, 2), (1, 1), (1, 1)),
            ReLU(),
            MaxPool2d((2, 2), (1, 1), (1, 1)),
            Conv2d(32, 64, (2, 2), (1, 1), (0, 1)),
            ReLU(),
            MaxPool2d((2, 2), (2, 1), (0, 1)),
            Conv2d(64, 128, (2, 2), (2, 1), (0, 0)),
            ReLU(),
            MaxPool2d((2, 2), (2, 1), (0, 0)),
            Conv2d(128, 512, (2, 2), (2, 2), (0, 0)),
            ReLU(),
            Conv2d(512, 512, (2, 2), (2, 2), (0, 0)),
        )
        self.lineal = Sequential(
            Linear(512, 256),
            ReLU(),
            Linear(256, nclasses),
            LogSoftmax(dim=2),
        )

    def forward(self, x: Tensor) -> Tensor:
        """Run the network; expects x of shape (B, 1, H, W), returns (B, L, C)."""
        # BUG FIX: Tensor.to() is out-of-place; the original discarded its
        # result, so the input was never actually moved to self.device.
        x = x.to(self.device)  # send the tensor to the model's device
        logging.debug(f"Input tensor shape is:{x.size()}")
        # Tensor shape is B, 512, 1, L — assumes the input height is chosen
        # so the conv stack collapses it to 1; TODO confirm with callers.
        logits = self.cnn(x)
        logits = torch.squeeze(logits, 2)
        logits = logits.permute(0, 2, 1)  # tensor shape is B, L, 512
        logits = self.lineal(logits)      # tensor shape is B, L, C
        logging.debug(f"Output tensor (log_logits) shape is:{logits.size()}")
        return logits