I’ve tried a new approach using torch.cuda.max_memory_allocated(). For this I created a class as mentioned in this topic and call it in my training loop. The problem is that even if I reorganize the layer distribution between GPUs, the memory allocated on both GPUs keeps growing and the peaks are always equal to those of the first run.
For example, with a 14-layer model split for the first run as 11 layers on GPU0 and 3 on GPU1, I get peaks of [1371, 527] MB. Even if, during the same process, I move some layers to end up with 7 on each GPU, I still get the same peak values. If instead I run with 7 on each GPU from the start, I get different peak values: [785, 673] MB.
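To make the rebalancing concrete, here is a simplified sketch of what I mean by moving layers between the two stages (the names layers and split_point are only illustrative; the real code works on the model shown at the end of this post):

import torch.nn as nn

def rebuild_stages(layers, split_point):
    # Illustrative only: first `split_point` layers go to GPU 0, the rest to GPU 1.
    s0 = nn.Sequential(*layers[:split_point]).cuda(0)
    s1 = nn.Sequential(*layers[split_point:]).cuda(1)
    return nn.Sequential(s0, s1)

# First run: 11 layers on GPU 0, 3 on GPU 1  -> peaks [1371, 527] MB.
# Same process, rebalanced to 7 / 7          -> peaks stay [1371, 527] MB.
# Fresh process with 7 / 7 from the start    -> peaks [785, 673] MB.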
So my first guess is that the memory is never released and never reallocated. How can I force that? None of the torch.cuda reset methods seems to work…
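For reference, these are the kinds of torch.cuda reset calls I mean (none of them changes the peaks I observe after rebalancing):

import torch

for device in range(torch.cuda.device_count()):
    torch.cuda.reset_peak_memory_stats(device)         # resets the max_memory_allocated() counter
    torch.cuda.reset_accumulated_memory_stats(device)  # resets the accumulated counters
torch.cuda.empty_cache()  # returns cached, unused blocks to the driver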
Here is my profiling class (a short standalone usage sketch follows it):
import torch


def BToMb(x):
    # bytes -> mebibytes (integer division)
    return x // 2**20


class TraceMalloc():
    def __init__(self, nb_gpu):
        self.nb_gpu = nb_gpu
        self.begin = [0] * nb_gpu
        self.end = [0] * nb_gpu
        self.peak = [0] * nb_gpu
        self.peaked = [0] * nb_gpu

    def __enter__(self):
        for device in range(self.nb_gpu):
            torch.cuda.reset_accumulated_memory_stats(device)
            self.begin[device] = torch.cuda.memory_allocated(device)
        return self

    def __exit__(self, *exc):
        for device in range(self.nb_gpu):
            self.end[device] = torch.cuda.memory_allocated(device)
            self.peak[device] = torch.cuda.max_memory_allocated(device)
            self.peaked[device] = BToMb(self.peak[device] - self.begin[device])
            torch.cuda.reset_accumulated_memory_stats(device)

        for device in range(self.nb_gpu):
            print(f"GPU n°{device}")
            print(f" Memory begin -> {BToMb(self.begin[device])} MB")
            print(f" Memory end -> {BToMb(self.end[device])} MB")
            # print(f"Memory used -> {self.used[device]} MB")
            print(f" Memory peak -> {self.peaked[device]} MB")
Here is my training loop, where the class is called:
def evaluate_mem(self):
    from .dataset import PipelineDataset

    dataset = PipelineDataset(1024, self.input_shape[1:], [1] if len(self.output_shape) == 1 else self.output_shape[1:])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.input_shape[0], shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    model = self.get_modules()
    model = torch.distributed.pipeline.sync.Pipe(model, chunks=2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    # self.prof.start()
    # with torch.no_grad():
    with self.trace_gpu_alloc:
        for epoch in range(10):
            running_loss = 0.0
            for i, (inputs, labels) in enumerate(dataloader):
                # optimizer.zero_grad()
                inputs = inputs.to(0)
                labels = labels.to(self.nb_gpu - 1)

                # Forward pass
                outputs = model(inputs).local_value()
                loss = criterion(outputs, labels.squeeze())

                loss.backward()
                # self.prof.step()
                optimizer.step()
                torch.cuda.empty_cache()

    print(f"Outer loop --------> {self.trace_gpu_alloc.peaked}")
    return self.trace_gpu_alloc.peaked
And finally, an example of the model used:
import torch
import torch.nn as nn


class x_layer(nn.Module):
    def forward(self, input):
        ret = input
        return ret


class conv1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))

    def forward(self, input):
        ret = self.fc(input)
        return ret


class relu_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret


class pool_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)

    def forward(self, input):
        ret = self.fc(input)
        return ret


class conv2_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))

    def forward(self, input):
        ret = self.fc(input)
        return ret


class relu_1_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret


class pool_1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)

    def forward(self, input):
        ret = self.fc(input)
        return ret


class flatten_layer(nn.Module):
    def forward(self, input):
        ret = torch.flatten(input, 1)
        return ret


class fc1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=400, out_features=120, bias=True)

    def forward(self, input):
        ret = self.fc(input)
        return ret


class relu_2_layer(nn.Module):
    def forward(self, input):
        input = input.clone()
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret


class fc2_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=120, out_features=84, bias=True)

    def forward(self, input):
        ret = self.fc(input)
        return ret


class relu_3_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret


class fc3_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=84, out_features=10, bias=True)

    def forward(self, input):
        ret = self.fc(input)
        return ret


class output_layer(nn.Module):
    def forward(self, input):
        ret = input
        return ret


class PipelinedModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.s0 = nn.Sequential(x_layer(), conv1_layer(), relu_layer(), pool_layer(), conv2_layer(), relu_1_layer(), pool_1_layer(), flatten_layer(), fc1_layer(), relu_2_layer()).cuda(0)
        self.s1 = nn.Sequential(fc2_layer(), relu_3_layer(), fc3_layer(), output_layer()).cuda(1)

    def forward(self, input):
        ret = input
        ret = self.s0(ret.to(0))
        ret = self.s1(ret.to(1))
        return ret

    def get_modules(self):
        return nn.Sequential(*[nn.Sequential(*self.s0), nn.Sequential(*self.s1)])
Hope I was clear in my explanation :x