Thanks for giving it a try!
I think we organize our code differently. The Python scripts I tested are:
1. utils.py
import gc
import time
import torch

start_time = None

def start_timer():
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()  # reset the peak-memory counter
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))

def make_model(in_size, out_size, num_layers):
    layers = []
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(in_size, in_size))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*layers).cuda()

batch_size = 512
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]
loss_fn = torch.nn.MSELoss().cuda()
net = make_model(in_size, out_size, num_layers)  # already on CUDA, so an extra .to('cuda') is redundant
opt = torch.optim.SGD(net.parameters(), lr=0.001)
2. precision_default.py
from utils import *

# ====== Default Precision ======
start_timer()
for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad()
end_timer_and_print("Default precision:")
3. precision_auto_mix.py
from utils import *

# ====== Automatic Mixed Precision ======
use_amp = True
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

start_timer()
for epoch in range(epochs):
    for input, target in zip(data, targets):
        # forward pass and loss under autocast
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = net(input)
            loss = loss_fn(output, target)
        scaler.scale(loss).backward()  # backward on the scaled loss, outside autocast
        scaler.step(opt)
        scaler.update()
        opt.zero_grad()
end_timer_and_print("Mixed precision:")
I ran the two training scripts, precision_default.py and precision_auto_mix.py, and got:
Default precision:
Total execution time = 1.527 sec
Max memory used by tensors = 1367458816 bytes
Mixed precision:
Total execution time = 1.299 sec
Max memory used by tensors = 1434552832 bytes
In my code there are no extra intermediate variables, right? I am curious why your default-precision training's max memory is 3775580672 bytes, which is much larger than mine.
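One thing worth checking: even without named temporaries, autograd keeps the intermediate activations of each forward pass alive until backward runs, and that counts toward max_memory_allocated(). Here is a hypothetical probe (it just reuses the objects from utils.py) to see how much one forward pass holds in this setup:

from utils import *

# hypothetical probe: how much memory does one forward pass keep alive
# for backward, even with no explicitly named intermediate variables?
before = torch.cuda.memory_allocated()
output = net(data[0])                 # autograd saves activations for backward here
loss = loss_fn(output, targets[0])
after = torch.cuda.memory_allocated()
print("Held between forward and backward: {} bytes".format(after - before))
loss.backward()                       # backward releases the saved activations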