I'm using a Colab T4 GPU. I tried to use its TPU as well, but I kept getting a JAX error, so I gave up on that.
My training set is around 13500 images and my batch size is 24. I did a lot of research into optimization to get my model to train faster; the best I achieved was 42 minutes per epoch, and that's a bit slow, since my loss is not decreasing and I need to keep tweaking the network.
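(For scale: 13500 images at batch size 24 is roughly 563 batches per epoch, so 42 minutes per epoch works out to about 4.5 seconds per batch.)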
This is my current code:
```python
import os
import sys
from google.colab import drive
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision as tv
from torchvision.transforms import v2

DRIVE_DEFAULT_PATH = '/content/drive'
if not os.path.exists(DRIVE_DEFAULT_PATH):
    drive.mount(DRIVE_DEFAULT_PATH)
DRIVE_DEFAULT_PATH = DRIVE_DEFAULT_PATH + '/MyDrive'
CLASS_DEFAULT_PATH = '/RNP'
ASSIGNMENT_PATH = '/Trabalho 01/Sports'
WORK_PATH = DRIVE_DEFAULT_PATH + CLASS_DEFAULT_PATH + ASSIGNMENT_PATH

def setLoader(path, batch_size, train):
    if train:
        transforms = v2.Compose([
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=[0.4713, 0.4699, 0.4548],
                         std=[0.3081, 0.3020, 0.2961])
        ])
        return torch.utils.data.DataLoader(
            tv.datasets.ImageFolder(WORK_PATH + path, transform=transforms),
            batch_size=batch_size, shuffle=True, num_workers=2,
            pin_memory=True, prefetch_factor=4)

class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.step1 = nn.Sequential(
            nn.Conv2d(3, 400, 3, padding=1),
            nn.ReLU(),
            # 2nd
            nn.Conv2d(400, 400, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # 3rd
            nn.Conv2d(400, 200, 3, padding=1),
            nn.ReLU(),
            # nn.MaxPool2d(2),
            # 4th
            nn.Conv2d(200, 200, 7),
            nn.ReLU(),
            # nn.MaxPool2d(2),
            # 5th
            nn.Conv2d(200, 100, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3),
        )
        self.step2 = nn.Sequential(nn.LazyLinear(100), nn.Softmax(dim=1))

    def forward(self, x):
        return self.step2(torch.flatten(self.step1(x), start_dim=1))

def train():
    lr = 0.1
    num_epochs = 100
    train_then_validation = True
    model = CNN().to(device)
    loss_fn = nn.CrossEntropyLoss()
    optim = torch.optim.SGD(model.parameters(), lr=lr)
    loss = None
    for epoch in range(num_epochs):
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                     record_shapes=True) as prof:
            with record_function("model_inference"):
                for batch, (X, y) in enumerate(train_loader):
                    X, y = X.to(device), y.to(device)
                    optim.zero_grad()
                    y_hat = model(X)
                    loss = loss_fn(y_hat, y)
                    loss.backward()
                    optim.step()
                    if batch % 5 == 0:
                        print(f'Batch: {batch}, Loss: {loss}')
                    if batch == 5:
                        break
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=30))
        break
        print(f'Epoch: {epoch}, Loss: {loss}')

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
torch.backends.cudnn.benchmark = True
print(f"Using {device} device")

from torch.profiler import profile, record_function, ProfilerActivity

train_loader = setLoader("/train", 24, True)
train()
```
I'm breaking on batch 5 to profile. This is the profiler output:

```
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
model_inference  0.96%  265.902ms  99.87%  27.790s  27.790s  0.000us  0.00%  4.356s  4.356s  1
aten::to  0.00%  422.000us  61.15%  17.016s  347.255ms  0.000us  0.00%  7.269ms  148.347us  49
aten::_to_copy  0.00%  294.000us  61.15%  17.015s  1.418s  0.000us  0.00%  7.271ms  605.917us  12
aten::copy_  0.00%  357.000us  61.15%  17.015s  1.418s  7.271ms  0.03%  7.271ms  605.917us  12
cudaStreamSynchronize  61.13%  17.011s  61.14%  17.011s  1.215s  0.000us  0.00%  0.000us  0.000us  14
cudaMemcpyAsync  30.64%  8.524s  30.64%  8.524s  608.869ms  0.000us  0.00%  0.000us  0.000us  14
aten::item  0.00%  41.000us  30.62%  8.521s  2.130s  0.000us  0.00%  2.000us  0.500us  4
aten::_local_scalar_dense  0.00%  70.000us  30.62%  8.521s  2.130s  2.000us  0.00%  2.000us  0.500us  4
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...  7.03%  1.957s  7.03%  1.957s  326.131ms  0.000us  0.00%  0.000us  0.000us  6
autograd::engine::evaluate_function: ConvolutionBack...  0.00%  548.000us  0.10%  27.778ms  925.933us  0.000us  0.00%  20.901s  696.699ms  30
ConvolutionBackward0  0.00%  355.000us  0.10%  27.230ms  907.667us  0.000us  0.00%  20.901s  696.699ms  30
aten::convolution_backward  0.04%  12.381ms  0.10%  26.875ms  895.833us  20.791s  80.24%  20.901s  696.699ms  30
cudaLaunchKernel  0.09%  24.595ms  0.09%  24.595ms  6.694us  0.000us  0.00%  0.000us  0.000us  3674
aten::conv2d  0.00%  208.000us  0.08%  21.959ms  731.967us  0.000us  0.00%  4.032s  134.395ms  30
aten::convolution  0.00%  788.000us  0.08%  21.751ms  725.033us  0.000us  0.00%  4.032s  134.395ms  30
aten::_convolution  0.00%  800.000us  0.08%  20.963ms  698.767us  0.000us  0.00%  4.032s  134.395ms  30
aten::cudnn_convolution  0.03%  8.871ms  0.07%  18.846ms  628.200us  3.809s  14.70%  3.809s  126.953ms  30
Optimizer.step#SGD.step  0.01%  1.959ms  0.01%  2.471ms  411.833us  0.000us  0.00%  5.692ms  948.667us  6
aten::sum  0.00%  1.216ms  0.01%  1.643ms  45.639us  110.117ms  0.43%  110.117ms  3.059ms  36
autograd::engine::evaluate_function: ReluBackward0  0.00%  300.000us  0.01%  1.636ms  54.533us  0.000us  0.00%  319.620ms  10.654ms  30
autograd::engine::evaluate_function: AddmmBackward0  0.00%  211.000us  0.01%  1.568ms  261.333us  0.000us  0.00%  4.123ms  687.167us  6
aten::relu  0.00%  483.000us  0.01%  1.486ms  49.533us  0.000us  0.00%  220.201ms  7.340ms  30
ReluBackward0  0.00%  223.000us  0.00%  1.336ms  44.533us  0.000us  0.00%  319.620ms  10.654ms  30
autograd::engine::evaluate_function: torch::autograd...  0.00%  486.000us  0.00%  1.198ms  16.639us  0.000us  0.00%  0.000us  0.000us  72
aten::threshold_backward  0.00%  734.000us  0.00%  1.113ms  37.100us  319.620ms  1.23%  319.620ms  10.654ms  30
aten::add_  0.00%  709.000us  0.00%  1.097ms  36.567us  223.258ms  0.86%  223.258ms  7.442ms  30
autograd::engine::evaluate_function: MaxPool2DWithIn...  0.00%  178.000us  0.00%  1.057ms  88.083us  0.000us  0.00%  328.911ms  27.409ms  12
aten::linear  0.00%  59.000us  0.00%  1.045ms  174.167us  0.000us  0.00%  2.205ms  367.500us  6
AddmmBackward0  0.00%  139.000us  0.00%  1.029ms  171.500us  0.000us  0.00%  4.059ms  676.500us  6
aten::clamp_min  0.00%  651.000us  0.00%  1.003ms  33.433us  220.201ms  0.85%  220.201ms  7.340ms  30

Self CPU time total: 27.825s
Self CUDA time total: 25.909s
```
Reading the table, almost all of the wall-clock time goes to cudaStreamSynchronize (~17 s of the 27.8 s CPU total) plus cudaMemcpyAsync (~8.5 s), so I need a way to cut the copy and synchronize time.
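One thing I suspect (not confirmed) is that printing the loss every few batches formats a CUDA tensor, which forces a device-to-host copy and a stream synchronize; that would explain the aten::item / cudaStreamSynchronize entries. A sketch of how I could log less often, keeping the running loss on the GPU:

```python
# Hypothetical variant of the inner loop: accumulate the loss on the GPU and
# only synchronize (via .item()) once every 100 batches, instead of printing
# the CUDA tensor every 5 batches.
running_loss = torch.zeros(1, device=device)
for batch, (X, y) in enumerate(train_loader):
    X, y = X.to(device), y.to(device)
    optim.zero_grad()
    y_hat = model(X)
    loss = loss_fn(y_hat, y)
    loss.backward()
    optim.step()
    running_loss += loss.detach()  # stays on the GPU, no sync
    if batch % 100 == 99:
        print(f'Batch: {batch}, Loss: {running_loss.item() / 100}')  # one sync
        running_loss.zero_()
```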
I also thought about sending the optimizer or the loss function to the GPU, so that when I call backward() and step() there would be no need to move data from CPU to GPU, eliminating the synchronize time and some of the copy time.
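Concretely, the idea was something like this (just a sketch; as far as I can tell CrossEntropyLoss without class weights holds no tensors, so the .to(device) may be a no-op):

```python
# Sketch of the idea: build the loss function "on the GPU". Without a weight
# tensor CrossEntropyLoss has nothing to move, so this probably changes nothing;
# with class weights it would at least keep that tensor on the device.
loss_fn = nn.CrossEntropyLoss().to(device)

# The optimizer already operates on model.parameters(), which live on the GPU
# after model.to(device), so there is nothing extra to move for SGD.
optim = torch.optim.SGD(model.parameters(), lr=lr)
```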
There's no way to send the DataLoader itself to the GPU the way ParallelLoader does for TPUs, so some copying of X and y to CUDA will always exist.
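The closest thing I can see is keeping pin_memory=True in the DataLoader (which I already have) and making the host-to-device copies non-blocking, so they can overlap with compute instead of blocking the CPU. A sketch of what I mean (not yet verified against the profiler):

```python
# Sketch: asynchronous copies from pinned host memory. With pin_memory=True in
# the DataLoader, non_blocking=True queues the H2D copy on the CUDA stream
# instead of making the CPU wait for it to finish.
for batch, (X, y) in enumerate(train_loader):
    X = X.to(device, non_blocking=True)
    y = y.to(device, non_blocking=True)
    optim.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    optim.step()
```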