Hi,
I want to train a plain 1D conv-net (1 layer). The training runs through, but it is extremely slow. In particular, within each epoch the first ~75% of the batches are super fast, but then it gets very slow. Validation also takes extremely long (longer than the training part of the epoch itself).
I did some profiling to find out the root cause, and it seems to be related to the transfer of data to GPU.
I use CUDA 11.8 (NVIDIA P520) and found the same behaviour on Windows 11 and on Windows 11 + WSL2 (Ubuntu). It makes no difference whether I run the script from PowerShell or within Spyder 5. I use PyTorch Lightning for training.
Do you know what could be the issue? I am a beginner, so it might be something very stupid.
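For reference, this is roughly how I timed the host-to-GPU copy (a minimal sketch; the batch shape matches my data, and it assumes a CUDA device is available):

import time
import torch

x = torch.randn(16, 3, 19939)  # one batch, float32
device = torch.device("cuda")

torch.cuda.synchronize()  # make sure the GPU is idle before timing
t0 = time.perf_counter()
x_gpu = x.to(device)  # host -> device copy
torch.cuda.synchronize()  # wait until the copy has actually finished
print(f"transfer took {time.perf_counter() - t0:.4f} s")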
Here is my code, starting with the Dataset class:
import torch
import torch.utils.data as tdata

class Dataset(tdata.Dataset):
    def __init__(self, In, Out, transform=None):
        self.In = In
        self.Out = Out
        self.transform = transform  # accepted but not used yet

    def __len__(self):
        return len(self.Out)

    def __getitem__(self, idx):
        In = self.In[idx, :, :]
        Out = self.Out[idx, :, :]
        return torch.from_numpy(In), torch.from_numpy(Out)
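The Dataset itself seems cheap to index; here is a quick standalone check with random arrays of the same shape as mine:

import time
import numpy as np

In_rnd = np.random.randn(10, 3, 19939).astype(np.float32)
Out_rnd = np.random.randn(10, 3, 19939).astype(np.float32)
ds = Dataset(In_rnd, Out_rnd)

t0 = time.perf_counter()
for i in range(len(ds)):
    _ = ds[i]  # __getitem__ only wraps a numpy view in a tensor
print(f"{len(ds)} items in {time.perf_counter() - t0:.6f} s")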
I create dataloaders as:
import numpy as np

fullset = Dataset(In, Out)  # numpy arrays of shape [10, 3, 19939]
nTrain = int(np.floor(len(fullset) * 0.9))
nVal = len(fullset) - nTrain
train_set, val_set = tdata.random_split(fullset, [nTrain, nVal])
test_set = Dataset(InTest, OutTest)

# Data loaders
train_loader = tdata.DataLoader(train_set, batch_size=16, shuffle=True, drop_last=True, pin_memory=True, num_workers=0)
val_loader = tdata.DataLoader(val_set, batch_size=16, shuffle=False, drop_last=False, pin_memory=False, num_workers=0)
test_loader = tdata.DataLoader(test_set, batch_size=16, shuffle=False, drop_last=False, num_workers=0)
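My understanding is that pin_memory=True keeps the fetched batches in page-locked memory so the host-to-device copy can run asynchronously. A minimal sketch of the kind of transfer Lightning does for me under the hood (my assumption, not Lightning's actual code):

device = torch.device("cuda")

for In, Out in train_loader:
    # with a pinned source tensor, non_blocking=True lets the copy
    # overlap with computation on the GPU
    In = In.to(device, non_blocking=True)
    Out = Out.to(device, non_blocking=True)
    break  # one batch is enough for the sketch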
My LightningModule is:
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class MyModule(pl.LightningModule):
    def __init__(self, model, model_hparams, optimizer_name, optimizer_hparams):
        super().__init__()
        self.save_hyperparameters(ignore="model")
        self.model = model(**model_hparams)  # instantiate the network class with its hparams
        self.loss_module = nn.MSELoss()
        # Example input for visualizing the graph in TensorBoard
        self.example_input_array = torch.zeros((10, 3, 19939), dtype=torch.float32)
    def forward(self, imgs):
        return self.model(imgs)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        loss = self.loss_module(newOut, Out)
        absDiffLoss = (Out - newOut).abs().mean()  # mean absolute difference as a substitute for accuracy
        self.log('train_AbsDiffLoss', absDiffLoss, on_step=False, on_epoch=True)
        self.log('train_MSEloss', loss)
        return loss  # return the tensor to call ".backward" on
    def validation_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        absDiffLoss = (Out - newOut).abs().mean()
        self.log('val_AbsDiffLoss', absDiffLoss)

    def test_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        absDiffLoss = (Out - newOut).abs().mean()
        self.log('test_AbsDiffLoss', absDiffLoss)
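As a quick check that absDiffLoss behaves like a mean absolute error, here is a tiny standalone example (toy tensors, values chosen by me just for illustration):

import torch

Out = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
newOut = torch.tensor([[1.5, 2.0], [2.0, 4.0]])
mae = (Out - newOut).abs().mean()  # (0.5 + 0.0 + 1.0 + 0.0) / 4 = 0.375
print(mae.item())  # 0.375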
My model is a simple 1D convolution:
from types import SimpleNamespace

class Net(nn.Module):
    def __init__(self, dt, tKernelSize, act_fn):
        super().__init__()
        self.hparams = SimpleNamespace(act_fn=act_fn,
                                       dt=dt,
                                       tKernelSize=tKernelSize)
        self._create_network()
        self._init_params()

    def _create_network(self):
        # round_to_odd is my helper that keeps the kernel length odd
        kernel_size = round_to_odd(int(self.hparams.tKernelSize // self.hparams.dt))
        self.conv_1D = nn.Conv1d(in_channels=3, out_channels=16,
                                 kernel_size=kernel_size, stride=1, padding='same')

    def _init_params(self):
        if isinstance(self.hparams.act_fn, nn.Identity):
            print('Nothing to initialize')  # checked once instead of per module
            return
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, nonlinearity=self.hparams.act_fn)
            elif isinstance(m, nn.BatchNorm1d):  # 1D net, so BatchNorm1d
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x_out = self.conv_1D(x)
        return x_out
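For completeness, a standalone shape check (round_to_odd is my helper, reproduced here in simplified form, and the dt/tKernelSize values are just example numbers):

import torch

def round_to_odd(n):  # simplified stand-in for my helper
    return n if n % 2 == 1 else n + 1

net = Net(dt=0.01, tKernelSize=0.5, act_fn=torch.nn.Identity())
x = torch.randn(10, 3, 19939)
print(net(x).shape)  # torch.Size([10, 16, 19939]) because of padding='same'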
My training function is:
import os
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

def train_model(model, train_loader, val_loader, test_loader, save_name, **kwargs):
    global CHECKPOINT_PATH
    global device
    # Create a PyTorch Lightning trainer with the generation callback
    trainer = pl.Trainer(
        default_root_dir=os.path.join(CHECKPOINT_PATH, save_name),  # where to save models
        accelerator="gpu" if str(device).startswith("cuda") else "cpu",  # run on a GPU if possible
        devices=1,  # how many GPUs/CPUs to use (1 is enough here)
        max_epochs=2,  # how many epochs to train for if no patience is set
        callbacks=[
            # Save the best checkpoint based on the minimum val_AbsDiffLoss recorded
            # (it is an error metric, so lower is better); saves only weights, not the optimizer.
            ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_AbsDiffLoss"),
            LearningRateMonitor("epoch"),  # log the learning rate every epoch
        ],
        enable_progress_bar=True)  # set to False if you do not want a progress bar
    trainer.logger._log_graph = True  # if True, plot the computation graph in TensorBoard
    trainer.logger._default_hp_metric = None  # optional logging argument that we don't need
    # Check whether a pretrained model exists; if yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, save_name + ".ckpt")
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        model = MyModule.load_from_checkpoint(pretrained_filename)  # loads the saved hyperparameters automatically
    else:
        pl.seed_everything(42)  # to be reproducible
        model = MyModule(model, **kwargs)
        trainer.fit(model, train_loader, val_loader)
        model = MyModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)  # load the best checkpoint after training

    # Test the best model on the validation and test sets
    val_result = trainer.test(model, val_loader, verbose=True)
    test_result = trainer.test(model, test_loader, verbose=False)
    # trainer.test logs under the "test_" prefix even when run on the val loader
    result = {"test": test_result[0]["test_AbsDiffLoss"], "val": val_result[0]["test_AbsDiffLoss"]}
    return model, result
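In case it helps with diagnosing this, I believe the Trainer also accepts a built-in profiler (I have not verified the output format), which would break down where the time goes per hook:

profiled_trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=2,
                              profiler="simple")  # prints a timing summary after fit()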
I call the training function as:
trained_Net, result = train_model(Net, train_loader, val_loader, test_loader, 'Net',
                                  model_hparams={"act_fn": act_fn, "dt": dt,
                                                 "tKernelSize": tKernelSize},
                                  optimizer_name="Adam",
                                  optimizer_hparams={"lr": 1e-5, "weight_decay": 1e-4})