Hello,
I am fairly new to pytorch, so I’m not sure I’m doing things correctly…
I am using pytorch = 1.7.1
I have the following code for training/testing a Unet, I’ve made sure to set everything to the device
that is set to CUDA, I even checked it by print(var.is_cuda)
. However, for some reason, during the testing part, my GPU usage drops to 0 and my CPU goes to 100%, though everything is set to CUDA.
I am looking for any explanation and help to fix it
def train_model(model,
device,
crit="mse",
opti="adam",
dataset_file: str = "dict_wds-1201-0331-dec-march",
num_epochs: int = 176,
batch_size: int = 1,
learning_rate: float = 1e-3,
gen_load: bool = False,
checkpoint_save: bool = True,
):
train_loader, test_loader, train_dataset, test_dataset = load_data(batch_size, dataset_file=dataset_file, discretize=discretize)
n_train = len(train_dataset)
n_test = len(test_dataset)
writer = SummaryWriter(comment=f'LR_{learning_rate}_BS_{batch_size}_E_{num_epochs}_C_{crit}_O_{opti}_L_{n_train}_D_{discretize}')
global_step = 0
ets_benckmark = 0.0
print("params #: ", sum(x.numel() for x in model.parameters()))
criterion = nn.MSELoss()
criterion = criterion.to(device)
optimizer = torch.optim.Adam(
model.parameters(),
lr=learning_rate,
weight_decay=1e-8
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 2)
for epoch in range(num_epochs):
running_loss = 0.0
loss_values = []
# TRAINING PART
with tqdm(train_loader, desc=f"Epoch [{epoch + 1}/{num_epochs}]", colour="#009B77") as t:
for i, (ecmwf, truth) in enumerate(t):
model.train()
ecmwf = ecmwf.to(device, dtype=torch.float32)
truth = truth.to(device, dtype=torch.float32)
optimizer.zero_grad()
outputs = model(ecmwf)
if crit == "rmse":
loss = torch.sqrt(criterion(outputs, truth))
else:
loss = criterion(outputs, truth)
running_loss += loss.item()
loss_values.append(loss.item())
loss.backward()
nn.utils.clip_grad_value_(model.parameters(), 0.1)
optimizer.step()
t.set_postfix(**{"iter": f"{i + 1}/{n_train // batch_size}", "batch loss": loss.item(),
"avg loss": sum(loss_values)/n_train})
writer.add_scalar("Loss/Train", loss.item(), global_step)
global_step += 1
# TESTING PART
if global_step % (n_train // (batch_size)) == 0:
tot_l1_loss = 0
tot_mse = 0
tot_ets = 0
with tqdm(test_loader, desc=f"Test", colour="#D2691E") as tl:
for k, (ecmwf, truth) in enumerate(tl):
for name, param in model.named_parameters():
name = name.replace(".", "/")
writer.add_histogram("Weights/" + name, param.data.cpu().numpy(), global_step)
writer.add_histogram("Gradients/" + name, param.grad.cpu().numpy(), global_step)
model.eval()
with torch.no_grad():
ecmwf = ecmwf.to(device)
truth = truth.to(device)
outputs = model(ecmwf)
tot_mse += F.mse_loss(truth, outputs)
# print(f"Validation mse: {tot_mse / len(test_loader)}")
writer.add_scalar('mse/test', tot_mse / len(test_loader), global_step)
tl.set_postfix(**{
"Validation mse": f"{tot_mse / len(test_loader)}",
})
if k == len(test_loader)-1:
writer.add_images('truth', truth, global_step)
writer.add_images('pred', outputs, global_step)
val_loss = tot_mse / len(test_loader)
scheduler.step(val_loss)
print(f"ETS benchmark avg on train_set (ecmwf/cmorph): {ets_benckmark / (n_train)}")
if checkpoint_save:
torch.save(model.state_dict(), f"./cp.pth")
print(f"Checkpoint reached! :)")
writer.close()
def main():
args = arg_parse()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()
if device.type == "cuda":
torch.backends.cudnn.benchmark = True
model = my_UNet()
model.to(device=device)
train_model()