Hi @ptrblck, thanks for helping.
Unfortunately, I do not have a self-contained, runnable snippet (the data extraction and preprocessing is a key step in my application), but I can provide a partial view of the training function (some declarations and variables are omitted, and in some places it reads like pseudocode).
Here is the snippet:
def run_training(models, device, *args, **kwargs):
    """Alternating GAN training loop (partial view — several declarations omitted in the post).

    Trains models["discriminator"] and models["generator"] in alternating
    windows of `minibatch_size` batches, switching via `minibatch_status`.

    Args:
        models: dict with "generator" and "discriminator" nn.Modules.
        device: torch device the labels/losses are created on.
        *args, **kwargs: placeholder for the parameters omitted in the
            original post (lr_G, lr_D, lp, indexes_shuf, ... are assumed
            to be in scope — TODO confirm against the full source).
    """
    # Set optimizers (weight decay reused as the L1/L2 lambda — TODO confirm intent,
    # since compute_l1_norm() below also adds an explicit L1 penalty).
    optimizerG = optim.Adam(models["generator"].parameters(), lr=lr_G, weight_decay=lp["lambda"])
    optimizerD = optim.Adam(models["discriminator"].parameters(), lr=lr_D, weight_decay=lp["lambda"])
    # Define adversarial loss
    adversial_loss = nn.BCELoss().to(device)
    # Mini-batch bookkeeping for the alternating D/G schedule
    g_minibatch_counter = 0
    d_minibatch_counter = 0
    minibatch_size = 20
    minibatch_status = "discriminator"
    for i, index_shuf in enumerate(indexes_shuf):
        # Assign images to batch and run only if the selected view matches
        if augmented_X is not None and augmented_Y is not None:
            batchIndex += 1
            # If batch is full, start training
            if batchIndex == lp["batchSize"]:
                # Labels for discriminator training, with one-sided smoothing:
                # real in [0.7, 1.1], fake in [0.0, 0.2]
                real_label = torch.full((lp["batchSize"], 1), randrange(7, 12, 1) * 0.1,
                                        dtype=torch.float, device=device)
                fake_label = torch.full((lp["batchSize"], 1), randrange(0, 3, 1) * 0.1,
                                        dtype=torch.float, device=device)
                # Convert numpy objects to pytorch tensors
                Y_batch_tensor, Xes_batch_tensor = convert_arrays_to_tensors(Y_batch, Xes_batch)
                # Run conversion (generator forward pass)
                Y_pred = models["generator"](Xes_batch_tensor)
                #####################
                # Train discriminator
                #####################
                if minibatch_status == "discriminator":
                    models["generator"].eval()
                    models["discriminator"].train()
                    # NOTE(review): zero_grad() runs every batch but optimizerD.step()
                    # only fires once per `minibatch_size` window, so gradients from
                    # all but the last batch of the window are discarded — confirm
                    # this is intended (accumulate-then-step would skip this zeroing).
                    models["discriminator"].zero_grad()
                    if d_minibatch_counter <= int(minibatch_size / 2):
                        # Real sample.
                        # NOTE(review): the "real" pass feeds the generator INPUT
                        # (Xes_batch_tensor); a conventional GAN would feed the real
                        # target (Y_batch_tensor) here — verify against intent.
                        score_d_real = models["discriminator"](Xes_batch_tensor)
                        loss_D_real = adversial_loss(score_d_real, real_label)
                        loss_D_real.backward()
                        loss_D = loss_D_real
                    else:
                        # Fake sample (detached so G receives no gradient here)
                        score_d_fake = models["discriminator"](Y_pred.detach())
                        loss_D_fake = adversial_loss(score_d_fake, fake_label)
                        loss_D_fake.backward()
                        loss_D = loss_D_fake
                    # Update minibatch info
                    d_minibatch_counter += 1
                    if d_minibatch_counter == minibatch_size:
                        optimizerD.step()
                        minibatch_status = "generator"
                        d_minibatch_counter = 0
                #####################
                # Train generator
                #####################
                if minibatch_status == "generator":
                    models["generator"].train()
                    models["discriminator"].eval()
                    # Set the generator gradients to zero
                    models["generator"].zero_grad()
                    # Generative intensity (reconstruction) loss
                    loss_G_intensity, loss_G_type_str = compute_loss(lp, "train", Y_batch_tensor, Y_pred)
                    # Explicit L1 penalty on the generator weights
                    loss_G_norm = compute_l1_norm(models["generator"], lp["lambda"])
                    # Adversarial term: G wants D to score its output as real
                    score_d_g = models["discriminator"](Y_pred)
                    loss_D_G = adversial_loss(score_d_g, real_label)
                    loss_G = loss_G_intensity + loss_G_norm + (lp["discriminatorWeight"] * loss_D_G)
                    # Run optimizer
                    loss_G.backward()
                    optimizerG.step()
                    # Update minibatch info
                    g_minibatch_counter += 1
                    if g_minibatch_counter == minibatch_size:
                        minibatch_status = "discriminator"
                        g_minibatch_counter = 0
                # Reset batch index counter
                batchIndex = 0
# Build the model dict consumed by run_training().
# (Straight ASCII quotes restored — the post contained typographic quotes,
# which are invalid Python syntax.)
models = {}
models["generator"] = models_arch.Generator(lp).to(device)
models["generator"].apply(weight_init.weight_init)
# Discriminator: ResNet-152 backbone adapted to single-channel input,
# with the classifier head replaced by a sigmoid scalar for BCELoss.
models["discriminator"] = torch.hub.load('pytorch/vision', 'resnet152', pretrained=False)
models["discriminator"].conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
models["discriminator"].fc = nn.Sequential(
    nn.Linear(models["discriminator"].fc.in_features, 512),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(512, 1),
    nn.Sigmoid(),
)
models["discriminator"] = models["discriminator"].to(device)
run_training(...)  # actual arguments omitted in the post
On CPU it runs fine, so I don't think it's related to a device or dtype mismatch.
python -m torch.utils.collect_env returns the following:
Collecting environment information…
PyTorch version: 1.8.1+cu111
Is debug build: False
CUDA used to build PyTorch: 11.1
ROCM used to build PyTorch: N/A
OS: Arch Linux (x86_64)
GCC version: (GCC) 10.2.0
Clang version: Could not collect
CMake version: version 3.19.3
Python version: 3.9 (64-bit runtime)
Is CUDA available: True
CUDA runtime version: Could not collect
GPU models and configuration:
GPU 0: TITAN Xp
GPU 1: Quadro P6000
Nvidia driver version: 460.32.03
cuDNN version: Probably one of the following:
/opt/cudnn6/lib64/libcudnn.so.6.0.21
/usr/lib/libcudnn.so.8.0.5
/usr/lib/libcudnn_adv_infer.so.8.0.5
/usr/lib/libcudnn_adv_train.so.8.0.5
/usr/lib/libcudnn_cnn_infer.so.8.0.5
/usr/lib/libcudnn_cnn_train.so.8.0.5
/usr/lib/libcudnn_ops_infer.so.8.0.5
/usr/lib/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A
Versions of relevant libraries:
[pip3] numpy==1.20.2
[pip3] torch==1.8.1+cu111
[pip3] torchaudio==0.8.1
[pip3] torchvision==0.9.1+cu111
[conda] Could not collect
Thank you!