Hello,
I'm training a VAE on a dataset of 256×256 satellite images. Here is some of my code:
class VAE(nn.Module):
    """Convolutional VAE: conv1 -> conv2 -> FC bottleneck -> deconv2 -> deconv1.

    NOTE(review): with 256x256 inputs and kernel_size=5 the conv stack yields a
    248x248x50 feature map, so ``fc1`` is Linear(248*248*50 = 3_075_200, 700)
    -- roughly 2.15e9 weights, ~8.6 GB in fp32 for the weight alone (plus the
    same again for its gradient and twice more for Adam moments). This single
    layer is almost certainly the cause of the reported CUDA OOM; add strided
    convolutions or pooling to downsample before the linear layers.
    """

    def __init__(self, inp_s, conv_kernel_size1):
        """inp_s: input batch shape (N, C, H, W); conv_kernel_size1: square kernel size."""
        super(VAE, self).__init__()
        self.n1 = 1    # input channels (a single band is selected by the caller)
        self.n2 = 20   # channels after conv1
        self.n3 = 50   # channels after conv2
        self.padding = 0
        self.stride = 1
        # Conv output size: (H - K + 2*P) / S + 1.
        # Bug fix: the original subtracted 2*padding, which is wrong whenever
        # padding > 0 (harmless today only because padding == 0).
        self.size_out_conv1 = int((inp_s[2] - conv_kernel_size1 + 2 * self.padding) / self.stride) + 1
        self.size_out_conv2 = int((self.size_out_conv1 - conv_kernel_size1 + 2 * self.padding) / self.stride) + 1
        flat = self.size_out_conv2 * self.size_out_conv2 * self.n3
        self.fc1 = nn.Linear(flat, 700)
        self.fc21 = nn.Linear(700, 20)   # mu head
        self.fc22 = nn.Linear(700, 20)   # logvar head
        self.fc3 = nn.Linear(20, 700)
        self.fc4 = nn.Linear(700, flat)
        self.conv1 = nn.Conv2d(self.n1, self.n2, kernel_size=conv_kernel_size1)
        self.conv2 = nn.Conv2d(self.n2, self.n3, kernel_size=conv_kernel_size1)
        self.deconv1 = nn.ConvTranspose2d(self.n2, self.n1, kernel_size=conv_kernel_size1)
        self.deconv2 = nn.ConvTranspose2d(self.n3, self.n2, kernel_size=conv_kernel_size1)

    def encode(self, x):
        """Return (mu, logvar, conv_feature_shape) for input batch x."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        s = x.shape
        x = x.view(-1, s[1] * s[2] * s[3])
        # Bug fix: fc1 was evaluated twice (the first result was discarded),
        # doubling the cost of the largest layer. One pass now.
        h1 = F.dropout(F.relu(self.fc1(x)), training=self.training)
        return self.fc21(h1), self.fc22(h1), s

    def reparameterize(self, mu, logvar):
        """Sample z = mu + sigma * eps with eps ~ N(0, I)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, s):
        """Map latent z back to image space; s is the conv feature shape to unflatten to."""
        h3 = F.relu(self.fc3(z))
        x = F.relu(self.fc4(h3))
        x = x.view(s[0], s[1], s[2], s[3])
        x = F.relu(self.deconv2(x))
        x = self.deconv1(x)
        # Sigmoid keeps outputs in (0, 1), as required by binary_cross_entropy.
        return torch.sigmoid(x)

    def forward(self, x):
        mu, logvar, s = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, s), mu, logvar
def loss_function(recon_x, x, mu, logvar):
    """Compute the VAE objective terms.

    Returns (total, bce, kld, mse), where total = BCE + KLD is the quantity
    optimized; MSE is computed for monitoring only and does not enter the loss.
    """
    mse = F.mse_loss(recon_x, x, reduction="sum")
    bce = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior,
    # written as 0.5 * sum(mu^2 + exp(logvar) - logvar - 1).
    kld = 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1)
    return bce + kld, bce, kld, mse
def train(train_dl, model, epoch, inp_im, out_im, lr1):
    """Run one training epoch and return per-sample averages (total, bce, kld, mse).

    NOTE(review): a fresh Adam optimizer is constructed on every call, so its
    moment estimates reset each epoch — consider building it once in the caller
    and passing it in. `epoch`, `inp_im` and `out_im` are unused but kept so
    the existing call site still works.

    Relies on the module-level `device` and `loss_function`.
    """
    torch.backends.cudnn.benchmark = True
    optimizer = torch.optim.Adam(model.parameters(), lr=lr1)
    model.train()
    train_loss = 0.0
    bce_loss = 0.0
    kld_loss = 0.0
    mse_loss = 0.0
    for idx, (data, label) in enumerate(train_dl):
        # Keep only the first band, as shape (N, 1, H, W), on the device.
        # (The deprecated Variable wrapper was removed; tensors track autograd
        # directly since PyTorch 0.4.)
        data1 = data[:, 0, :, :].unsqueeze(1).to(device)
        optimizer.zero_grad()  # clear grads via the optimizer (was model.zero_grad())
        out, mu, var = model(data1)
        loss, bce, kld, mse = loss_function(out, data1, mu, var)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        bce_loss += bce.item()
        kld_loss += kld.item()
        mse_loss += mse.item()
    n = len(train_dl.dataset)
    return train_loss / n, bce_loss / n, kld_loss / n, mse_loss / n
def test(epoch, model):
    """Evaluate on the module-level `test_loader`; return per-sample (total, mse).

    Runs under torch.no_grad() so no autograd buffers are kept.
    Relies on the module-level `device`, `test_loader` and `loss_function`;
    `epoch` is unused but kept for interface compatibility.
    """
    model.eval()
    test_loss = 0.0
    test_mse_loss = 0.0
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            # Fix: the original copied the full multi-band batch to the GPU and
            # then sliced it, holding two device copies per batch. Slice on the
            # CPU first and move only the single-band tensor.
            data1 = data[:, 0, :, :].unsqueeze(1).to(device)
            recon_batch, mu, logvar = model(data1)
            loss, bce, kld, mse = loss_function(recon_batch, data1, mu, logvar)
            test_loss += loss.item()
            test_mse_loss += mse.item()
    n = len(test_loader.dataset)
    return test_loss / n, test_mse_loss / n
if __name__ == "__main__":
    # Fixed: description said "MNIST" although the data is satellite imagery.
    parser = argparse.ArgumentParser(description='VAE satellite-image example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('-ks', '--kernel-size', type=int, default=5, metavar='N',
                        help='kernel size (convolution)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-3, metavar='',
                        help='learning rate (default: 1e-3)')
    # Bug fix: train() is invoked with args.input_img / args.output_img below,
    # but these options were never defined, so argparse's Namespace raised
    # AttributeError at the first call. Declared here with harmless defaults.
    parser.add_argument('--input-img', default=None, metavar='PATH',
                        help='optional path for saving input images')
    parser.add_argument('--output-img', default=None, metavar='PATH',
                        help='optional path for saving reconstructed images')
    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
    transform1 = transforms.Compose([transforms.ToTensor()])
    train1 = ImageFolder("/home/zaianir/Documents/dataset_s2/train", transform1)
    test1 = ImageFolder("/home/zaianir/Documents/dataset_s2/test", transform1)
    train_loader = torch.utils.data.DataLoader(train1, batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(test1, batch_size=args.batch_size, shuffle=False, **kwargs)
    # Peek at one batch so the model is sized from the real input shape (N, C, H, W).
    data, target = next(iter(train_loader))
    inp_s = data.shape
    model = VAE(inp_s, args.kernel_size).to(device)
    for epoch in range(1, args.epochs + 1):
        moy_train_loss, av_bce_loss, av_kld_loss, av_mse_loss = train(
            train_loader, model, epoch, args.input_img, args.output_img, args.learning_rate)
        test_l, mse_test_l = test(epoch, model)
I’m getting this error:
load conda/4.5.4 : OK
Traceback (most recent call last):
File “/home/uz/zaianir/scratch/VAE22.py”, line 194, in
moy_train_loss,av_bce_loss, av_kld_loss, av_mse_loss =train(train_loader, model, epoch, args.input_img, args.output_img, args.learning_rate)
File “/home/uz/zaianir/scratch/VAE22.py”, line 96, in train
loss.backward()
File “/home/uz/zaianir/.conda/envs/myenv2/lib/python3.7/site-packages/torch/tensor.py”, line 102, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File “/home/uz/zaianir/.conda/envs/myenv2/lib/python3.7/site-packages/torch/autograd/init.py”, line 90, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 7.64 GiB (GPU 0; 31.72 GiB total capacity; 20.76 GiB already allocated; 5.01 GiB free; 3.61 GiB cached)
Things I have already tried:
- I run my validation code inside a `torch.no_grad()` block.
- I set `torch.backends.cudnn.benchmark = True` before training.
- I decreased the batch size from 128 to 20.
But I still get the same error.
Could you please help me?
Thank you.