What could be causing this "Killed" error?

Hello, everyone

During training, memory usage keeps rising every epoch, and eventually training stops with a "Killed" error.
Why does this happen?

For example, in the first epoch memory utilization is around 10%, but after 20 epochs it exceeds 95% and training is terminated.

import argparse
import copy
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import config_256_Test
import dataset
import network
import utils

timestr = time.strftime("%Y_%m_%d_%H_%M_%S_")

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('./log/Test/256/')

GPU_NUM = 0
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)  # change allocation of current GPU
print('Current cuda device:', torch.cuda.current_device())  # check
model_path = "./modelsave/Test/256/"

params = config_256_Test.set_params()

print('Set dataloader')
train_set = dataset.predDataset('./datasets/train/')
train_loader = DataLoader(train_set, params.batch_size, shuffle=True)

validation_set = dataset.predDataset('./datasets/validation/')
validation_loader = DataLoader(validation_set, params.batch_size, shuffle=True)

test_set = dataset.predDataset('./datasets/test/')
test_loader = DataLoader(test_set, params.batch_size, shuffle=True)

Classifier = network.CNN().to(device)

Encoder = network.Encoder(batch_size=params.batch_size,
                          input_size=params.numData - params.numStart,
                          hidden_size=params.hidden_size[0],
                          num_layers=3,  # 1
                          num_pred=params.numPred,
                          device=device).to(device)

print('Define network')

Generator = network.Generator(batch_size=params.batch_size,
                              input_size=params.hidden_size[0] + params.latent_size + params.numCond,
                              hidden_size=params.hidden_size[1],
                              num_layers=3,
                              num_pred=params.numPred,
                              out_size=params.numOut,
                              device=device).to(device)

Discriminator = network.Discriminator(batch_size=params.batch_size,
                                      input_size=params.numOut + params.numCond + params.hidden_size[0],
                                      hidden_size=params.hidden_size[2],
                                      num_layers=3,
                                      device=device).to(device)

parser = argparse.ArgumentParser(description='PyTorch DSA-GAN Training')
parser.add_argument('--optim', default='vttcg', type=str, help='optimizer')
parser.add_argument('--finallr', type=float, default=0.1, help='final learning rate')
parser.add_argument('--weight_decay', default=5e-4, type=float, help='weight decay for optimizers')
parser.add_argument('--batchsize', type=int, default=256, help='batch size')

args = parser.parse_args()
args.tied = True


class Train():
    def __init__(self, Classifier, Encoder, Generator, Discriminator, learning_rate1, learning_rate2, betas, ratio,
                 device):
        self.Classifier = Classifier
        self.Encoder = Encoder
        self.Generator = Generator
        self.Discriminator = Discriminator
        self.learning_rate1 = learning_rate1
        self.learning_rate2 = learning_rate2
        self.betas = betas
        self.device = device
        self.ratio = ratio
        self.criterionClassifier = nn.CrossEntropyLoss() 
        
        if args.optim == 'adam':
            self.optimClassifier = optim.Adam(self.Classifier.parameters(), lr=self.learning_rate1)

            self.criterionL2 = nn.MSELoss()
            self.criterionLS = nn.MSELoss()

            self.optimGenerator = optim.Adam([{'params': self.Generator.parameters()},
                                              {'params': self.Encoder.parameters()}],
                                             lr=self.learning_rate2,
                                             betas=self.betas
                                             )

            self.optimDiscriminator = optim.Adam([{'params': self.Discriminator.parameters()}],
                                                 lr=self.learning_rate2,
                                                 betas=self.betas
                                                 )

    def trainClassifier(self, data, target):
        self.Classifier.train()
        self.optimClassifier.zero_grad()

        batch_size = data.shape[0]

        data = data.to(self.device)
        target = target.to(self.device)

        logit, out = self.Classifier(data)
        self.lossClassifier = self.criterionClassifier(logit, target[:, 0])
        self.lossClassifier.backward()
        self.optimClassifier.step()

        return self.lossClassifier.item()

    def testClassifier(self, data):
        self.Classifier.eval()

        data = data.to(self.device)

        with torch.no_grad():
            _, out = self.Classifier(data)

        return out

    def trainGenerator(self, data, target, data_pred, numPred, latent_size):
        self.Encoder.train()
        self.Generator.train()
        self.Discriminator.train()
        batch_size = data.shape[0]
        start_traingenerator = time.time()

        self.optimGenerator.zero_grad()
        encoded = self.Encoder(data)

        target_matrix = utils.targetMatrix(numPred, target).to(self.device)
        latent = utils.generateLatent(batch_size, numPred, latent_size).to(self.device)
        z = torch.cat([latent, encoded, target_matrix], dim=2).to(self.device)
        z_fix = torch.cat([torch.zeros(latent.shape).to(self.device), encoded, target_matrix], dim=2).to(self.device)

        y = Variable(torch.ones(batch_size, 1).to(self.device))

        fake = self.Generator(z)
        fake_fix = self.Generator(z_fix)
        fakeD = torch.cat([fake, encoded, target_matrix], dim=2)

        D_fake = self.Discriminator(fakeD)
        weight = torch.ones([data_pred.shape[1], 1]).to(self.device)
        weight = torch.cat([weight, 15 * weight], dim=1)

        weight_true = data_pred[:, :, :2] * weight[np.newaxis, :, :]
        weight_fake = fake_fix[:, :, :2] * weight[np.newaxis, :, :]

        lossL2 = torch.sqrt(self.criterionL2(weight_true, weight_fake))
        lossFake = self.criterionLS(D_fake, y)
        self.lossGenerator = lossFake + self.ratio * lossL2

        self.lossGenerator.backward()

        self.optimGenerator.step()

        return lossL2.item(), lossFake.item(), torch.mean(D_fake)

    def trainDiscriminator(self, data, target, data_pred, numPred, latent_size):
        self.Encoder.train()
        self.Generator.train()
        self.Discriminator.train()
        batch_size = data.shape[0]
        start_traindiscriminator = time.time()

        self.optimDiscriminator.zero_grad()
        y_real = Variable(torch.ones(batch_size, 1).to(self.device))
        y_fake = Variable(torch.zeros(batch_size, 1).to(self.device))

        target_matrix = utils.targetMatrix(numPred, target).to(self.device)
        encoded = self.Encoder(data)

        realD = torch.cat([data_pred.to(self.device), encoded, target_matrix], dim=2).to(self.device)

        D_real = self.Discriminator(realD)
        lossReal = self.criterionLS(D_real, y_real)

        latent = utils.generateLatent(batch_size, numPred, latent_size).to(self.device)
        z = torch.cat([latent, encoded, target_matrix], dim=2).to(self.device)

        fake = self.Generator(z)
        fakeD = torch.cat([fake, encoded, target_matrix], dim=2).to(self.device)

        D_fake = self.Discriminator(fakeD)
        lossFake = self.criterionLS(D_fake, y_fake)

        self.lossDiscriminator = (lossReal + lossFake) * 0.5
        self.lossDiscriminator.backward()
        self.optimDiscriminator.step()

        return lossReal.item(), lossFake.item(), torch.mean(D_real), torch.mean(D_fake)

    def Test(self, data, target, numPred, latent_size):
        with torch.no_grad():
            self.Encoder.eval()
            self.Generator.eval()
            self.Discriminator.eval()
            batch_size = data.shape[0]
            data = data.to(self.device)
            encoded = self.Encoder(data)

            if target.shape[1] == 1:
                target = [target for _ in range(batch_size)]
                target = torch.tensor(target).view([batch_size, 1]).to(self.device)

            latent = utils.generateLatent(batch_size, numPred, latent_size).to(self.device)
            target_matrix = utils.targetMatrix(numPred, target).to(self.device)
            z = torch.cat([latent, encoded, target_matrix], dim=2).to(self.device)
            fake = self.Generator(z)
        return fake

    def Test_fix(self, data, target, numPred, latent_size):
        with torch.no_grad():
            self.Encoder.eval()
            self.Generator.eval()
            self.Discriminator.eval()
            batch_size = data.shape[0]
            data = data.to(self.device)
            encoded = self.Encoder(data)

            if target.shape[1] == 1:
                target = [target for _ in range(batch_size)]
                target = torch.tensor(target).view([batch_size, 1]).to(self.device)

            latent = torch.zeros((batch_size, numPred, latent_size)).to(self.device)
            target_matrix = utils.targetMatrix(numPred, target).to(self.device)
            z = torch.cat([latent, encoded, target_matrix], dim=2).to(self.device)
            fake = self.Generator(z)
        return fake


Trainer = Train(Classifier=Classifier,
                Encoder=Encoder,
                Generator=Generator,
                Discriminator=Discriminator,
                learning_rate1=params.learning_rate_cls,
                learning_rate2=params.learning_rate_reg,
                betas=(0.5, 0.999),
                device=params.device,
                ratio=params.loss_ratio)

############################################# train GAN #############################################
if params.status == "train":
    print('Training GAN starts!')
    batch_idx = 0
    prev_err = 1e9
    for epoch in range(params.iteration):
        err = []
        losses = []
        start = time.time()
        for idx, data in enumerate(train_loader):
            dataHist, labelHist, dataPred, labelPred, rpHist = data
            dataHist = dataHist.to(device)
            labelHist = labelHist.to(device)
            dataPred = dataPred.to(device)
            labelPred = labelPred.to(device)
            rpHist = rpHist.to(device)

            labelTrain = Trainer.testClassifier(rpHist)
            lossRe, lossG, D_gen = Trainer.trainGenerator(dataHist[:, :, params.numStart:params.numData], labelTrain,
                                                          dataPred, params.numPred, params.latent_size)

            if idx % 100 == 0:
                print("Train Generator time (100):", time.time() - start)

            lossD_R, lossD_F, D_real, D_fake = Trainer.trainDiscriminator(
                dataHist[:, :, params.numStart:params.numData], labelTrain, dataPred, params.numPred,
                params.latent_size)

            if idx % 100 == 0:
                print("Train Discriminator time (100):", time.time() - start)

            loss = (lossRe, lossG, lossD_R, lossD_F)
            writer.add_scalar("Loss_pred/train_reconstruction", lossRe, batch_idx)
            writer.add_scalar("Loss_pred/train_generator", lossG, batch_idx)
            writer.add_scalar("Loss_pred/train_discriminator_real", lossD_R, batch_idx)
            writer.add_scalar("Loss_pred/train_discriminator_fake", lossD_F, batch_idx)
            losses.append(loss)
            batch_idx += 1

        losses = np.mean(losses, axis=0)
        print(
            'epoch: {}, lossRe: {:.3f}, lossG: {:.3f}, lossD_R: {:.3f}, lossD_F: {:.3f}, D(x|c): {:.3f}, Dis D(G(z|c)): {:.3f}, Gen D(G(z|c)): {:.3f}'.format(
                epoch, losses[0], losses[1], losses[2], losses[3], D_real, D_fake, D_gen))
        print("time :", time.time() - start)
        # Save the model
        torch.save(Classifier.state_dict(), params.model_save_directory + timestr + str(epoch) + "_classifier.pt")
        torch.save(Encoder.state_dict(), params.model_save_directory + timestr + str(epoch) + "_encoder.pt")
        torch.save(Generator.state_dict(), params.model_save_directory + timestr + str(epoch) + "_generator.pt")
        torch.save(Discriminator.state_dict(), params.model_save_directory + timestr + str(epoch) + "_discriminator.pt")

        if (epoch % 1 == 0):

            ades = []
            fdes = []

            for idx, data in enumerate(validation_loader):
                dataHist, labelHist, dataPred, labelPred, rpHist = data
                dataHist = dataHist.to(device)
                labelHist = labelHist.to(device)
                dataPred = dataPred.to(device)
                labelPred = labelPred.to(device)
                rpHist = rpHist.to(device)

                class_list = torch.tensor([[[0]], [[1]], [[2]]]).to(device)
                temp = copy.deepcopy(dataHist).to('cpu').numpy()
                pred_temp = copy.deepcopy(dataPred).to('cpu').numpy()

                labelTest = Trainer.testClassifier(rpHist)

                temp_true = np.cumsum(pred_temp, axis=1)
                out = Trainer.Test_fix(dataHist[:, :, params.numStart:params.numData], labelTest, params.numPred,
                                       params.latent_size)
                out = np.cumsum(out.to('cpu').numpy(), axis=1)
                ades.extend(utils.ade(out, temp_true).reshape([-1]))
                fdes.extend(utils.fde(out, temp_true).reshape([-1]))

            print('ade:', np.mean(ades), 'fde:', np.mean(fdes))
            writer.add_scalar("performance/ade", np.mean(ades), epoch)
            writer.add_scalar("performance/fde", np.mean(fdes), epoch)

That’s a lot of code to debug. It would be best if you could whittle it down to a smaller reproduction that still produces the error.

This line looks problematic:

return lossReal.item(), lossFake.item(), torch.mean(D_real), torch.mean(D_fake)

You should make use of .detach() there; otherwise the graph may be carried over between batches.
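
Here is a minimal, self-contained sketch of the idea (a toy linear model, not your DSA-GAN code): a returned tensor that is still attached to the graph keeps that whole graph reachable, while .detach() or .item() lets it be freed.

import torch

# Toy example (hypothetical model, only to illustrate graph retention).
model = torch.nn.Linear(10, 1)
x = torch.randn(32, 10)

loss = model(x).pow(2).mean()

kept_attached = loss            # still references the graph -> graph stays in memory
kept_detached = loss.detach()   # value only, the graph can be freed
kept_scalar = loss.item()       # plain Python float, safest for logging

print(kept_attached.requires_grad)  # True
print(kept_detached.requires_grad)  # False
print(type(kept_scalar))            # <class 'float'>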

Thanks!

Do you mean replacing

return lossReal.item(), lossFake.item(), torch.mean(D_real), torch.mean(D_fake)

with

return lossReal.item(), lossFake.item(), torch.mean(D_real.detach()), torch.mean(D_fake.detach())

?

Yes. Basically, any time you want to take something out of your training loop, you should call .detach() on it. .item() is fine too, because a plain Python number can't carry the graph with it.
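
For example, a hypothetical accumulation loop (a sketch, not your training script) that logs per-batch losses without keeping any graph alive across batches:

import torch

# Hypothetical loop, only to illustrate the pattern: store .item() (or .detach())
# results, never the graph-attached loss tensor itself.
losses = []
for _ in range(3):
    pred = torch.randn(8, 1, requires_grad=True)
    loss = pred.pow(2).mean()
    loss.backward()
    losses.append(loss.item())   # safe: plain float, the graph can be freed
    # losses.append(loss)        # unsafe: would keep every batch's graph alive

print(sum(losses) / len(losses))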

Thank you so much!
It’s difficult…