GPU high memory usage

rogetrullo · March 24, 2017, 5:47pm

Hi,
I am starting with pytorch, and I have seen that my implementation requires more GPU memory than my tensorflow implementation of the same architecture.
Here’s a part of my code:

class seg_GAN(object):
    """Segmentation generator (``UNet``) with an optional adversarial discriminator.

    The constructor stores the hyper-parameters, builds the network(s) and the
    training ``DataLoader``; ``train`` runs the optimisation loop on the GPU.
    ``UNet``, ``Discriminator``, ``myDataSet``, ``weights_init`` and
    ``self.loss_G`` are used here but are not part of this excerpt.
    """

    def __init__(self, batch_size=10, height=512,width=512,channels=3, wd=0.0005,nfilters_d=64, checkpoint_dir=None, path_imgs=None, learning_rate=2e-8,lr_step=30000,lam_fcn=1, lam_adv=1,adversarial=False,nclasses=5):
        """Store the configuration and build the networks and the data loader.

        All arguments are kept as attributes of the same name. ``nclasses`` and
        ``channels`` are passed to ``UNet``; ``nclasses``, ``nfilters_d``,
        ``height`` and ``width`` are passed to ``Discriminator``, which is only
        built when ``adversarial`` is true. ``path_imgs`` is handed to
        ``myDataSet`` and ``batch_size`` to the ``DataLoader``.
        """
        self.adversarial = adversarial
        self.channels = channels
        self.lam_fcn = lam_fcn
        self.lam_adv = lam_adv
        self.lr_step = lr_step
        self.wd = wd
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.checkpoint_dir = checkpoint_dir
        self.path_imgs = path_imgs
        self.nfilters_d = nfilters_d
        self.organ_target = 1  # 1 eso 2 heart 3 trach 4 aorta
        self.nclasses = nclasses

        self.netG = UNet(self.nclasses, self.channels)
        self.netG.apply(weights_init)
        # The discriminator only exists in adversarial mode; every later use
        # of self.netD is guarded by the same flag.
        if self.adversarial:
            self.netD = Discriminator(self.nclasses, self.nfilters_d, self.height, self.width)
            self.netD.apply(weights_init)

        self.dst = myDataSet(self.path_imgs, is_transform=True)
        self.trainloader = data.DataLoader(self.dst, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def train(self, config):
        """Run the training loop for ``config.iterations`` passes over the loader.

        Each batch is moved to the GPU. In adversarial mode the discriminator
        is first updated on the one-hot ground truth (target: ones) and on the
        detached generator output (target: zeros); the generator is then
        updated through ``self.loss_G``. Without the adversarial flag only the
        generator step is performed.
        """
        print('verion ', torch.__version__)

        start = 0  # TODO change this so that it can continue when loading a model
        print("Start from:", start)

        # Buffers allocated once and reused for every batch.
        label_ones = torch.ones(self.batch_size)
        label_zeros = torch.zeros(self.batch_size)
        y_onehot = torch.FloatTensor(self.batch_size, self.nclasses, self.height, self.width)

        if self.adversarial:
            self.netD.cuda()
        self.netG.cuda()
        label_ones, label_zeros, y_onehot = label_ones.cuda(), label_zeros.cuda(), y_onehot.cuda()

        # NOTE(review): the loop below updates the buffers in place and then
        # feeds these Variables to the networks, so it presumably relies on
        # the Variables sharing storage with the buffers - confirm for the
        # PyTorch version in use.
        y_onehot_var = Variable(y_onehot)
        label_ones_var = Variable(label_ones)
        label_zeros_var = Variable(label_zeros)

        if self.adversarial:
            optimizerD = optim.Adam(self.netD.parameters(), lr=self.learning_rate, betas=(0.5, 0.999))
        optimizerG = optim.Adam(self.netG.parameters(), lr=self.learning_rate, betas=(0.5, 0.999))

        for it in range(start, config.iterations):  # epochs
            for i, (images, GT) in enumerate(self.trainloader):

                # Resize the reusable buffers to the size of the current batch
                # and clear the one-hot buffer before filling it.
                y_onehot.resize_(GT.size(0), self.nclasses, self.height, self.width)
                y_onehot.zero_()
                label_ones.resize_(GT.size(0))
                label_zeros.resize_(GT.size(0))

                images = Variable(images.cuda())
                GT = GT.cuda()

                # scatter_ needs the index tensor to have as many dims as the
                # target, so a singleton class dim is added to GT.
                y_onehot.scatter_(1, GT.view(GT.size(0), 1, GT.size(1), GT.size(2)), 1)

                GT = Variable(GT)  # N,H,W
                if self.adversarial:

                    ##########################
                    # Update Discriminator
                    ##########################
                    # train with real samples
                    self.netD.zero_grad()
                    output = self.netD(y_onehot_var)  # this must be in one hot
                    errD_real = F.binary_cross_entropy(output, label_ones_var)  # loss_D
                    errD_real.backward()  # update grads of netD

                    # train with fake
                    fake = self.netG(images)  # this is a prob map which we want to be similar to y_onehot
                    output = self.netD(fake.detach())  # only for speed, so grads of netG are not computed
                    errD_fake = F.binary_cross_entropy(output, label_zeros_var)

                    errD_fake.backward()

                    optimizerD.step()  # update the parameters of netD

                ############################
                # Update G network
                ###########################
                self.netG.zero_grad()
                if self.adversarial:
                    # `fake` (not detached) is reused so the gradient flows
                    # back into netG through the discriminator.
                    output_D = self.netD(fake)
                    errG = self.loss_G(fake, GT, label_ones_var, output_D)  # here we should use ones with the fakes
                else:
                    fake = self.netG(images)
                    errG = self.loss_G(fake, GT)

                errG.backward()  # backprop errors
                optimizerG.step()  # optimize only netG params

I guess I am not converting tensors to Variables correctly, or maybe the problem is that I am doing it inside the training loop. Could you please take a look and let me know what I can change to gain efficiency and save memory, if possible?
Thanks!

hyqneuron · March 26, 2017, 2:29am

Does it have anything to do with the fact that PyTorch doesn’t use static buffers, and re-allocates buffers in every pass? I’d like to get an answer from someone who knows this better.

ryanleary · June 6, 2017, 2:45pm

Seeing similar issues here with RNNs. Any thoughts?

dpernes · June 6, 2017, 2:51pm

Try setting requires_grad=False on all Variables that do not require gradient computation (i.e. input images and labels). For instance, replace

images = Variable(images.cuda())

by

images = Variable(images.cuda(), requires_grad=False)

During inference (i.e. when you do not want to backpropagate gradients), you should also set volatile=True.

These things will help reduce memory consumption a bit. For more information on this, check http://pytorch.org/docs/notes/autograd.html.