GPU memory keeps going up every iteration

Hello. I ran into a confusing problem while writing a simple GRU demo: the GPU memory usage keeps going up every iteration. I don't know whether there is a mistake in my code.
My code:

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class tdModel(nn.Module):
    def __init__(self, input_dim=101, num_filters=196, win_size=15, stride=4, gru_hidden=128, batch_size=5):
        super(tdModel,self).__init__()

        self.gru_hidden_size = gru_hidden
        self.batch_size = batch_size

        # model architecture
        self.conv2d = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(win_size, input_dim), stride=(stride,1))

        self.gru1 = nn.GRU(input_size=num_filters, hidden_size=gru_hidden,dropout=0.2, batch_first=True)
        self.gru_hidden1 = self.init_gru_hidden(batch_size)

        self.gru2 = nn.GRU(input_size=gru_hidden, hidden_size=gru_hidden, dropout=0.2, batch_first=True)
        self.gru_hidden2 = self.init_gru_hidden(batch_size)

        self.dense = nn.Linear(gru_hidden, 1)

    def init_gru_hidden(self,batch_size):
        return torch.zeros(1, batch_size, self.gru_hidden_size).cuda()

    def forward(self, X):

        X = X.unsqueeze(1)
        X = self.conv2d(X).squeeze()

        X = X.transpose(1,2)
        X, self.gru_hidden1 = self.gru1(X, self.gru_hidden1)

        X, self.gru_hidden2 = self.gru2(X, self.gru_hidden2)

        X = F.sigmoid(self.dense(X))
        return X

class audioDataset(Dataset):

    def __init__(self):
        super(audioDataset,self).__init__()
        self.X = np.load('./XY_train/X.npy')
        self.Y = np.load('./XY_train/Y.npy')
        self.X = torch.FloatTensor(self.X)[:25,:,:].to(device)
        self.Y = torch.FloatTensor(self.Y)[:25,:,:].to(device)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self,idx):
        X = self.X[idx]
        Y = self.Y[idx]

        return (X,Y)

if __name__=='__main__':

    model = tdModel()
    model.to(device)
    opt = Adam(lr=0.0001, weight_decay=0.01,params=model.parameters())
    criterion = nn.BCELoss()

    epochs = 50
    dataset = audioDataset()

    dataloader = DataLoader(dataset, batch_size=5,shuffle=True)
    for ep in range(epochs):
        for batch_sample in dataloader:
            X,Y = batch_sample

            opt.zero_grad()

            ret  = model(X)
            loss = criterion(ret,Y)
            print("Epoch {}: {}".format(ep, loss))

            loss.backward(retain_graph=True)
            opt.step()

I’m no expert in RNNs, but it seems you are keeping the complete history of your hidden states, which uses more memory with each epoch and also slows down the code after a while.
If you re-initialize your hidden states at the start of each epoch with:

model.gru_hidden1 = model.init_gru_hidden(batch_size)
model.gru_hidden2 = model.init_gru_hidden(batch_size)

the memory usage and speed stay constant.
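For concreteness, here is a sketch of where that re-initialization would go in the training loop from your post (keeping retain_graph=True, since the hidden states are still carried between batches inside an epoch; 5 is the batch size used by the DataLoader):

for ep in range(epochs):
    # fresh hidden states at the start of each epoch, so the graphs
    # built during the previous epoch can be freed
    model.gru_hidden1 = model.init_gru_hidden(5)
    model.gru_hidden2 = model.init_gru_hidden(5)

    for batch_sample in dataloader:
        X, Y = batch_sample
        opt.zero_grad()
        ret = model(X)
        loss = criterion(ret, Y)
        loss.backward(retain_graph=True)  # still needed: hidden states link batches within an epoch
        opt.step()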

It works! Thank you very much. But I'm still confused about the details: why does the model allocate additional memory instead of overwriting the hidden states? (Actually I'm a rookie with PyTorch… OTL)

I’m glad it’s working!
The computation graph keeps growing as you never detach or reset the hidden states.
If I’m not mistaken, your original implementation would treat the sequential epochs as one long time series.
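If you want to keep passing the hidden state from batch to batch but stop the graph from growing, the usual trick is to detach the stored hidden states inside forward. A sketch of your forward with that change (untested against your data; torch.sigmoid replaces the deprecated F.sigmoid):

    def forward(self, X):
        X = X.unsqueeze(1)
        X = self.conv2d(X).squeeze()
        X = X.transpose(1, 2)

        X, h1 = self.gru1(X, self.gru_hidden1)
        self.gru_hidden1 = h1.detach()  # cut the graph: the next batch starts from a leaf tensor

        X, h2 = self.gru2(X, self.gru_hidden2)
        self.gru_hidden2 = h2.detach()

        return torch.sigmoid(self.dense(X))

With the hidden states detached, loss.backward() no longer needs retain_graph=True, and memory stays flat across iterations rather than just across epochs.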

OK, I see. Thanks again!