Hello. I encountered a confusing problem when I write a simple GRU demo, that is the GPU memory keep going up every iteration. I don’t know if there are some mistakes in my code.

my code:

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class tdModel(nn.Module):
def __init__(self, input_dim=101, num_filters=196, win_size=15, stride=4, gru_hidden=128, batch_size=5):
super(tdModel,self).__init__()
self.gru_hidden_size = gru_hidden
self.batch_size = batch_size
# model architecture
self.conv2d = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(win_size, input_dim), stride=(stride,1))
self.gru1 = nn.GRU(input_size=num_filters, hidden_size=gru_hidden,dropout=0.2, batch_first=True)
self.gru_hidden1 = self.init_gru_hidden(batch_size)
self.gru2 = nn.GRU(input_size=gru_hidden, hidden_size=gru_hidden, dropout=0.2, batch_first=True)
self.gru_hidden2 = self.init_gru_hidden(batch_size)
self.dense = nn.Linear(gru_hidden, 1)
def init_gru_hidden(self,batch_size):
return torch.zeros(1, batch_size, self.gru_hidden_size).cuda()
def forward(self, X):
X = X.unsqueeze(1)
X = self.conv2d(X).squeeze()
X = X.transpose(1,2)
X, self.gru_hidden1 = self.gru1(X, self.gru_hidden1)
X, self.gru_hidden2 = self.gru2(X, self.gru_hidden2)
X = F.sigmoid(self.dense(X))
return X
class audioDataset(Dataset):
def __init__(self):
super(audioDataset,self).__init__()
self.X = np.load('./XY_train/X.npy')
self.Y = np.load('./XY_train/Y.npy')
self.X = torch.FloatTensor(self.X)[:25,:,:].to(device)
self.Y = torch.FloatTensor(self.Y)[:25,:,:].to(device)
def __len__(self):
return self.X.shape[0]
def __getitem__(self,idx):
X = self.X[idx]
Y = self.Y[idx]
return (X,Y)
if __name__=='__main__':
model = tdModel()
model.to(device)
opt = Adam(lr=0.0001, weight_decay=0.01,params=model.parameters())
criterion = nn.BCELoss()
epochs = 50
dataset = audioDataset()
dataloader = DataLoader(dataset, batch_size=5,shuffle=True)
for ep in range(epochs):
for batch_sample in dataloader:
X,Y = batch_sample
opt.zero_grad()
ret = model(X)
loss = criterion(ret,Y)
print("Epoch {}: {}".format(ep, loss))
loss.backward(retain_graph=True)
opt.step()
```