I am not sure your code is correct; I had to modify a couple of things to make it work:

```
import torch
import torch.nn as nn
import torch.optim as optim
import string
class MyLSTM(nn.Module):
    """Embedding -> LSTM -> linear model producing per-timestep class scores.

    Args:
        input_size: Number of rows in the embedding table, i.e. the number
            of distinct input indices.
        embed_size: Width of each embedding vector (also the LSTM input size).
        hidden_size: Number of features in the LSTM hidden state.
        output_size: Number of scores emitted for every timestep.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embed_size, hidden_size, output_size, n_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.encoder = nn.Embedding(input_size, embed_size)
        # nn.LSTM only applies dropout *between* stacked layers, so with a
        # single layer a non-zero value does nothing except raise a
        # UserWarning. Enable it only when there is more than one layer.
        self.rnn = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.5 if n_layers > 1 else 0.0,
        )
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input_):
        """Map index tensor ``input_`` of shape (B, T) to scores of shape (B, C, T).

        ``C`` equals ``output_size``.
        """
        encoded = self.encoder(input_)
        output, _ = self.rnn(encoded)
        output = self.decoder(output)
        # (B, T, C) -> (B, C, T): nn.CrossEntropyLoss wants the class axis second.
        return output.permute(0, 2, 1)
# The memory readings further down come from torch.cuda, so the whole
# experiment is pinned to the GPU.
device = torch.device("cuda")

# Small single-layer model: 100-entry embedding table, 100 output scores.
model = MyLSTM(
    input_size=100,
    embed_size=8,
    hidden_size=256,
    output_size=100,
    n_layers=1,
)
model = model.to(device)

# Module.to() returns the module itself, so rebinding keeps the same object.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
def fit_epoch(model, weights=None):
    """Run ten forward/backward passes over all-zero dummy batches.

    No optimizer step is taken. The gradients produced by the final
    ``backward()`` call are left on the model's parameters on return.

    Relies on the module-level ``device`` and ``criterion``.

    Args:
        model: Module mapping a (32, 80) long tensor to (32, C, 80) scores.
        weights: Unused; kept so existing call sites remain valid.
    """
    model.train()
    for _ in range(10):
        # Allocate the dummy batch directly on the target device instead of
        # building it on the CPU and copying it over.
        x = torch.zeros(32, 80, dtype=torch.long, device=device)
        y = torch.zeros(32, 80, dtype=torch.long, device=device)
        model.zero_grad()
        # The output is already (B, C, T); squeezing it was a no-op here and
        # would silently drop the batch axis for a batch of size one.
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
# Baseline: bytes held by live CUDA tensors before any training pass.
baseline_bytes = torch.cuda.memory_allocated()
print(baseline_bytes)

# After the passes, the gradients from the last backward() are still held.
fit_epoch(model)
after_fit_bytes = torch.cuda.memory_allocated()
print(after_fit_bytes)

# set_to_none=True drops the .grad tensors instead of zero-filling them,
# which is what lets the allocator reclaim their memory.
model.zero_grad(set_to_none=True)
after_release_bytes = torch.cuda.memory_allocated()
print(after_release_bytes)
```

But then running this on Colab gives me:

108730368

110368768

108730368

Which is what we expect, right?