I have a simple network:
import torch
import torch.nn as nn
from collections import OrderedDict

class MyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(OrderedDict())
        self.features.add_module("conv1", nn.Conv2d(1, 2, kernel_size=2, stride=1, bias=False))
        self.features.add_module("conv2", nn.Conv2d(2, 2, kernel_size=2, stride=1, bias=False))
        # two 2x2 convolutions shrink the 3x3 input to a 1x1 map with 2 channels
        self.classifier = nn.Linear(2, 100000, bias=False)

    def forward(self, inp):
        features = self.features(inp)
        features = features.view(1, -1)  # flatten to shape (1, 2)
        output = self.classifier(features)
        return output
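As a quick sanity check of the shapes (a throwaway snippet, not part of the training script): the two 2x2 convolutions reduce the 3x3 input to a single spatial position, so the flattened feature vector has just 2 elements.

# shape check on CPU: 3x3 input -> conv1 -> 2x2 -> conv2 -> 1x1, with 2 channels
net = MyNet()
out = net(torch.rand(1, 1, 3, 3))
print(out.shape)  # torch.Size([1, 100000])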
The network is trained like this:
import time
from torch.optim import Adam
from tqdm import tqdm

my_net = MyNet().cuda()
optimizer = Adam(my_net.parameters())
criterion = nn.MSELoss(reduction='none')  # keep per-element losses so they can be weighted

inp = torch.rand(1, 1, 3, 3).cuda()
target = torch.rand(1, 100000).cuda()
weights = torch.rand(1, 100000).cuda()

for i in tqdm(range(1000)):
    output = my_net(inp)
    loss = criterion(output, target)
    loss = torch.mean(loss, dim=0).unsqueeze(0)  # average over the batch dim, keep shape (1, 100000)
    torch.autograd.backward(loss, weights)       # backward pass weighted per output element
    optimizer.step()
    time.sleep(0.5)
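For what it's worth, the weighted backward call should be equivalent to folding the weights into a scalar loss and calling backward on that, something like:

# equivalent formulation (sketch): reduce with the weights first, then backward
loss = criterion(output, target)        # shape (1, 100000)
weighted_loss = (loss * weights).sum()  # scalar
weighted_loss.backward()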
At the end of every batch, the memory used by my GPU creeps up slightly; the effect is far more pronounced with a bigger network. Why is this happening?
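For anyone reproducing this, a minimal way to log GPU memory per iteration (a sketch using PyTorch's allocator counters; log_gpu_memory is just a name I made up):

def log_gpu_memory(step):
    # memory_allocated: bytes held by live tensors; memory_reserved: cache the
    # allocator has claimed from the driver (roughly what nvidia-smi reports)
    allocated = torch.cuda.memory_allocated() / 2**20
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f"step {step}: allocated={allocated:.1f} MiB, reserved={reserved:.1f} MiB")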