If you save a state_dict with torch.save and later load it (or another one) back into a model, load_state_dict doesn't simply overwrite the weights of your current model in place. torch.load first materializes the new values in GPU memory, and only afterwards is the old GPU memory (possibly) released.
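My understanding of the mechanism, as a sketch (the checkpoint name 'model.pth' is hypothetical, and I'm assuming it was saved from a CUDA model):

import torch

# torch.load() restores tensors to the device they were saved from, so a
# checkpoint written from a CUDA model is re-materialized on the GPU *before*
# load_state_dict() copies its values into the existing parameters.
state = torch.load('model.pth')                      # fresh CUDA allocations
print(torch.cuda.memory_allocated())                 # usage has gone up

# Mapping the checkpoint to CPU avoids the extra GPU allocations;
# load_state_dict() then copies the CPU values into the CUDA parameters.
state = torch.load('model.pth', map_location='cpu')  # no new GPU memory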
If I set my vector length to 4900, PyTorch eventually releases the unused GPU memory and everything goes fine… If I set it to 5000, however, GPU memory usage grows with every load and is never released.
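To distinguish a real leak from the caching allocator merely holding blocks for reuse, one can compare allocated vs. cached bytes (a sketch; memory_cached() is the name of the call in this PyTorch version):

import torch

print(torch.cuda.memory_allocated())  # bytes held by live tensors
print(torch.cuda.memory_cached())     # bytes held by the caching allocator (includes reusable blocks)
torch.cuda.empty_cache()              # return unused cached blocks to the driver
print(torch.cuda.memory_cached())     # drops if the memory was only cached, not leaked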
To reproduce:
(First, some environment info:)
Hardware: AWS p3.2xlarge (which has a Tesla V100)
$ uname -a
Linux ip-10-5-5-70 4.4.0-1074-aws #84-Ubuntu SMP Thu Dec 6 08:57:58 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

>>> torch.__version__
'1.0.0'
import sys
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
######################
def get_GPU_memory():
    """
    Return the number of bytes currently allocated on the active GPU.
    """
    try:
        dev = int(os.environ['CUDA_VISIBLE_DEVICES'])
    except (KeyError, ValueError):
        dev = 0
    mem = torch.cuda.memory_allocated(device=dev)
    return mem
#################################
class StatePredictor(nn.Module):
    def __init__(self, dim, options={}):
        super(StatePredictor, self).__init__()
        self.relu = torch.nn.LeakyReLU(negative_slope=0.001, inplace=False)
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)
        self.fc1 = nn.Linear(dim, dim).double()
        self.fc2 = nn.Linear(dim, dim).double()
        self.fc3 = nn.Linear(dim, dim).double()
        self.fc4 = nn.Linear(dim, dim).double()
        self.fc5 = nn.Linear(dim, dim).double()
        self.fc6 = nn.Linear(dim, dim).double()
        self.fc_final = nn.Linear(dim, 2).double()
        self.dropout_p = 0.1

    def forward(self, H):
        x = H
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = torch.tanh(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = torch.tanh(x)
        x = self.fc5(x)
        x = self.relu(x)
        x = self.fc6(x)
        x = torch.tanh(x)
        x = self.fc_final(x)
        x = F.softmax(x, dim=0)
        return x
###########################
if __name__ == '__main__':
    print("\n[%d] MEMORY before creating layers\n" % get_GPU_memory())
    dirpath = "models"
    dim = int(sys.argv[1])
    predictor = StatePredictor(dim)  # Create a simple fully-connected network
    if torch.cuda.is_available():
        predictor = predictor.cuda()  # Move the model onto the GPU
    print("[%d] MEMORY after creating layers\n" % get_GPU_memory())
    torch.save(predictor.state_dict(), dirpath + '/StatePredictor.pth')  # Save the state_dict

    # If 'load_state_dict' replaces the model's weights and biases in place,
    # why does GPU memory usage go up with each iteration?
    step = 0
    while True:
        step += 1
        predictor.load_state_dict(torch.load(dirpath + '/StatePredictor.pth'))  # Load the state_dict
        if step > 20:
            time.sleep(10)
            torch.cuda.empty_cache()
        print("%d.) [%d] MEMORY after loading state_dict" % (step, get_GPU_memory()))
###########################
Usage:
python test.py 5000
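For what it's worth, a variation of the load step that might avoid the growth (a sketch, untested on this instance): mapping the checkpoint to CPU so each iteration doesn't create fresh CUDA tensors.

state = torch.load(dirpath + '/StatePredictor.pth', map_location='cpu')  # no new GPU allocations
predictor.load_state_dict(state)  # copies CPU values into the CUDA parameters in place
del state  # drop the temporary CPU copy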