Hi @albanD,
I am trying to implement a simple model that has only the one customized linear layer you saw earlier. Since there is a single layer, I expect that: (1) no other tensor holds onto input; (2) if I delete input in the forward pass, the GPU should release it and free some memory. However, input seems to survive no matter what…
Would you mind taking a look?
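To make my expectation concrete, here is a minimal sketch of what I assumed del does (torch.cuda.memory_allocated is the standard allocator counter; the sizes match my 512 x 10000 batch):

import torch

x = torch.randn(512, 10000, device='cuda')       # ~19.5 MB of float32
print(torch.cuda.memory_allocated() / 1024 ** 2)  # ~19.5
del x                                             # last reference is gone
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated() / 1024 ** 2)  # ~0.0

The full script: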
from torch.nn.modules.module import Module
from torch.nn.modules.utils import _single, _pair, _triple
from torch.nn.parameter import Parameter
from tensorly.decomposition import tucker
import tensorly as tl
import gc
import numpy as np
from torch.nn import init
from torch.autograd import Function
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import torch
import math
import argparse
from pytorch_memlab import profile
from pytorch_memlab.mem_reporter import MemReporter
import multiprocessing
multiprocessing.set_start_method('spawn', True)
tl.set_backend('pytorch')
def numpy2pytorch(a, device=torch.device("cuda")):
    # Copy a numpy array into a torch tensor on `device`
    return torch.from_numpy(a.copy()).to(device)

def pytorch2numpy(a):
    # Detach from the graph and bring the tensor back to host memory as numpy
    return a.detach().cpu().numpy()
# Inherit from Function
class MyLinearFunction(Function):
# Note that both forward and backward are @staticmethods
@staticmethod
# bias is an optional argument
def forward(ctx, input, weight, bias=None):
output = input.mm(weight.t())
if bias is not None:
output += bias.unsqueeze(0).expand_as(output)
shape = input.size()
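        # Tucker ranks: compress the batch mode 8x (512 -> 64) and the
        # feature mode 100x (10000 -> 100)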
        ranks = [shape[0] // 8, shape[1] // 100]
        tl_img = tl.tensor(input, device='cuda:0')
        core, tucker_factors = tucker(
            tl_img, ranks=ranks, init='random', tol=100e-5, n_iter_max=100)
print(pytorch2numpy(input).nbytes/1024/1024,
pytorch2numpy(core).nbytes/1024/1024,
pytorch2numpy(tucker_factors[0]).nbytes/1024/1024,
pytorch2numpy(tucker_factors[1]).nbytes/1024/1024,
)
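        # Drop the local references to the raw input and its tensorly alias;
        # empty_cache() can only return cached blocks that no live tensor occupies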
del input, tl_img
torch.cuda.empty_cache()
gc.collect()
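        # Keep only the compressed factors for backward; tensors stored as
        # plain ctx attributes stay alive on the GPU until backward runs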
ctx.core = core
ctx.factor0 = tucker_factors[0]
ctx.factor1 = tucker_factors[1]
ctx.save_for_backward(weight, bias)
return output
# This function has only a single output, so it gets only one gradient
@staticmethod
def backward(ctx, grad_output):
# This is a pattern that is very convenient - at the top of backward
# unpack saved_tensors and initialize all gradients w.r.t. inputs to
# None. Thanks to the fact that additional trailing Nones are
# ignored, the return statement is simple even when the function has
# optional inputs.
grad_input = grad_weight = grad_bias = None
core = ctx.core
factor0 = ctx.factor0
factor1 = ctx.factor1
weight, bias = ctx.saved_tensors
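        # Rebuild an (approximate) input from the Tucker factors; grad_weight
        # below is computed against this reconstruction, not the original input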
input = tl.tucker_to_tensor(
[core, [factor0, factor1]])
del core, factor0, factor1
torch.cuda.empty_cache()
gc.collect()
# These needs_input_grad checks are optional and there only to
# improve efficiency. If you want to make your code simpler, you can
# skip them. Returning gradients for inputs that don't require it is
# not an error.
if ctx.needs_input_grad[0]:
grad_input = grad_output.mm(weight)
if ctx.needs_input_grad[1]:
grad_weight = grad_output.t().mm(input)
if bias is not None and ctx.needs_input_grad[2]:
grad_bias = grad_output.sum(0).squeeze(0)
return grad_input, grad_weight, grad_bias
class MyLinear(nn.Module):
def __init__(self, input_features, output_features, bias=True):
super(MyLinear, self).__init__()
self.input_features = input_features
self.output_features = output_features
# nn.Parameter is a special kind of Tensor, that will get
# automatically registered as Module's parameter once it's assigned
# as an attribute. Parameters and buffers need to be registered, or
# they won't appear in .parameters() (doesn't apply to buffers), and
# won't be converted when e.g. .cuda() is called. You can use
# .register_buffer() to register buffers.
# nn.Parameters require gradients by default.
self.weight = nn.Parameter(
torch.Tensor(output_features, input_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
# You should always register all possible parameters, but the
# optional ones can be None if you want.
self.register_parameter('bias', None)
# Not a very smart way to initialize weights
self.weight.data.uniform_(-0.1, 0.1)
        # `bias` here is the constructor's bool flag (always not None), so
        # check the registered parameter instead
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)
def forward(self, input):
# See the autograd section for explanation of what happens here.
return MyLinearFunction.apply(input, self.weight, self.bias)
def extra_repr(self):
# (Optional)Set the extra information about this module. You can test
# it by printing an object of this class.
        return 'in_features={}, out_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )
class MyNet(nn.Module):
def __init__(self):
super(MyNet, self).__init__()
self.fc2 = MyLinear(10000, 10)
def forward(self, x):
x = self.fc2(x)
return x
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc2 = nn.Linear(10000, 10)
def forward(self, x):
x = self.fc2(x)
return x
def train(args, model, device, train_loader, optimizer, epoch):
reporter = MemReporter()
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
        # data, target = data.to(device), target.to(device)
        # Swap the MNIST batch for random 512 x 10000 data so the layer sees a
        # large input; numpy2pytorch already moves the tensors to `device`
        data = np.random.rand(512, 10000).astype(np.float32)
        target = np.random.randint(2, size=512)
        data, target = numpy2pytorch(data, device), numpy2pytorch(target, device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
print('========= before backward =========')
reporter.report()
loss.backward()
print('========= after backward =========')
reporter.report()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test(args, model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
# sum up batch loss
test_loss += F.nll_loss(output, target, reduction='sum').item()
# get the index of the max log-probability
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def main():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=512, metavar='N',
                        help='input batch size for training (default: 512)')
    parser.add_argument('--test-batch-size', type=int, default=512, metavar='N',
                        help='input batch size for testing (default: 512)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--save-model', action='store_true', default=False,
help='For Saving the current Model')
args = parser.parse_args()
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
    model = Net().to(device)
    # model = MyNet().to(device)  # swap in for the customized-layer run
optimizer = optim.SGD(model.parameters(), lr=args.lr,
momentum=args.momentum)
for epoch in range(1, args.epochs + 1):
train(args, model, device, train_loader, optimizer, epoch)
test(args, model, device, test_loader)
    if args.save_model:
torch.save(model.state_dict(), "mnist_cnn.pt")
if __name__ == '__main__':
main()
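Besides pytorch_memlab, I also spot-check the allocator directly around the del in forward. A rough sketch (torch.cuda.memory_allocated is a standard call; the helper name is mine):

import torch

def allocated_mb(tag):
    # memory_allocated counts bytes held by live tensors on the current CUDA device
    print(tag, torch.cuda.memory_allocated() / 1024 ** 2, 'MB')

# e.g. inside MyLinearFunction.forward:
#   allocated_mb('before del')
#   del input, tl_img
#   torch.cuda.empty_cache()
#   allocated_mb('after del')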
Memory consumption in both cases can be seen below:
With the default linear layer:
========= before backward =========
Element type Size Used MEM
-------------------------------------------------------------------------------
Storage on cuda:0
Parameter0 (10, 10000) 391.00K
Parameter1 (10,) 512.00B
Tensor2 (512, 10000) 19.53M
Tensor3 (512,) 4.00K
Tensor4 (512, 10) 20.00K
Tensor5 (1,) 512.00B
-------------------------------------------------------------------------------
Total Tensors: 5225643 Used Memory: 19.94M
The allocated memory on cuda:0: 20.05M
Memory differs due to the matrix alignment or invisible gradient buffer tensors
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Storage on cpu
Tensor6 (60000, 28, 28) 44.86M
Tensor7 (60000,) 469.00K
Tensor8 (10000, 28, 28) 7.48M
Tensor9 (10000,) 78.50K
Tensor10 (512, 1, 28, 28) 1.53M
Tensor11 (512,) 4.00K
Tensor12 (512, 1, 28, 28) 1.53M
Tensor13 (512,) 4.00K
Tensor14 (512,) 4.00K
Tensor15 (512, 1, 28, 28) 1.53M
Tensor16 (512,) 4.00K
Tensor17 (512, 1, 28, 28) 1.53M
-------------------------------------------------------------------------------
Total Tensors: 56557680 Used Memory: 59.01M
-------------------------------------------------------------------------------
========= after backward =========
Element type Size Used MEM
-------------------------------------------------------------------------------
Storage on cuda:0
Parameter0 (10, 10000) 391.00K
Parameter0.grad (10, 10000) 391.00K
Parameter1 (10,) 512.00B
Parameter1.grad (10,) 512.00B
Tensor2 (512, 10000) 19.53M
Tensor3 (512,) 4.00K
Tensor4 (512, 10) 20.00K
Tensor5 (1,) 512.00B
-------------------------------------------------------------------------------
Total Tensors: 5325653 Used Memory: 20.32M
The allocated memory on cuda:0: 20.41M
Memory differs due to the matrix alignment or invisible gradient buffer tensors
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Storage on cpu
Tensor6 (60000, 28, 28) 44.86M
Tensor7 (60000,) 469.00K
Tensor8 (10000, 28, 28) 7.48M
Tensor9 (10000,) 78.50K
Tensor10 (512, 1, 28, 28) 1.53M
Tensor11 (512,) 4.00K
Tensor12 (512, 1, 28, 28) 1.53M
Tensor13 (512,) 4.00K
Tensor14 (512,) 4.00K
Tensor15 (512, 1, 28, 28) 1.53M
Tensor16 (512,) 4.00K
Tensor17 (512, 1, 28, 28) 1.53M
-------------------------------------------------------------------------------
Total Tensors: 56557680 Used Memory: 59.01M
-------------------------------------------------------------------------------
With the customized linear layer:
========= before backward =========
Element type Size Used MEM
-------------------------------------------------------------------------------
Storage on cuda:0
Tensor0 (1,) 512.00B
Parameter1 (10, 10000) 391.00K
Parameter2 (10,) 512.00B
Tensor3 (512, 10000) 19.53M
Tensor4 (512,) 4.00K
Tensor5 (512, 10) 20.00K
Tensor6 (64, 100) 25.00K
Tensor7 (10000, 100) 3.81M
Tensor8 (512, 64) 128.00K
-------------------------------------------------------------------------------
Total Tensors: 6264811 Used Memory: 23.90M
The allocated memory on cuda:0: 24.06M
Memory differs due to the matrix alignment or invisible gradient buffer tensors
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Storage on cpu
Tensor9 (10000, 28, 28) 7.48M
Tensor10 (10000,) 78.50K
Tensor11 (512, 1, 28, 28) 1.53M
Tensor12 (512,) 4.00K
Tensor13 (512, 1, 28, 28) 1.53M
Tensor14 (512,) 4.00K
Tensor15 (512,) 4.00K
Tensor16 (512, 1, 28, 28) 1.53M
Tensor17 (512,) 4.00K
Tensor18 (512, 1, 28, 28) 1.53M
Tensor19 (60000, 28, 28) 44.86M
Tensor20 (60000,) 469.00K
-------------------------------------------------------------------------------
Total Tensors: 56557680 Used Memory: 59.01M
-------------------------------------------------------------------------------
========= after backward =========
Element type Size Used MEM
-------------------------------------------------------------------------------
Storage on cuda:0
Parameter1 (10, 10000) 391.00K
Parameter1.grad (10, 10000) 391.00K
Parameter2 (10,) 512.00B
Parameter2.grad (10,) 512.00B
Tensor3 (512, 10000) 19.53M
Tensor4 (512,) 4.00K
Tensor5 (512, 10) 20.00K
Tensor6 (64, 100) 25.00K
Tensor7 (10000, 100) 3.81M
Tensor8 (512, 64) 128.00K
Tensor0 (1,) 512.00B
-------------------------------------------------------------------------------
Total Tensors: 6364821 Used Memory: 24.28M
The allocated memory on cuda:0: 24.44M
Memory differs due to the matrix alignment or invisible gradient buffer tensors
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Storage on cpu
Tensor9 (10000, 28, 28) 7.48M
Tensor10 (10000,) 78.50K
Tensor11 (512, 1, 28, 28) 1.53M
Tensor12 (512,) 4.00K
Tensor13 (512, 1, 28, 28) 1.53M
Tensor14 (512,) 4.00K
Tensor15 (512,) 4.00K
Tensor16 (512, 1, 28, 28) 1.53M
Tensor17 (512,) 4.00K
Tensor18 (512, 1, 28, 28) 1.53M
Tensor19 (60000, 28, 28) 44.86M
Tensor20 (60000,) 469.00K
-------------------------------------------------------------------------------
Total Tensors: 56557680 Used Memory: 59.01M
-------------------------------------------------------------------------------
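What puzzles me most: the compressed factors are indeed small (Tensor6 + Tensor7 + Tensor8 ≈ 25K + 3.81M + 128K ≈ 3.96M), yet the 19.53M input Tensor3 (512, 10000) is still resident on cuda:0 even after backward. The only explanation I can come up with is that some reference to the input survives outside forward. Here is the sketch I used to look for such references (gc.get_referrers is standard Python, the rest is plain torch; just my debugging attempt, not a diagnosis):

import gc
import torch

x = torch.randn(512, 10000, device='cuda')
y = x.mm(torch.randn(10000, 10, device='cuda'))
# Anything besides the current frame that shows up here keeps x's storage
# alive, e.g. a name in the caller or tensors saved by autograd
print(len(gc.get_referrers(x)))
before = torch.cuda.memory_allocated()
del x
torch.cuda.empty_cache()
# Drops by ~19.5 MB only if no other reference to x remains
print((before - torch.cuda.memory_allocated()) / 1024 ** 2)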