3D CNN- error: Torch: not enough memory: you tried to allocate 166GB. Buy new RAM

Anna_yah · October 11, 2020, 6:59am

I tried to create 3d CNN using Pytorch. the following code works with 5 images but does not work with 336 images.

The error is: RuntimeError: $ Torch: not enough memory: you tried to allocate 166GB. Buy new RAM! at /opt/conda/conda-bld/pytorch_1550813258230/work/aten/src/TH/THGeneral.cpp:201

Can anyone help me please ?

    def __init__(self):
        super(CNNModel, self).__init__()
        
        self.conv_layer1 = self._conv_layer_set(3, 32) 
                                                      
        self.conv_layer2 = self._conv_layer_set(32, 64)
        self.fc1 = nn.Linear(64*28*28*28, 2)
        self.fc2 = nn.Linear(1404928, num_classes)
        self.relu = nn.LeakyReLU()
        self.batch=nn.BatchNorm1d(2)
        self.drop=nn.Dropout(p=0.15, inplace = True)   
        
    def _conv_layer_set(self, in_c, out_c):
        conv_layer = nn.Sequential(
        nn.Conv3d(in_c, out_c, kernel_size=(3, 3, 3), padding=0),
        nn.LeakyReLU(),
        nn.MaxPool3d((2, 2, 2)),
        )
        return conv_layer
    

    def forward(self, x):
        # Set 1
        out = self.conv_layer1(x)
        out = self.conv_layer2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.batch(out)
        out = self.drop(out)
        out = F.softmax(out, dim=1)
        return out

#Definition of hyperparameters
n_iters = 2
num_epochs = 2
# Create CNN
model = CNNModel()
#model.cuda()
print(model)
# Cross Entropy Loss 
error = nn.CrossEntropyLoss()
# SGD Optimizer
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

...

#train
#Definition of hyperparameters
n_iters = 2
num_epochs = 2
loss_list_train = []
accuracy_list_train = []

for epoch in range(num_epochs):
    #for i in range(x_train1.shape[0]):
    training_data = torch.Tensor(training_data)
    targets = torch.Tensor(targets) 
    training_data = Variable(training_data.view(336,3,120,120,120))
    labels = targets
    # Clear gradients
    optimizer.zero_grad()
    # Forward propagation
    outputs = model(training_data)
    # Calculate softmax and ross entropy loss
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracyCalc(predicted, targets)
    #labels = labels.tolist()
    #outputs = outputs.tolist()
    labels = labels.long() 
    labels=labels.view(-1) 
    loss = nn.CrossEntropyLoss()
    loss = loss(outputs, labels)    
    # Calculating gradients
    loss.backward()
    # Update parameters
    optimizer.step()
    loss_list_train.append(loss.data)
    accuracy_list_train.append(accuracy)
    print('Iteration: {}/{}  Loss: {}  Accuracy: {} %'.format(epoch+1,  num_epochs, loss.data, accuracy))

BramVanroy · October 11, 2020, 8:12am

Well yes, that’s. Normal. That’s what batch sizes are for: generally speaking you can’t do a forward pass with your full dataset, so you chop it up into batches.

Anna_yah · October 11, 2020, 12:29pm

I tried to modify the code but it doesn’t work (it only works 3 times so with 9 images)

#Definition of hyperparameters
n_iters = 2
num_epochs = 2
loss_list_train = []
accuracy_list_train = []
for epoch in range(num_epochs): 
    outputs = []
    for fold in range(0, len(training_data), 3):
        xtrain = training_data[fold : fold+3]
        #for i in range(x_train1.shape[0]):
        xtrain = torch.Tensor(xtrain)  
        xtrain = Variable(xtrain.view(3,3,120,120,120))
        # Clear gradients
        optimizer.zero_grad()
        # Forward propagation
        v = model(xtrain)
        outputs.append(v)
        # Calculate softmax and ross entropy loss
    targets = torch.Tensor(targets)
    labels = targets
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracyCalc(predicted, targets)
    #labels = labels.tolist()
    #outputs = outputs.tolist()
    labels = labels.long() 
    labels=labels.view(-1) 
    loss = nn.CrossEntropyLoss()
    loss = loss(outputs, labels)    
    # Calculating gradients
    loss.backward()
    # Update parameters
    optimizer.step()
    loss_list_train.append(loss.data)
    accuracy_list_train.append(accuracy)
    print('Iteration: {}/{}  Loss: {}  Accuracy: {} %'.format(epoch+1,  num_epochs, loss.data, accuracy))

BramVanroy · October 11, 2020, 1:48pm

outputs.append(v)

This should probably be

outputs.append(v.detach())

Anna_yah · October 11, 2020, 2:25pm

I tried with ʻoutputs.append (v.detach ()) ` but the problem is:

TypeError                                 Traceback (most recent call last)
<ipython-input-3-051c943b10b7> in <module>
    180     targets = torch.Tensor(targets)
    181     labels = targets
--> 182     _, predicted = torch.max(outputs, 1)
    183     accuracy = accuracyCalc(predicted, targets)
    184     #labels = labels.tolist()

TypeError: max() received an invalid combination of arguments - got (list, int), but expected one of:
 * (Tensor input)
 * (Tensor input, Tensor other, Tensor out)
 * (Tensor input, int dim, bool keepdim, tuple of Tensors out)

Anna_yah · October 11, 2020, 6:44pm

I solved it, but there is another error message.
I think that the problem is with detach() and loss.backward()

#Definition of hyperparameters
n_iters = 2
num_epochs = 2
loss_list_train = []
accuracy_list_train = []
for epoch in range(num_epochs): 
    outputs = []
    outputs= torch.tensor(outputs)
    for fold in range(0, len(training_data), 3):
        xtrain = training_data[fold : fold+3]
        xtrain = torch.Tensor(xtrain)  
        xtrain = Variable(xtrain.view(3,3,120,120,120))
        # Clear gradients
        optimizer.zero_grad()
        # Forward propagation
        v = model(xtrain)
        outputs = torch.cat((outputs,v.detach()),dim=0)
        # Calculate softmax and ross entropy loss
    targets = torch.Tensor(targets)
    labels = targets
    outputs = torch.Tensor(outputs)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracyCalc(predicted, targets)
    labels = labels.long() 
    labels=labels.view(-1)
    loss = nn.CrossEntropyLoss()
    loss = loss(outputs, labels)    
    # Calculating gradients
    loss.backward()
    # Update parameters
    optimizer.step()
    loss_list_train.append(loss.data)
    accuracy_list_train.append(accuracy)
    print('Iteration: {}/{}  Loss: {}  Accuracy: {} %'.format(epoch+1,  num_epochs, loss.data, accuracy))

Result :

RuntimeError                              Traceback (most recent call last)
<ipython-input-2-73901ba48e8b> in <module>
    195     loss = loss(outputs, labels)
    196     # Calculating gradients
--> 197     loss.backward()
    198     # Update parameters
    199     optimizer.step()

/opt/tljh/user/envs/fethi_env/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    100                 products. Defaults to ``False``.
    101         """
--> 102         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    103 
    104     def register_hook(self, hook):

/opt/tljh/user/envs/fethi_env/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     88     Variable._execution_engine.run_backward(
     89         tensors, grad_tensors, retain_graph, create_graph,
---> 90         allow_unreachable=True)  # allow_unreachable flag
     91 
     92 

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

BramVanroy · October 12, 2020, 7:05am

You should probably move all your loss calculation, optimizer step, etc. inside your data loop. In most scenarios you want to run one loss backward (and optimizer step) after each model forward.

Anna_yah · October 12, 2020, 3:31pm

Thank you so much