CUDA out of memory remains with SINGLE-MACHINE MODEL PARALLEL

Hello guys. I hope you can help me with this.
I have been training my CNN model with an input of size (32, 4, 9, 3, 224, 224), i.e. (batch, a 4 × 9 × 3 [m × n × 3] matrix, height, width).

First I reduced the batch size from 64 to 32 and then to 16, but the error remains. Then I decided to use DataParallel and model parallelism, but the result is the same.

I have two GPUs with the following characteristics:

the model:

import torch
import torchvision # torch package for vision related things
import torch.nn.functional as F  # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms  # Transformations we can perform on our dataset for augmentation
from torch import optim  # For optimizers like SGD, Adam, etc.
from torch import nn  # All neural network modules
from torch.utils.data import DataLoader  # Gives easier dataset managment by creating mini batches etc.
from tqdm import tqdm  # For nice progress bar!

class CNNskl(nn.Module):
    """Small three-stage CNN whose layers are spread over two devices.

    The network consumes a 6-D tensor ``(b, n, m, z, H, W)``. The ``n * m``
    images of every sample are folded into the batch dimension, pushed through
    three conv/ReLU/max-pool stages, reduced to one 32-dimensional descriptor
    per image, averaged over the ``n * m`` images and classified by two linear
    layers.

    Memory note: ``conv1`` runs on ``b * n * m`` images and emits 128 channels
    of size ``(H + 2) x (W + 2)``. For a float32 input of shape
    ``(32, 4, 9, 3, 224, 224)`` that single output already takes
    ``1152 * 128 * 226 * 226 * 4`` bytes (about 28 GiB), so the batch size has
    to be chosen with the ``n * m`` multiplier in mind.

    Args:
        in_channels: number of channels ``z`` of every input image.
        num_classes: number of output logits.
        devices: optional pair ``(first, second)`` of devices (strings or
            ``torch.device``) used for the layer placement. When ``None`` the
            layers alternate between ``cuda:0`` and ``cuda:1`` if two GPUs are
            visible, otherwise everything is placed on the single available
            device (``cuda:0`` or the CPU).
    """

    def __init__(self, in_channels=3, num_classes=120, devices=None):
        super(CNNskl, self).__init__()
        first_dev, second_dev = self._pick_devices(devices)

        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=128,
            kernel_size=3,
            stride=1,
            padding=2,
        ).to(first_dev)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(
            in_channels=128,
            out_channels=64,
            kernel_size=(3, 3),
            stride=1,
            padding=2,
        ).to(second_dev)

        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        self.conv3 = nn.Conv2d(
            in_channels=64,
            out_channels=32,
            kernel_size=3,
            stride=2,
            padding=2,
        ).to(first_dev)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2)
        self.fc1 = nn.Linear(32, 200).to(second_dev)
        self.fc2 = nn.Linear(200, num_classes).to(first_dev)
        self.dropout = nn.Dropout(0.5)

    @staticmethod
    def _pick_devices(devices):
        """Return the two devices the layers are distributed over."""
        if devices is not None:
            first_dev, second_dev = devices
            return torch.device(first_dev), torch.device(second_dev)
        if not torch.cuda.is_available():
            return torch.device('cpu'), torch.device('cpu')
        if torch.cuda.device_count() < 2:
            return torch.device('cuda:0'), torch.device('cuda:0')
        return torch.device('cuda:0'), torch.device('cuda:1')

    def forward(self, x):
        """Return class logits of shape ``(b, num_classes)``.

        Args:
            x: tensor of shape ``(b, n, m, z, H, W)``.
        """
        b, n, m, z, H, W = x.shape
        # Fold the n * m images of every sample into the batch dimension.
        x = x.reshape(b * n * m, z, H, W)

        # Every stage moves its input to wherever that layer's weights live, so
        # the forward pass stays valid even if the caller relocated the module
        # (e.g. with ``model.to(device)``). The in-place ReLU avoids keeping a
        # second copy of the (very large) convolution outputs.
        x = F.relu(self.conv1(x.to(self.conv1.weight.device)), inplace=True)
        x = self.maxpool1(x)
        x = F.relu(self.conv2(x.to(self.conv2.weight.device)), inplace=True)
        x = self.maxpool2(x)
        x = F.relu(self.conv3(x.to(self.conv3.weight.device)), inplace=True)
        x = self.maxpool3(x)

        # Global average pooling over the spatial dimensions: fc1 expects
        # exactly the 32 channels produced by conv3, whereas flattening the
        # whole feature map would yield 32 * h * w features.
        x = x.mean(dim=(2, 3))
        # Average the per-image descriptors of every sample: (b, 32).
        x = x.reshape(b, n * m, -1).mean(dim=1)

        x = self.fc1(x.to(self.fc1.weight.device))
        # Regularise the hidden representation; dropout on the final logits
        # would randomly zero class scores during training.
        x = self.dropout(x)
        x = self.fc2(x.to(self.fc2.weight.device))
        return x

training script:

import os
import argparse
import torch
import torch.nn as nn
from torch.backends import cudnn
from torch.utils import data
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from Models.PretrainedModel import Model3DResnet
from Models.ModelCnn import CNNskl
from NTU_RGBD_120_SkeletonDataset import SkeletonDataset as SkeletonDataset
import matplotlib.pyplot as plt
import pandas as pd
import time
import copy

class AverageMeter:
    """Keeps the most recent value of a metric and its running weighted mean.

    Attributes:
        val: value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` (e.g. the batch size)."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracy, in percent, for every k in ``topk``.

    Args:
        output: score tensor; the classes are ranked along dimension 1.
        target: tensor with one class index per row of ``output``.
        topk: iterable of the k values to evaluate.

    Returns:
        A list with one 0-dim float tensor per entry of ``topk``.
    """
    largest_k = max(topk)
    n_samples = target.size(0)

    # Shape (largest_k, batch): row r holds each sample's (r + 1)-th best class.
    ranked = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
    hits = ranked.eq(target.view(1, -1).expand_as(ranked))

    scale = 100.0 / n_samples
    return [hits[:k].reshape(-1).float().sum(0).mul_(scale) for k in topk]

# Plot the training and validation loss curves
def plot_loss(v_train_loss, v_val_loss, namefile: str):
    """Plot the training and validation loss per epoch, save and show the figure.

    The x axis is derived from the length of each history, so the function
    works for any number of epochs (it used to assume exactly 200).

    Args:
        v_train_loss: sequence with one training loss value per epoch.
        v_val_loss: sequence with one validation loss value per epoch.
        namefile: output path without extension; ``.png`` is appended.
    """
    # NOTE(review): the values are rounded to whole numbers before plotting,
    # which flattens every loss below 0.5 to 0 - confirm this is intended.
    v_train_loss_rounded = [round(element, 0) for element in v_train_loss]
    v_val_loss_rounded = [round(element, 0) for element in v_val_loss]

    # Epochs are numbered from 1.
    train_epochs = list(range(1, len(v_train_loss_rounded) + 1))
    val_epochs = list(range(1, len(v_val_loss_rounded) + 1))

    plt.plot(train_epochs, v_train_loss_rounded)
    plt.plot(val_epochs, v_val_loss_rounded)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Loss")
    plt.legend(["Training Loss", "Validation Loss"], loc=1)
    plt.savefig(namefile + '.png')
    plt.show()

# def save_loss_accuracy(dic_value, path_file):
#     dataframe = pd.DataFrame(dic_value.items(), columns=['epochs', 'values'])
#     dataframe.to_csv(path_file, sep=',', index=True)

def plot_accurracy(v_train_accuracy, v_val_accuracy, namefile: str):
    """Plot the training and validation accuracy per epoch, save and show it.

    The x axis is derived from the length of each history, so the function
    works for any number of epochs (it used to assume exactly 200).

    Args:
        v_train_accuracy: sequence with one training accuracy per epoch.
        v_val_accuracy: sequence with one validation accuracy per epoch.
        namefile: output path without extension; ``.png`` is appended.
    """
    # Accuracies are rounded to whole numbers before plotting.
    v_train_accuracy_rounded = [round(element, 0) for element in v_train_accuracy]
    v_val_accuracy_rounded = [round(element, 0) for element in v_val_accuracy]

    # Epochs are numbered from 1.
    train_epochs = list(range(1, len(v_train_accuracy_rounded) + 1))
    val_epochs = list(range(1, len(v_val_accuracy_rounded) + 1))

    plt.plot(train_epochs, v_train_accuracy_rounded)
    plt.plot(val_epochs, v_val_accuracy_rounded)
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title("Accuracy")
    plt.legend(["Training Accuracy", "Validation Accuracy"], loc=1)
    plt.savefig(namefile + '.png')
    plt.show()

def save_loss_accuracy(dic_value, path_file):
    """Write the key/value pairs of ``dic_value`` to a CSV file.

    Each pair becomes one row with the key in the ``epochs`` column and the
    value in the ``values`` column; the row index is written as well.

    Args:
        dic_value: mapping whose items are exported.
        path_file: destination path of the CSV file.
    """
    rows = list(dic_value.items())
    table = pd.DataFrame(rows, columns=['epochs', 'values'])
    table.to_csv(path_file, sep=',', index=True)

# This function runs the training. It was introduced because of the GPU
# memory problem.
def train_model(model, criterion, optimizer, num_epochs, train_loader, val_loader, device):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    Each epoch consists of a "train" phase (gradients enabled, optimizer
    stepped after every batch) and a "val" phase. The validation phase runs
    with autograd disabled so that no activations are kept for a backward pass
    that never happens, which noticeably lowers GPU memory usage. Loss, top-1
    and top-3 accuracy are tracked separately per phase and printed per batch.

    Args:
        model: network to optimise; called as ``model(datas)``.
        criterion: loss function called as ``criterion(pred, labels)``.
        optimizer: optimizer updating the parameters of ``model``.
        num_epochs: number of passes over ``train_loader``.
        train_loader: iterable yielding ``(datas, labels)`` training batches.
        val_loader: iterable yielding ``(datas, labels)`` validation batches.
        device: device the input batches are moved to.

    Returns:
        ``model``, with the weights of the epoch that reached the best
        validation top-1 accuracy loaded.
    """
    start_time = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    loaders = {"train": train_loader, "val": val_loader}

    for epoch in range(num_epochs):
        for phase in ("train", "val"):
            is_train = phase == "train"
            v_loader = loaders[phase]
            model.train(is_train)

            # Fresh meters for every phase so that training and validation
            # statistics are not mixed together.
            v_top1 = AverageMeter()
            v_top3 = AverageMeter()
            v_loss = AverageMeter()

            for i, (datas, labels) in enumerate(v_loader):
                datas = datas.to(device)

                with torch.set_grad_enabled(is_train):
                    # get the prediction scores and calculate the loss
                    pred = model(datas)
                    # The targets must live where the predictions are; the
                    # labels are shifted down by one (presumably they are
                    # 1-based in the dataset - TODO confirm).
                    labels = labels.to(pred.device) - 1
                    loss = criterion(pred, labels)

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # calculate loss and acc
                prec1, prec3 = accuracy(pred.detach(), labels, topk=(1, 3))
                v_loss.update(loss.item(), datas.size(0))
                v_top1.update(prec1.item(), datas.size(0))
                v_top3.update(prec3.item(), datas.size(0))

                # print a message tagged with the phase that produced it
                print("[{}] epochs: {}/{} batch: {}/{} loss: {:.4f}({:.4f}) proc@1: {:.4f}({:.4f}) proc@3: {:.4f}({:.4f})".format(
                    phase, epoch, num_epochs, i, len(v_loader), v_loss.val, v_loss.avg,
                    v_top1.val, v_top1.avg, v_top3.val, v_top3.avg,
                ))

            # Remember the weights of the best validation epoch.
            if not is_train and v_top1.avg > best_acc:
                best_acc = v_top1.avg
                best_model_wts = copy.deepcopy(model.state_dict())

    elapsed = time.time() - start_time
    print("Training complete in {:.0f}m {:.0f}s, best val proc@1: {:.4f}".format(
        elapsed // 60, elapsed % 60, best_acc,
    ))

    model.load_state_dict(best_model_wts)
    return model




if __name__ == "__main__":
    # Command-line options of the training run.
    parser = argparse.ArgumentParser(description="action based skeleton args")
    parser.add_argument("--arraypath", default="Datasets/NTU/Skeleton/imagerepresentation/spatioVelo/", type=str)
    parser.add_argument("--crossactioncsvpath", default="dataloaders/Labels/cross_action/", type=str)
    parser.add_argument("--crossviewcsvpath", default="dataloaders/Labels/cross_view/", type=str)
    parser.add_argument("--sequencelength", default=4, type=int)
    parser.add_argument("--epochs", default=200, type=int)
    parser.add_argument("--batchsize", default=8, type=int)
    parser.add_argument("--lr", default=1e-3, type=float)
    parser.add_argument("--gpu_number", default=0, type=int)

    # print args
    args = parser.parse_args()
    print(args)

    # Device the input batches are sent to.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Prepare the loader
    train = SkeletonDataset(array_path=args.arraypath, csv_path=args.crossactioncsvpath + "cross_action_train.csv")
    val = SkeletonDataset(array_path=args.arraypath, csv_path=args.crossactioncsvpath + "cross_action_val.csv")
    train_loader = DataLoader(train, batch_size=args.batchsize, shuffle=True, num_workers=4)
    val_loader = DataLoader(val, batch_size=args.batchsize, shuffle=False, num_workers=4)

    # get a model
    #model = Model3DResnet(num_classes = 120, output_layer = 4)
    model = CNNskl()
    # if torch.cuda.device_count()>1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = nn.DataParallel(model)

    # NOTE: deliberately no ``model.to(device)`` here. CNNskl assigns its
    # layers to their devices in its constructor, and ``nn.Module.to`` would
    # move every parameter onto ``device``, undoing that model-parallel
    # placement. (A single-device model such as Model3DResnet would still
    # need the call.)

    # get a optimizer and scheduler
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9)
    # NOTE(review): the scheduler is created but never stepped, because it is
    # not handed to train_model.
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)

    #Training the model
    train_model(model, criterion, optimizer, args.epochs, train_loader, val_loader, device)

    #Accuracy and loss plot

    # save_loss_accuracy(dic_pltloss_train, "model_save/loss_train_accuracy_crossubject.csv")
    # save_loss_accuracy(dic_pltaccuracy_train1, "model_save/accuracy_train1_crossubject.csv")
    # save_loss_accuracy(dic_pltaccuracy_train3, "model_save/accuracy_train3_crossubject.csv")

    # save_loss_accuracy(dic_pltloss_val, "model_save/loss_val_accuracy_crossubject.csv")
    # save_loss_accuracy(dic_pltaccuracy_val1, "model_save/accuracy_val1_crossubject.csv")
    # save_loss_accuracy(dic_pltaccuracy_val3, "model_save/accuracy_val3_crossubject.csv")

    # plot_loss(pltloss_train, pltloss_val, namefile = "loss")
    # plot_accurracy(pltaccuracy_train1, pltaccuracy_val1, namefile = "accuracy_1")
    # plot_accurracy(pltaccuracy_train3, pltaccuracy_val3, namefile = "accuracy_3")

I am still getting the same error:

Namespace(arraypath='Datasets/NTU/Skeleton/imagerepresentation/spatioVelo/', batchsize=32, crossactioncsvpath='dataloaders/Labels/cross_action/', crossviewcsvpath='dataloaders/Labels/cross_view/', epochs=200, gpu_number=0, lr=0.001, sequencelength=4)
Traceback (most recent call last):
  File "Train.py", line 182, in <module>
    train_model(model, criterion, optimizer, args.epochs, train_loader, val_loader, device)
  File "Train.py", line 120, in train_model
    pred = model(datas)
  File "/home/coco/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/media/coco/coco_dev/My_project/Skeleton_image_representation/Models/ModelCnn.py", line 54, in forward
    x = F.relu(self.conv1(x.to("cuda:0")))
  File "/home/coco/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/coco/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 399, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/coco/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 395, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 28.06 GiB (GPU 0; 10.75 GiB total capacity; 662.97 MiB already allocated; 852.38 MiB free; 664.00 MiB reserved in total by PyTorch)

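I cannot see any obvious issues in the posted code, so you might want to reduce the batch size even further to find the maximum size that fits on your devices. Note that the forward pass reshapes the input to b·n·m images, so a batch of 32 sends 32·4·9 = 1152 images of 224×224 through conv1; its float32 output of shape (1152, 128, 226, 226) alone needs 1152·128·226·226·4 bytes ≈ 28.06 GiB, which is exactly the allocation reported in the error.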