CUDA out of memory, but batch size is equal to one

Hi all, I don't know why I run out of memory (with the 11 GiB of an NVIDIA GeForce 1080 Ti).
The module with my net is this:

import torch.nn as nn
from torchvision.models import resnet50

class Encoder(nn.Module):
    """ResNet-50 backbone whose classifier head is replaced by an MLP
    funnel that compresses the pooled features down to a 64-d code."""

    def __init__(self):
        super(Encoder, self).__init__()
        # Pretrained backbone; only the fully-connected head is swapped out.
        self.model = resnet50(pretrained = True)
        # Funnel widths: <fc.in_features> -> 1024 -> 512 -> 256 -> 128 -> 64,
        # with a ReLU after every Linear (including the last one).
        widths = [self.model.fc.in_features, 1024, 512, 256, 128, 64]
        layers = []
        for w_in, w_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(w_in, w_out))
            layers.append(nn.ReLU())
        self.model.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 64-d code for a batch of input images."""
        return self.model(x)

# Module-level encoder instance; final_model below wires it into its
# AutoEncoder branch.
encoder = Encoder()

class AutoEncoder(nn.Module):
    """Autoencoder: wraps an external encoder producing a 64-d code and
    decodes it back to a 3-channel image through upsampling convolutions.

    NOTE(review): the decoder is extremely memory-hungry — the combined
    upsampling factor is 5 * 5 * 2.5 = 62.5 per side, and the late layers
    carry 1024/2048 channels at that large spatial resolution. This is the
    likely cause of CUDA OOM even at batch size 1; consider shrinking the
    channel counts after each upsampling stage.
    """

    def __init__(self, enc):
        super(AutoEncoder, self).__init__()
        # Encoder is built elsewhere and passed in (expected to emit a
        # (batch, 64) code — TODO confirm against the caller).
        self.encoder = enc
        self.decoder = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),

            nn.Upsample(scale_factor=5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.Upsample(scale_factor=5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.Upsample(scale_factor=2.5, mode='bicubic'),
            nn.ReLU(),
            nn.Conv2d(512, 1024, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 2048, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(2048, 3, 3, padding=1)
        )

    def forward(self, x):
        """Return (code, reconstruction) for a batch of inputs."""
        code = self.encoder(x)
        # FIX: the original hard-coded view(1, 1, 8, 8), which silently
        # breaks (or mixes samples) for any batch size other than 1.
        # Reshape each 64-d code into a 1x8x8 map, preserving the batch dim.
        code2 = code.view(code.size(0), 1, 8, 8)
        reconstructed = self.decoder(code2)
        return code, reconstructed

class MovementOrientation(nn.Module):
    """Linear head mapping the 64-d code to 4 outputs: columns [:2] are
    movement, columns [2:4] are orientation (per the training loop)."""

    def __init__(self):
        super(MovementOrientation, self).__init__()
        # Single linear layer; kept inside a Sequential so state_dict keys
        # stay stable. (A trailing ReLU was tried and disabled.)
        self.mov_orie = nn.Sequential(nn.Linear(64, 4))

    def forward(self, x):
        """Predict the 4 movement/orientation values from the code."""
        return self.mov_orie(x)


class final_model(nn.Module):
    """Multi-head model: an autoencoder branch that reconstructs the input
    plus a head predicting movement/orientation from the shared code."""

    def __init__(self):
        super(final_model, self).__init__()
        # NOTE: relies on the module-level `encoder` instance defined above.
        self.AutoEncoder = AutoEncoder(encoder)
        self.MovementOrientation = MovementOrientation()

    def forward(self, x):
        # The 64-d code feeds both the decoder and the prediction head.
        code, reconstruction = self.AutoEncoder(x)
        prediction = self.MovementOrientation(code)
        return code, reconstruction, prediction


The module with my training code is this:

import torch
import torch.nn as nn
from torch.optim import Adam, SGD
import numpy as np
from dataset import Dataset
from torch.utils.data import DataLoader
from torchnet.meter import AverageValueMeter
from torchnet.logger import VisdomPlotLogger, VisdomSaver
from nets import final_model


def train(model, train_loader, valid_loader, exp_name = "prova",  lr=0.000001, epochs=1000, wd = 0.00001):
    """Train `model` jointly on image reconstruction (AE) and
    movement/orientation regression, logging loss curves to Visdom.

    Args:
        model: network returning (code, reconstruction, movement_orientation).
        train_loader / valid_loader: loaders yielding dicts with keys
            "image", "image_intensity" and "Movement" (columns [:2] =
            movement, [2:4] = orientation) — assumed schema, confirm in dataset.py.
        exp_name: Visdom environment name and checkpoint file prefix.
        lr, wd: Adam learning rate and weight decay.
        epochs: number of epochs to run.

    Returns:
        The trained model. Weights are saved to '<exp_name>.pth' every epoch
        and the best-on-validation weights to 'Best_ <exp_name>.pth'.
    """
    # Training loss: plain MSE shared by all three objectives.
    criterion = nn.MSELoss()
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=wd)

    # Meters: per-epoch weighted running averages of each loss term.
    lossAE_meter = AverageValueMeter()
    lossXZ_meter = AverageValueMeter()
    lossUV_meter = AverageValueMeter()
    total_loss_meter = AverageValueMeter()

    # device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)

    model.to(device)

    loader = {"train": train_loader, "test": valid_loader}

    loss_AE_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossAE', 'legend': ['train', 'test']})
    loss_XZ_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossXZ', 'legend': ['train', 'test']})
    loss_UV_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'LossUV', 'legend': ['train', 'test']})
    total_loss_logger = VisdomPlotLogger('line', env=exp_name, opts={'title': 'Total Loss', 'legend': ['train', 'test']})

    visdom_saver = VisdomSaver(envs=[exp_name])

    last_best_loss = np.inf
    for e in range(epochs):
        for mode in ["train", "test"]:

            lossAE_meter.reset()
            lossXZ_meter.reset()
            lossUV_meter.reset()
            total_loss_meter.reset()

            if mode == "train":
                model.train()
            else:
                model.eval()

            # Gradients only in training; disabling them in eval also keeps
            # GPU memory low (no graph is retained for validation batches).
            with torch.set_grad_enabled(mode == "train"):

                for i, batch in enumerate(loader[mode]):

                    x = batch["image"].to(device)
                    Movement = batch['Movement'][:, :2].float().to(device)
                    Orientation = batch['Movement'][:, 2:4].float().to(device)

                    target = batch["image_intensity"].to(device)  # image to reconstruct

                    code, reconstructed, MovementOrientation = model(x)

                    # Split the 4 predicted values into the two heads' targets.
                    out1, out2 = MovementOrientation[:, :2], MovementOrientation[:, 2:4]

                    lossAE = criterion(reconstructed,target)
                    lossXZ = criterion(out1,Movement)
                    lossUV = criterion(out2, Orientation)

                    l = lossAE + lossXZ + lossUV
                    if mode == "train":
                        optimizer.zero_grad()
                        l.backward()
                        optimizer.step()

                    n = x.shape[0]  # number of samples in the batch

                    # .item() detaches to plain floats so no graph-attached
                    # tensors are retained across iterations.
                    lossAE_meter.add(lossAE.item() * n, n)
                    lossXZ_meter.add(lossXZ.item() * n, n)
                    lossUV_meter.add(lossUV.item() * n, n)
                    # FIX: this meter was created and reset but never fed.
                    total_loss_meter.add(l.item() * n, n)

                    if mode == "train":
                        step = e + (i + 1) / len(loader[mode])
                        loss_AE_logger.log(step, lossAE_meter.value()[0], name=mode)
                        loss_XZ_logger.log(step, lossXZ_meter.value()[0], name=mode)
                        loss_UV_logger.log(step, lossUV_meter.value()[0], name=mode)
                        # FIX: the total-loss logger was created but never used.
                        total_loss_logger.log(step, total_loss_meter.value()[0], name=mode)

            # FIX: checkpoint on the epoch-average validation loss (a plain
            # float), not on a single-batch, graph-attached loss tensor as
            # before — that compared noisy per-batch values and kept a CUDA
            # tensor alive in `last_best_loss`.
            if mode == "test":
                epoch_val_loss = total_loss_meter.value()[0]
                if epoch_val_loss < last_best_loss:
                    torch.save(model.state_dict(), 'Best_ %s.pth' % exp_name)
                    last_best_loss = epoch_val_loss

            # Epoch-end logging for both modes. FIX: the original used the
            # leaked loop variable `i` here (NameError on an empty loader);
            # at loop end e + (i + 1) / len(...) equals e + 1 anyway.
            step = e + 1
            loss_AE_logger.log(step, lossAE_meter.value()[0], name=mode)
            loss_XZ_logger.log(step, lossXZ_meter.value()[0], name=mode)
            loss_UV_logger.log(step, lossUV_meter.value()[0], name=mode)
            total_loss_logger.log(step, total_loss_meter.value()[0], name=mode)

        visdom_saver.save()

        # Rolling checkpoint of the latest weights, once per epoch.
        torch.save(model.state_dict(), '%s.pth' % exp_name)

    return model



def start_all():
    """Build the multi-head model and data loaders, then run training.

    Returns:
        The trained model. (FIX: the original computed the result and
        silently discarded it.)
    """
    # Keep CPU thread usage bounded; DataLoader workers handle the rest.
    torch.set_num_threads(4)

    model = final_model()

    # Train / validation datasets share the same image folders and differ
    # only in their CSV index files.
    train_dataset = Dataset('./Dataset/Input','./Dataset/ToRec','./Dataset/Train.csv')
    valid_dataset = Dataset('./Dataset/Input','./Dataset/ToRec','./Dataset/Validation.csv')

    train_loader = DataLoader(train_dataset, batch_size=1,num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=1,num_workers=2)

    model_trained = train(model, train_loader, valid_loader, exp_name="MultiHead", epochs=300)
    return model_trained

If I have a batch size of one, how is it possible to run into CUDA out of memory?
For precision:
RuntimeError: CUDA out of memory. Tried to allocate 1.91 GiB (GPU 0; 10.92 GiB total capacity; 7.65 GiB already allocated; 698.12 MiB free; 1.82 GiB cached)

Hi,
Your network seems memory-greedy.
You have several ×5 upscalings combined with very many channels (1024–2048).

In case you really do need such a scaling factor (actually ×(5 · 5 · 2.5) = ×62.5 per side), you should think about reducing the number of channels.

Going from 1024 to 2048 channels means storing a 2048 × H × W activation map (plus 1024 × 2048 × 3 × 3 convolution weights) in floats, plus the extra memory required for backprop.