Out of memory with completely frozen model

Hello,

I have a problem. I load a model from PyTorch Lightning that is completely frozen. I don't use any optimizer, and my input data has requires_grad = False. If I increase the batch size from 1 to anything larger, I get a CUDA out-of-memory error, which I cannot explain, since nothing should need to be stored during the feed-forward pass.
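By "completely frozen" I mean Lightning's `freeze()`, which as far as I understand is roughly equivalent to the following (just a sketch, with a generic `model` standing in for my network):

```python
import torch.nn as nn

def freeze(model: nn.Module) -> None:
    """Roughly what LightningModule.freeze() does, to my understanding."""
    for p in model.parameters():
        p.requires_grad = False  # no gradients will be computed for any weight
    model.eval()                 # switch off dropout / use running batch-norm stats
```

Here is the relevant part of my training code: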

def trainingNet(noisy_filenames, noise_filenames, clean_filenames, debuggerMode):
    # dataloading
    data_train = Dataclass(noisy_filenames[:N_train], noise_filenames[:N_train], clean_filenames[:N_train], device)
    data_val = Dataclass(noisy_filenames[N_train:N_train + N_val], noise_filenames[N_train:N_train + N_val],
                         clean_filenames[N_train:N_train + N_val], device)

    test_path = os.path.join('data', 'DNS testset', 'test_set', 'synthetic', 'no_reverb')
    noisy_filenames, _, clean_filenames = readFilenames(test_path)
    data_test = ProcessedDataclass(noisy_filenames, clean_filenames, device=None, SR=SR, segment=duration_sample_sec)

    training_generator = DataLoader(data_train, **params, shuffle=True, drop_last=True)
    val_generator = DataLoader(data_val, **params, shuffle=False)
    test_generator = DataLoader(data_test, batch_size=1, shuffle=False)

    path_pretrained_model = os.path.join(os.getcwd(), 'runs', 'conformer16', 'sv', 'alpha_100')
    checkpoint_filename = os.path.join('checkpoints', 'epoch=23-step=485999.ckpt')

    #loading pretrained frozen model:
    net = lightning_model.DFConvNet()
    df_conformer = net.load_from_checkpoint(os.path.join(path_pretrained_model, checkpoint_filename))
    df_conformer.freeze()
    df_conformer.eval()


    hyperparameters = dict(in_channel = 1, out_channels=1, bias=False, num_layers=layer, num_stacks=stack, kernel_size=3,
                 residual_channels=128, gate_channels=128, skip_out_channels=128, last_channels=(2048, 256),
                           gin_channels=1, path_pretrained_model=path_pretrained_model)

    #loading to GPUs
    df_conformer = nn.DataParallel(df_conformer)
    df_conformer.to(device)

    # tensorboard
    if not debuggerMode:
        writer = SummaryWriter(folderpath)
    else:
        writer = None
    # training + validating and testing in the end
    best_model_name = ''
    min_loss = 1000  # set to something unrealistically high
    len_train_loss = 0
    for epoch in range(max_epochs):
        print(f"Epoch: {epoch}")
        # Training
        running_loss = 0
        running_loss_snr = 0
        loop = tqdm(enumerate(training_generator), total=len(training_generator))
        # nnet.train()
        for batch_idx, (noisy_data, clean_data, noise_data, _, _) in loop:
            # Feedforward and loss

            (estimate, sv_info) = df_conformer(noisy_data)
            estimate = rearr_channel(estimate)
            sp_info_repeated = sv_info.repeat(1, 1, int(lengthOfInputSignal // dim_sv))
            if pad_diff != 0:
                sp_info_repeated = pad(sp_info_repeated)
            sv = sp_info_repeated.transpose(0, 1)  # B,C,N

            l1 = si_snr(rearr_channel_reversed(estimate), clean_data)

            # statistics
            # running_loss += loss.item()
            running_loss_snr += l1.item()

            loop.set_postfix(rnloss_snr=running_loss_snr / (batch_idx + 1))
            # logs for plots
            if batch_idx % num_log_trainstep_per_batch == 0:
                train_loss.append((running_loss / (batch_idx + 1), running_loss_snr / (batch_idx + 1)))
        # validation loss
        len_train_loss = len(train_loss) if len_train_loss == 0 else len_train_loss
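To double-check that the loaded model really is frozen, a small helper like this (hypothetical, not part of my code) can be called right after `df_conformer.freeze()`:

```python
import torch.nn as nn

def assert_frozen(model: nn.Module) -> None:
    """Raise if any parameter is still trainable or the module is in train mode."""
    assert all(not p.requires_grad for p in model.parameters())
    assert not model.training  # .eval() / .freeze() should have set this to False

# usage: assert_frozen(df_conformer)
```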

And this is how I call this function:


if __name__ == "__main__":
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # env var, so CUDA errors are reported synchronously
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")
    torch.multiprocessing.set_start_method('spawn')


    # loss functions
    si_snr = SingleSrcNegSDR("sisdr", reduction='mean', zero_mean=False)
    mse = nn.MSELoss()
    l1loss = nn.modules.loss.L1Loss()

    # read filenames from data folder
    datasetpath = os.path.join('data', 'train_set_4slong')
    noisy_filenames, noise_filenames, clean_filenames = readFilenames(datasetpath)

    # Some parameters
    debuggerMode = False
    SR = 16000
    duration_sample_sec = 4  # sec
    lengthOfInputSignal = int(SR * duration_sample_sec)
    N = len(noisy_filenames)  # number of samples_from_ver1
    N_train = int(N * 0.1)
    N_val = int(N * 0.001)

    BS = 32
    num_log_trainstep = 50  # log the training loss every 50 training samples
    num_log_trainstep_per_batch = num_log_trainstep // BS  # i.e. log every num_log_trainstep_per_batch-th batch


    max_epochs = 1
    LEARNING_RATE = 0.0001
    stack = 3
    layer = 27
    dim_sv = 256

    new_pad_dim = int(lengthOfInputSignal // dim_sv) * dim_sv
    pad_diff = lengthOfInputSignal - new_pad_dim
    pad = nn.ConstantPad1d((0, pad_diff), 0)
    rearr_channel = Rearrange('(b c) n -> b c n', c=1)
    rearr_channel_reversed = Rearrange('b c n -> (b c) n')

    params = {'batch_size': BS,
              'num_workers': 1,
              'pin_memory': False}
    train_loss = []
    val_loss = []

    # for tensorboard and saves
    folderpath = ''
    if not debuggerMode:
        _, time_str = str(datetime.datetime.now()).split()  # avoid shadowing the datetime module
        time_str = time_str[:5].replace(':', '_')
        foldername = 'test'

        log = dict(
            SR=SR,
            duration=duration_sample_sec,
            stack=stack,
            layer=layer
        )

        folderpath = 'wavenet'
        folderpath = os.path.join(os.getcwd(), 'runs', folderpath, foldername, time_str)

        if not os.path.isdir(os.path.join(folderpath, 'model')):
            os.makedirs(os.path.join(folderpath, 'model'))

        path_to_log = os.path.join(folderpath, 'hyperparameter.yml')
        with open(path_to_log, 'w') as outfile:
            yaml.dump(log, outfile, default_flow_style=False)

        if not os.path.isdir(os.path.join(folderpath, 'samples')):
            os.makedirs(os.path.join(folderpath, 'samples'))
            os.makedirs(os.path.join(folderpath, 'samples', 'noisy'))
            os.makedirs(os.path.join(folderpath, 'samples', 'noise'))
            os.makedirs(os.path.join(folderpath, 'samples', 'clean'))
            os.makedirs(os.path.join(folderpath, 'samples', 'estimate'))

    trainingNet(noisy_filenames, noise_filenames, clean_filenames, debuggerMode)

I planned to use another model, "WavNet", for training. But since the code above already fails without WavNet, I didn't include the WavNet architecture here.
I don't see the problem: a completely frozen model shouldn't use much memory. I checked that the parameters of the df_conformer model are not trainable, and that is indeed the case. The precise error I get is:

**RuntimeError: CUDA out of memory. Tried to allocate 7.82 GiB (GPU 0; 10.76 GiB total capacity; 4.47 GiB already allocated; 3.15 GiB free; 4.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF**

on my RTX 2080 Ti with PyTorch 1.10.
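For reference: even with every parameter frozen and autograd disabled, the forward pass still has to materialize its intermediate activations on the GPU. A minimal, self-contained sketch that shows this (the toy model and shapes below are made up, not my actual network):

```python
import torch
import torch.nn as nn

# toy stand-in for a frozen network -- not my actual model
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()
for p in model.parameters():
    p.requires_grad_(False)
model.eval()

x = torch.randn(8, 4000, 1024, device="cuda")  # (batch, sequence, features)

torch.cuda.reset_peak_memory_stats()
with torch.no_grad():  # no autograd graph is kept ...
    y = model(x)
# ... but the intermediate activations still had to fit on the GPU
print(f"peak allocated: {torch.cuda.max_memory_allocated() / 1024 ** 3:.2f} GiB")
```

`torch.cuda.memory_summary()` gives a more detailed breakdown of where the allocations come from.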

Okay, I think I can answer my own question. The reason really was that the matrices created during the feed-forward pass are too large. I use attention, and my Q, K, V matrices (multi-headed self-attention) have shape (BS, heads=8, ~4000, 1024). A single one of these matrices is 9 GB! As you can see, three of them already exceed my available 11 GB.
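If it helps anyone, here is a quick back-of-the-envelope helper to estimate such activation sizes (just a sketch; float32 and the example batch size of 32 are assumptions, and the exact numbers depend on the real sequence length, batch size and dtype):

```python
import math
import torch

def dense_tensor_gib(shape, dtype=torch.float32):
    """Size of a dense tensor with the given shape and dtype, in GiB."""
    bytes_per_element = torch.finfo(dtype).bits // 8
    return math.prod(shape) * bytes_per_element / 1024 ** 3

# (BS, heads, sequence_length, feature_dim) -- plug in the real values
qkv_shape = (32, 8, 4000, 1024)
print(f"one attention tensor: {dense_tensor_gib(qkv_shape):.1f} GiB")
print(f"Q + K + V together:   {3 * dense_tensor_gib(qkv_shape):.1f} GiB")
```

Q, K and V all have to exist at the same time during the attention computation, so their sizes add up.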