torch.OutOfMemoryError: CUDA out of memory

torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 574.00 MiB. GPU 0 has a total capacity of 23.66 GiB of which 587.00 MiB is free. Including non-PyTorch memory, this process has 22.33 GiB memory in use. Of the allocated memory 17.67 GiB is allocated by PyTorch, and 4.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (CUDA semantics — PyTorch 2.6 documentation)

Hi everyone, I am using an NVIDIA RTX A5000 with 24 GB of memory. Currently I am on driver version 550.120, CUDA 12.1, cuDNN 9.1.0 (90100), and PyTorch 2.5.1. Can someone suggest how to fix the above issue?

You could reduce the memory requirement, e.g. by reducing the batch size, or check whether expandable_segments helps reduce memory fragmentation, as explained in the error message.
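
For reference, the expandable_segments option has to be in place before PyTorch's caching allocator makes its first GPU allocation, so the safest spot is the shell or the very top of the entry script. A minimal sketch (train.py is just a placeholder name):

import os

# Must be set before anything is allocated on the GPU;
# setting it before importing torch is the safest option.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

Equivalently, from the shell: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python train.py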

I have tried expandable_segments:True and am trying with batch size = 2. I am working on video data. I have also already tried batch size = 1. Any other suggestions, please?

I am running the following code on an NVIDIA RTX A5000 with 24 GB of memory. If I use batch size = 2, the CUDA out-of-memory issue occurs.

import os
import pdb
import sys
import copy
import torch
import numpy as np
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
from evaluation.slr_eval.wer_calculation import evaluate
from torch.cuda.amp import autocast as autocast
from torch.cuda.amp import GradScaler

def seq_train(loader, model, optimizer, device, epoch_idx, recoder):
    model.train()
    loss_value = []
    clr = [group['lr'] for group in optimizer.optimizer.param_groups]
    scaler = GradScaler()  # for mixed-precision (AMP) training
    for batch_idx, data in enumerate(tqdm(loader)):
        vid = device.data_to_device(data[0])
        vid_lgt = device.data_to_device(data[1])
        label = device.data_to_device(data[2])
        label_lgt = device.data_to_device(data[3])
        optimizer.zero_grad()
        with autocast():
            ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)
            loss = model.criterion_calculation(ret_dict, label, label_lgt)
        # Skip batches that produce a non-finite loss.
        if np.isinf(loss.item()) or np.isnan(loss.item()):
            print('loss is nan')
            # print(data[-1])
            print(str(data[1]) + ' frames')
            print(str(data[3]) + ' glosses')
            continue
        scaler.scale(loss).backward()
        scaler.step(optimizer.optimizer)
        scaler.update()
        # nn.utils.clip_grad_norm_(model.rnn.parameters(), 5)
        loss_value.append(loss.item())
        if batch_idx % recoder.log_interval == 0:
            recoder.print_log(
                '\tEpoch: {}, Batch({}/{}) done. Loss: {:.8f} lr:{:.6f}'
                .format(epoch_idx, batch_idx, len(loader), loss.item(), clr[0]))
        del ret_dict
        del loss
    optimizer.scheduler.step()
    recoder.print_log('\tMean training loss: {:.10f}.'.format(np.mean(loss_value)))
    return
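
To find out which batches drive the peak (with video data, unusually long clips are a common culprit), PyTorch's built-in memory statistics can be logged per iteration. A minimal sketch, where step_fn is a hypothetical callable wrapping one forward/backward step from seq_train above:

import torch
from tqdm import tqdm

def log_peak_memory(loader, step_fn, device=0):
    # step_fn(data) is a hypothetical wrapper around the body of
    # seq_train above; this logs the per-batch peak allocation.
    for batch_idx, data in enumerate(tqdm(loader)):
        torch.cuda.reset_peak_memory_stats(device)
        step_fn(data)
        peak_gib = torch.cuda.max_memory_allocated(device) / 1024 ** 3
        print(f"batch {batch_idx}: peak {peak_gib:.2f} GiB")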

def seq_eval(cfg, loader, model, device, mode, epoch, work_dir, recoder,
             evaluate_tool="python"):
    model.eval()
    total_sent = []
    total_info = []
    total_conv_sent = []
    stat = {i: [0, 0] for i in range(len(loader.dataset.dict))}
    for batch_idx, data in enumerate(tqdm(loader)):
        recoder.record_timer("device")
        vid = device.data_to_device(data[0])
        vid_lgt = device.data_to_device(data[1])
        label = device.data_to_device(data[2])
        label_lgt = device.data_to_device(data[3])
        with torch.no_grad():
            ret_dict = model(vid, vid_lgt, label=label, label_lgt=label_lgt)

        total_info += [file_name.split("|")[0] for file_name in data[-1]]
        total_sent += ret_dict['recognized_sents']
        total_conv_sent += ret_dict['conv_sents']
    try:
        python_eval = True if evaluate_tool == "python" else False
        write2file(work_dir + "output-hypothesis-{}.ctm".format(mode), total_info, total_sent)
        write2file(work_dir + "output-hypothesis-{}-conv.ctm".format(mode), total_info,
                   total_conv_sent)
        conv_ret = evaluate(
            prefix=work_dir, mode=mode, output_file="output-hypothesis-{}-conv.ctm".format(mode),
            evaluate_dir=cfg.dataset_info['evaluation_dir'],
            evaluate_prefix=cfg.dataset_info['evaluation_prefix'],
            output_dir="epoch_{}_result/".format(epoch),
            python_evaluate=python_eval,
        )
        lstm_ret = evaluate(
            prefix=work_dir, mode=mode, output_file="output-hypothesis-{}.ctm".format(mode),
            evaluate_dir=cfg.dataset_info['evaluation_dir'],
            evaluate_prefix=cfg.dataset_info['evaluation_prefix'],
            output_dir="epoch_{}_result/".format(epoch),
            python_evaluate=python_eval,
            triplet=True,
        )
    except:
        print("Unexpected error:", sys.exc_info()[0])
        lstm_ret = 100.0
    finally:
        pass
    del conv_ret
    del total_sent
    del total_info
    del total_conv_sent
    del vid
    del vid_lgt
    del label
    del label_lgt
    recoder.print_log(f"Epoch {epoch}, {mode} {lstm_ret: 2.2f}%", f"{work_dir}/{mode}.txt")
    return lstm_ret
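
Note that the del statements above only drop Python references; the freed blocks stay in PyTorch's cache (the "reserved by PyTorch but unallocated" figure in the error message). If that cache should be handed back to the driver between phases, a small sketch:

import gc
import torch

def release_cuda_cache():
    # Collect unreachable tensors first, then return cached allocator
    # blocks to the driver. This lowers the reserved figure but does
    # not reduce what the next forward pass itself needs.
    gc.collect()
    torch.cuda.empty_cache()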

def seq_feature_generation(loader, model, device, mode, work_dir, recoder):
    model.eval()

    src_path = os.path.abspath(f"{work_dir}{mode}")
    tgt_path = os.path.abspath(f"./features/{mode}")
    if not os.path.exists("./features/"):
        os.makedirs("./features/")

    if os.path.islink(tgt_path):
        curr_path = os.readlink(tgt_path)
        if work_dir[1:] in curr_path and os.path.isabs(curr_path):
            return
        else:
            os.unlink(tgt_path)
    else:
        if os.path.exists(src_path) and len(loader.dataset) == len(os.listdir(src_path)):
            os.symlink(src_path, tgt_path)
            return

    for batch_idx, data in tqdm(enumerate(loader)):
        recoder.record_timer("device")
        vid = device.data_to_device(data[0])
        vid_lgt = device.data_to_device(data[1])
        with torch.no_grad():
            ret_dict = model(vid, vid_lgt)
        if not os.path.exists(src_path):
            os.makedirs(src_path)
        start = 0
        for sample_idx in range(len(vid)):
            end = start + data[3][sample_idx]
            filename = f"{src_path}/{data[-1][sample_idx].split('|')[0]}_features.npy"
            save_file = {
                "label": data[2][start:end],
                "features": ret_dict['framewise_features'][sample_idx][:, :vid_lgt[sample_idx]].T.cpu().detach(),
            }
            np.save(filename, save_file)
            start = end
        assert end == len(data[2])
    os.symlink(src_path, tgt_path)
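
Because save_file is a plain dict, np.save stores it as a 0-d object array, so reading a feature file back requires allow_pickle=True and .item() to unwrap the dict. A small sketch (the file name is illustrative; see the filename pattern above):

import numpy as np

# Hypothetical file name following the pattern used in the loop above.
saved = np.load("sample_features.npy", allow_pickle=True).item()
label, features = saved["label"], saved["features"]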

def write2file(path, info, output):
    # Use a context manager so the file is flushed and closed.
    with open(path, "w") as filereader:
        for sample_idx, sample in enumerate(output):
            for word_idx, word in enumerate(sample):
                filereader.write(
                    "{} 1 {:.2f} {:.2f} {}\n".format(info[sample_idx],
                                                     word_idx * 1.0 / 100,
                                                     (word_idx + 1) * 1.0 / 100,
                                                     word[0]))