Hello,
I have a problem: I load a completely frozen model from a PyTorch Lightning checkpoint. I don't use any optimizer, and my input data has requires_grad = False. As soon as I increase the batch size from 1 to anything larger, I get a CUDA out-of-memory error, which I cannot explain, since nothing should need to be stored during the feed-forward pass.
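To make my assumption explicit: I expected that with frozen parameters and inputs that don't require grad, the forward pass behaves as if it were wrapped in torch.no_grad(), i.e. no autograd graph is kept. A minimal toy sketch of that assumption (not my actual model):

```python
import torch

# Toy model, fully frozen and in eval mode (mirrors my setup conceptually).
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).eval()
for p in model.parameters():
    p.requires_grad_(False)

x = torch.randn(32, 64)  # input with requires_grad=False, like my data

y_frozen = model(x)      # what my training loop effectively does
with torch.no_grad():
    y_nograd = model(x)  # explicitly graph-free forward pass

print(y_frozen.grad_fn, y_nograd.grad_fn)  # both None if nothing is recorded
```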
def trainingNet(noisy_filenames, noise_filenames, clean_filenames, debuggerMode):
    # data loading
    data_train = Dataclass(noisy_filenames[:N_train], noise_filenames[:N_train], clean_filenames[:N_train], device)
    data_val = Dataclass(noisy_filenames[N_train:N_train + N_val], noise_filenames[N_train:N_train + N_val],
                         clean_filenames[N_train:N_train + N_val], device)
    test_path = os.path.join('data', 'DNS testset', 'test_set', 'synthetic', 'no_reverb')
    noisy_filenames, _, clean_filenames = readFilenames(test_path)
    data_test = ProcessedDataclass(noisy_filenames, clean_filenames, device=None, SR=SR, segment=duration_sample_sec)
    training_generator = DataLoader(data_train, **params, shuffle=True, drop_last=True)
    val_generator = DataLoader(data_val, **params, shuffle=False)
    test_generator = DataLoader(data_test, batch_size=1, shuffle=False)

    path_pretrained_model = os.path.join(os.getcwd(), 'runs', 'conformer16', 'sv', 'alpha_100')
    checkpoint_filename = os.path.join('checkpoints', 'epoch=23-step=485999.ckpt')

    # load the pretrained frozen model
    net = lightning_model.DFConvNet()
    df_conformer = net.load_from_checkpoint(os.path.join(path_pretrained_model, checkpoint_filename))
    df_conformer.freeze()
    df_conformer.eval()

    hyperparameters = dict(in_channel=1, out_channels=1, bias=False, num_layers=layer, num_stacks=stack, kernel_size=3,
                           residual_channels=128, gate_channels=128, skip_out_channels=128, last_channels=(2048, 256),
                           gin_channels=1, path_pretrained_model=path_pretrained_model)

    # move to the GPUs
    df_conformer = nn.DataParallel(df_conformer)
    df_conformer.to(device)

    # tensorboard
    if not debuggerMode:
        writer = SummaryWriter(folderpath)
    else:
        writer = None

    # training + validation, testing at the end
    best_model_name = ''
    min_loss = 1000  # set to something unrealistically high
    len_train_loss = 0
    for epoch in range(max_epochs):
        print(f"Epoch: {epoch}")
        # training
        running_loss = 0
        running_loss_snr = 0
        loop = tqdm(enumerate(training_generator), total=len(training_generator))
        # nnet.train()
        for batch_idx, (noisy_data, clean_data, noise_data, _, _) in loop:
            # feed-forward and loss
            (estimate, sv_info) = df_conformer(noisy_data)
            estimate = rearr_channel(estimate)
            sp_info_repeated = sv_info.repeat(1, 1, int(lengthOfInputSignal // dim_sv))
            if pad_diff != 0:
                sp_info_repeated = pad(sp_info_repeated)
            sv = sp_info_repeated.transpose(0, 1)  # B, C, N
            l1 = si_snr(rearr_channel_reversed(estimate), clean_data)
            # statistics
            # running_loss += loss.item()  # (not computed here, so running_loss stays 0)
            running_loss_snr += l1.item()
            loop.set_postfix(rnloss_snr=running_loss_snr / (batch_idx + 1))  # show the running SI-SNR in the progress bar
            # logs for plots
            if batch_idx % num_log_trainstep_per_batch == 0:
                train_loss.append((running_loss / (batch_idx + 1), running_loss_snr / (batch_idx + 1)))
        # validation loss
        len_train_loss = len(train_loss) if len_train_loss == 0 else len_train_loss
And this is how I call this function:
if __name__ == "__main__":
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # a plain Python variable has no effect; this must be an environment variable, set before CUDA is initialized
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")
    torch.multiprocessing.set_start_method('spawn')
    # loss functions
    si_snr = SingleSrcNegSDR("sisdr", reduction='mean', zero_mean=False)
    mse = nn.MSELoss()
    l1loss = nn.modules.loss.L1Loss()

    # read filenames from data folder
    datasetpath = os.path.join('data', 'train_set_4slong')
    noisy_filenames, noise_filenames, clean_filenames = readFilenames(datasetpath)

    # Some parameters
    debuggerMode = False
    SR = 16000
    duration_sample_sec = 4  # sec
    lengthOfInputSignal = int(SR * duration_sample_sec)
    N = len(noisy_filenames)  # number of samples_from_ver1
    N_train = int(N * 0.1)
    N_val = int(N * 0.001)
    BS = 32
    num_log_trainstep = 50  # number of training steps between train-loss logs
    num_log_trainstep_per_batch = num_log_trainstep // BS  # log the training loss every num_log_trainstep_per_batch-th batch
    max_epochs = 1
    LEARNING_RATE = 0.0001
    stack = 3
    layer = 27
    dim_sv = 256
    new_pad_dim = int(lengthOfInputSignal // dim_sv) * dim_sv
    pad_diff = lengthOfInputSignal - new_pad_dim
    pad = nn.ConstantPad1d((0, pad_diff), 0)
    rearr_channel = Rearrange('(b c) n -> b c n', c=1)
    rearr_channel_reversed = Rearrange('b c n -> (b c) n')
    params = {'batch_size': BS,
              'num_workers': 1,
              'pin_memory': False}
    train_loss = []
    val_loss = []

    # for tensorboard and saves
    folderpath = ''
    if not debuggerMode:
        _, datetime = str(datetime.datetime.now()).split()  # note: this reassignment shadows the imported datetime module
        datetime = datetime[:5].replace(':', '_')
        foldername = 'test'
        log = dict(
            SR=SR,
            duration=duration_sample_sec,
            stack=stack,
            layer=layer
        )
        folderpath = 'wavenet'
        folderpath = os.path.join(os.getcwd(), 'runs', folderpath, foldername, datetime)
        if not os.path.isdir(folderpath + '/model'):
            os.makedirs(folderpath + '/model')
        path_to_log = os.path.join(folderpath, 'hyperparameter.yml')
        with open(path_to_log, 'w') as outfile:
            yaml.dump(log, outfile, default_flow_style=False)
        if not os.path.isdir(os.path.join(folderpath, 'samples')):
            os.makedirs(os.path.join(folderpath, 'samples'))
            os.makedirs(os.path.join(folderpath, 'samples', 'noisy'))
            os.makedirs(os.path.join(folderpath, 'samples', 'noise'))
            os.makedirs(os.path.join(folderpath, 'samples', 'clean'))
            os.makedirs(os.path.join(folderpath, 'samples', 'estimate'))
    trainingNet(noisy_filenames, noise_filenames, clean_filenames, debuggerMode)
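To narrow it down, I was planning to probe how peak memory scales with the batch size on a toy frozen model first (hypothetical sketch, not my df_conformer):

```python
import torch

# Hypothetical probe: peak GPU memory vs. batch size for a frozen model.
model = torch.nn.Linear(4000, 4000).cuda().eval()
for p in model.parameters():
    p.requires_grad_(False)

for bs in (1, 8, 32):
    torch.cuda.reset_peak_memory_stats()
    x = torch.randn(bs, 4000, device="cuda")
    y = model(x)
    torch.cuda.synchronize()
    print(f"batch {bs}: peak {torch.cuda.max_memory_allocated() / 2**20:.1f} MiB")
```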
I planned to use another model, “WavNet”, for training. But since the code above already fails without WavNet, I didn't include the WavNet architecture here.
I don't see the problem: a completely frozen model shouldn't use much memory during the forward pass. I checked that the parameters of the df_conformer model are not trainable (requires_grad = False), which is the case. The precise error I get is:
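The check itself looked roughly like this (a minimal sketch, using the df_conformer variable from the function above):

```python
# Sanity check: eval mode, and no parameter of the wrapped model is trainable.
assert not df_conformer.training
assert all(not p.requires_grad for p in df_conformer.parameters())
n_trainable = sum(p.numel() for p in df_conformer.parameters() if p.requires_grad)
print(f"trainable parameters: {n_trainable}")  # 0 for a fully frozen model
```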
**RuntimeError: CUDA out of memory. Tried to allocate 7.82 GiB (GPU 0; 10.76 GiB total capacity; 4.47 GiB already allocated; 3.15 GiB free; 4.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF**
on my RTX 2080 Ti with PyTorch 1.10.
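For completeness: as far as I understand, the allocator hint from the error message would be set like this, before the first CUDA allocation (the value 128 is just an example, nothing I have tuned or verified):

```python
import os

# Assumption: must be set before CUDA is initialized to take effect.
# max_split_size_mb caps the block size the caching allocator may split,
# which the error message suggests can reduce fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
```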