CUDA utilisation optimisation during model inference

Hi everyone,

I am running inference to create speaker embeddings from a model checkpoint. The problem I encounter is quite high GPU memory usage (around 6728 MiB) but very low volatile GPU-Util (between 7% and 12%), even with a very high batch size. I do the preprocessing and padding in the collate_fn of the torch Dataset, so when I load the data, the output is already batches of processed spectrograms plus the length information used by the checkpoint model.

I am using: NVIDIA-SMI 525.60.11 Driver Version: 525.60.11 CUDA Version: 12.0
The following is an example of how I wrap everything. Could anyone give me some advice if you spot something that might cause the low GPU usage?
Many thanks!

import os

import torch
import soundfile as sf  # assumed audio loader; the original loading call was cut off
from torch.utils.data import Dataset, DataLoader


class EmbeddingDataset(Dataset):
    def __init__(self, audio_paths, device):
        self.audio_paths = audio_paths  # list of audio file paths
        self.device = device

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_file = self.audio_paths[idx]
        return audio_file

    def collate_fn(self, batch):
        specs = []
        spec_lens = []

        for audio_file in batch:
            spec = self.process_audio(audio_file).to(self.device)
            specs.append(spec)
            spec_lens.append(len(spec))

        # Pad spectrograms to the longest one in the batch and build the spec_lens tensor
        max_length = max(len(spec) for spec in specs)
        padded_specs = torch.zeros(len(specs), max_length, specs[0].shape[1], device=self.device)
        for i, spec in enumerate(specs):
            padded_specs[i, :len(spec), :] = spec
        spec_lens = torch.tensor(spec_lens, dtype=torch.long, device=self.device)
        return padded_specs, spec_lens

    def process_audio(self, path_to_reference_audio):
        assert os.path.exists(path_to_reference_audio)
        wave, sr = sf.read(path_to_reference_audio)  # loader assumed; the original call was elided
        self.audio_preprocessor = AudioPreprocessor(input_sr=sr, output_sr=16000, cut_silence=False, device=self.device)
        spec = self.audio_preprocessor.audio_to_mel_spec_tensor(wave).transpose(0, 1).to(self.device)
        return spec

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio_dataset = EmbeddingDataset(audio_paths, device)
    audio_dataloader = DataLoader(audio_dataset, batch_size=batch_size, shuffle=False, collate_fn=audio_dataset.collate_fn)

    style_emb = StyleEmbedding()
    check_dict = torch.load("", map_location=device)  # checkpoint path elided here
    style_emb.load_state_dict(check_dict)  # assumed; the snippet loaded check_dict but never applied it
    style_emb.to(device).eval()

    embedding_list = []
    with torch.no_grad():
        for batch_specs, batch_spec_lens in audio_dataloader:
            batch_specs = batch_specs.to(device)  # no-op here, collate_fn already moved them
            batch_spec_lens = batch_spec_lens.to(device)
            embeddings = style_emb(batch_specs, batch_spec_lens)
            embedding_list.append(embeddings.cpu())


Profile your workload with a visual profiler such as Nsight Systems, or with the native PyTorch profiler, to narrow down where the bottleneck in your code is. Without isolating the actual bottleneck, all your optimizations might be useless.
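For instance, here is a minimal sketch of the native PyTorch profiler (torch.profiler) wrapped around a few batches of your loop; audio_dataloader and style_emb are the names from your snippet above:

import torch
from torch.profiler import profile, record_function, ProfilerActivity

# Record both CPU and CUDA activity so you can see whether the GPU
# sits idle while collate_fn does the preprocessing on the CPU.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for i, (batch_specs, batch_spec_lens) in enumerate(audio_dataloader):
        with record_function("style_emb_forward"):
            embeddings = style_emb(batch_specs, batch_spec_lens)
        if i >= 10:  # a handful of batches is enough for a first look
            break

# Sort by CUDA time to see how much of the wall clock the GPU is actually busy
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))

You can also call prof.export_chrome_trace("trace.json") for a timeline you can open in chrome://tracing, or run the whole script under nsys profile python your_script.py to get an Nsight Systems report. If the CUDA timeline shows long gaps between kernels, the CPU-side data pipeline (in your case, the per-file spectrogram computation inside collate_fn) is the likely bottleneck rather than the model itself.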