Hi,
Following up on the discussion about memory management: my original dataset fits into memory quite well, but before I can feed the samples to training I have to do some pre-processing, including up-sampling. If I up-sample every entry in the dataset, it no longer fits into memory. So, by using a custom Dataset and DataLoader, I was hoping the pre-processing would only happen per batch and memory usage would stay within its limit.
My code looks like this:
```python
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import samplerate  # libsamplerate bindings, used for the up-sampling


class NickiDataset(Dataset):
    def __init__(self, X, y, transform=None, target_transform=None):
        self.target = y
        self.Xdata = X
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        # pre-processing is applied per item, so only the current batch
        # should ever exist in its up-sampled form
        data_i = self.Xdata[idx, :, :]
        target_i = self.target[idx, :]
        if self.transform:
            data_i = self.transform(data_i)
        if self.target_transform:
            target_i = self.target_transform(target_i)
        return data_i, target_i
```
These are the transforms:

```python
class TrnsDt(object):
    """Up-sample the input ndarray and run it through the pretrained model."""
    def __call__(self, sample):
        global count
        if count == 0:
            print(sample.shape)
        # up-sample by a factor of 160 with libsamplerate
        rsmpl = samplerate.resample(sample, 160, 'sinc_best')
        pt_sample = torch.from_numpy(np.transpose(rsmpl))
        if count == 0:
            print(pt_sample.shape)
        # prc_Wv and mdl_wv are the pretrained processor and model, defined elsewhere
        input_values = prc_Wv(pt_sample, return_tensors="pt", padding="longest",
                              sampling_rate=16e3).input_values
        out = mdl_wv(torch.squeeze(input_values))
        out_proc = out[0].clone().detach()
        if count == 0:
            print(out_proc.shape)
        return out_proc


class TrnsDtTrgt(object):
    """Convert target ndarrays to tensors."""
    def __call__(self, sample):
        pt_sample = torch.from_numpy(sample)
        return pt_sample
```
And the DataLoader and the loop over it:

```python
# train_dt_in and train_trgt are the in-memory NumPy arrays mentioned above
train_ds = NickiDataset(train_dt_in, train_trgt,
                        transform=TrnsDt(), target_transform=TrnsDtTrgt())

batch_size = 16
train_dataloader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

num_epoch = 1
total_train_sample = len(train_ds)
n_iterations = np.ceil(total_train_sample / batch_size)

count = 0
for epoch in range(num_epoch):
    for inx, (dt_in, trgt) in enumerate(train_dataloader):
        if count == 0:
            print("input tensor shape: ", dt_in.shape)
        if (inx + 1) % 5 == 0:
            print(f'epoch {epoch}/{num_epoch}, step {inx+1}/{n_iterations}, input {dt_in.shape}')
        count += 1  # so the shape printouts above only fire for the first batch
```
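For reference, this is roughly how I watch the process memory while iterating over the loader (just a sketch; it assumes psutil is installed and reuses the train_dataloader built above):

```python
import os
import psutil

proc = psutil.Process(os.getpid())
for inx, (dt_in, trgt) in enumerate(train_dataloader):
    rss_mb = proc.memory_info().rss / 1e6  # resident memory of this process, in MB
    print(f'step {inx + 1}: batch {tuple(dt_in.shape)}, RSS {rss_mb:.1f} MB')
```

If only one batch's worth of up-sampled data were alive at a time, I would expect the RSS to stay roughly flat across steps.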
Any suggestions as to why this still runs into memory trouble?
Best,