Hi,
I have created a collate class that pads each batch with zeros up to the length of the longest vector in that batch.
The problem is that training has now slowed down considerably. I suspect the batch-wise padding is the bottleneck. How can I speed it up? I need to keep the batch-wise padding.
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# device is defined earlier in my script; a typical definition:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class PadCollate:
    """
    A variant of collate_fn that pads according to the longest sequence in
    a batch of sequences.
    """

    def __init__(self, dim=0):
        """
        Args:
            dim: the dimension to be padded (dimension of time in sequences)
        """
        self.dim = dim

    def pad_collate(self, batch):
        # find the longest sequence in the batch
        max_atoms = max(map(lambda x: x[0].shape[self.dim], batch))
        batch_size = len(batch)
        # create a zeros array with shape (batch_size, max_atoms, feature_len)
        desct = np.zeros((batch_size, max_atoms, 12), dtype=float)
        energy = []
        for i in range(batch_size):
            rr, cc = batch[i][0].shape
            # batch[i][0] is a torch tensor, so np.pad round-trips it
            # through numpy for every sample in every batch
            desct[i] = np.pad(batch[i][0], [(0, max_atoms - rr), (0, 12 - cc)])
            energy.append(batch[i][1])
        # the device transfer currently happens inside the collate function
        descriptor = torch.from_numpy(desct).float().to(device)
        energy = torch.tensor(energy).float().to(device)
        sample = {"mat": descriptor, "energy": energy}
        return sample

    def __call__(self, batch):
        return self.pad_collate(batch)
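One direction I was considering: since each descriptor is already a torch tensor, torch.nn.utils.rnn.pad_sequence should be able to do the batch-wise padding in a single vectorized call instead of the per-sample numpy loop above. A minimal sketch of what I have in mind (the class name FastPadCollate is mine; it assumes every descriptor already has shape (n_atoms, 12), so no column padding is needed):

from torch.nn.utils.rnn import pad_sequence


class FastPadCollate:
    """
    Sketch of a PadCollate replacement: same batch-wise zero padding,
    but done with one pad_sequence call instead of a Python loop that
    round-trips each tensor through numpy.
    """

    def __call__(self, batch):
        descs = [item[0] for item in batch]                 # each (n_atoms, 12)
        energy = torch.stack([item[1] for item in batch]).float()
        # pads dim 0 of every tensor up to the batch maximum
        mat = pad_sequence(descs, batch_first=True).float()  # (B, max_atoms, 12)
        # no .to(device) here: keeping the collate on the CPU lets the
        # DataLoader use worker processes and pinned memory; the transfer
        # would happen once per batch in the training loop instead
        return {"mat": mat, "energy": energy}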
class createBatch(Dataset):
    def __init__(self, file):
        """
        Args:
            file (string): Path to the .npz file with the descriptors
                and energies.
        """
        self.data = np.load(file, allow_pickle=True)
        self.ener = self.data["ener"]
        self.desc = self.data["desr"]
        # convert every descriptor to a tensor once, up front
        self.desc = [torch.tensor(self.desc[i]) for i in range(len(self.desc))]
        # flatten the nested energy lists into one 1-D float tensor
        self.ener = torch.tensor([j for i in self.ener for j in i], dtype=torch.float)

    def __len__(self):
        return len(self.ener)

    def __getitem__(self, idx):
        sample = [self.desc[idx], self.ener[idx]]
        return sample
desc_data = createBatch("cluster_data.npz")
trainloader = DataLoader(desc_data, batch_size=32, sampler=train_sampler, collate_fn=PadCollate(dim=0))
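The other change I was thinking about is moving the device transfer out of the collate function entirely, so the loader can run with worker processes and pinned memory. A sketch under those assumptions (FastPadCollate is the hypothetical class above, the worker count is a guess, and the forward/backward pass is elided):

trainloader = DataLoader(desc_data, batch_size=32, sampler=train_sampler,
                         collate_fn=FastPadCollate(),
                         num_workers=4, pin_memory=True)

for sample in trainloader:
    # one async host-to-device copy per batch (non_blocking only helps
    # when pin_memory=True); the rest of the training step is unchanged
    mat = sample["mat"].to(device, non_blocking=True)
    energy = sample["energy"].to(device, non_blocking=True)
    # ... forward/backward pass here ...

Would this be the right way to go, or is there something else in the collate that is slowing things down?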