I am trying to build a custom dataset and dataloader. The goal was that I wanted to modeify the default dataloader in such a way that with each sample I can load a predifined list of related samples (similar samples). However samples have different sizes of such lists (it could be empty as well). Then I want to calculate a pairwise similarity based loss for each sample against each sample in its accompanying list and then add it to a standard supervised MSE loss. This is how I am trying to build it:
from torch.utils.data import Dataset, DataLoader
null_graph = Data(x = torch.zeros((1,1)) , edge_index = torch.zeros((1,1)) , edge_attr = torch.zeros((1,1)) , y = torch.zeros(1))
class dotdict(dict):
"""dot.notation access to dictionary attributes"""
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
class CustomDataset(Dataset):
def __init__(self, dataframe, graphs):
self.data = dataframe
self.graphs = graphs
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
sample = self.data.iloc[idx]
cmg = []
ac_cmg= []
if (sample[4] == 1):
lis = sample[5]
for ind in lis:
if ind in self.data['index'].to_list():
cmg.append(self.graphs[train_data.index[train_data['index'] == ind].item()])
ac_cmg.append(train_data.activity[train_data['index'] == ind].item())
else:
cmg.append(null_graph)
ac_cmg.append(0)
graph = self.graphs[idx]
return {'y': torch.tensor(sample[3], dtype = torch.float32) ,
'is_cliff':torch.tensor(sample[4], dtype = torch.int8), 'py_cmg':cmg,
'y_cm':torch.tensor(ac_cmg,dtype = torch.float32),
'py_graphs': graph}
def custom_collate_fn(batch):
y = torch.stack([sample['y'] for sample in batch])
is_cliff = torch.stack([sample['is_cliff'] for sample in batch])
x = torch.cat([sample['py_graphs'].x for sample in batch], dim = 0)
edge_index = torch.cat([sample['py_graphs'].edge_index for sample in batch], dim = 1)
edge_attribute = torch.cat([sample['py_graphs'].edge_attr for sample in batch], dim =0)
assignment = torch.cat([ i+torch.zeros(sample['py_graphs'].x.shape[0],dtype= torch.int64) for i,sample in enumerate(batch)])
cliff_mates_num = [len(sample['py_cmg']) for sample in batch]
x_cm = [graph.x for sample in batch for graph in sample['py_cmg']]
assignment2 = [torch.zeros(g.shape[0],dtype= torch.int64) for i,g in enumerate(x_cm)]
edge_index_cm = [graph.edge_index.to(torch.int64) for sample in batch for graph in sample['py_cmg']]
edge_attribute_cm = [graph.edge_attr for sample in batch for graph in sample['py_cmg']]
y_cm = [sample['y_cm'] for sample in batch]
data = dotdict({'x':x, 'edge_index':edge_index, 'edge_attr': edge_attribute , 'batch': assignment})
# data_cm = dotdict({ 'x' : x_cm ,'edge_index' : edge_index_cm,
# 'edge_attr': edge_attribute_cm, 'batch2': assignment2 ,'y' : y_cm})
return dotdict({'y': y, 'cliff_mates_num' : cliff_mates_num,'data': data , 'x_cm' : x_cm ,'edge_index_cm' : edge_index_cm,
'edge_attr_cm': edge_attribute_cm, 'batch2': assignment2 ,'y_cm' : y_cm, 'is_cliff': is_cliff, })```
However, I get it seems there are problems with the dataloader related to Tensor sizes mismatches. Also, this is causing the backward method not to work properly. Here is my training function:
def train(model, train_loader,optimizer,criterion,device=‘cpu’):
model.train()
total_loss = 0
for batch_num,data in enumerate(train_loader):
cliff_total = 0
tmp_embed = 0
embed = 0
optimizer.zero_grad()
pred, graph_embed = model(data.data)
for i in range(bs):
mates =
if(data.is_cliff[i]==1):
print(bsbatch_num+i)
for idx in range(data.cliff_mates_num[i]):
mate = dotdict({‘x’:data.x_cm[i +idx] ,‘edge_index’: data.edge_index_cm[i+idx],
‘edge_attr’:data.edge_attr_cm[i+idx] , ‘batch’:data.batch2[i+idx],
‘y’ : data.y_cm[i][idx]})
_ ,cliff_embed = model(mate)
_,anchor_embed = model(list1[batch_numbs + i])
cliff_loss = 1 - F.cosine_similarity(anchor_embed,cliff_embed) - (data.y[i] - mate.y)
cliff_total = cliff_total + cliff_loss
supervised_loss = criterion(pred,data.y.unsqueeze(1))
loss_combined = supervised_loss + cliff_loss
loss_combined.backward()
total_loss += supervised_loss.item() * bs
optimizer.step()
return total_loss / len(train_loader.dataset)