I am working on the Quora duplicate-question dataset, which has 3 columns: 'Question 1', 'Question 2', 'duplicate flag'.
I want to use DataLoader to speed up the model. Any idea on how to implement batching in a DataLoader when you have two texts of different lengths?
PS: Using fastText embeddings for each word.
I created the Dataset as follows:
import pandas as pd

import torch
import torch.utils.data as Data
from torch.utils.data import Dataset
class QuoraDataset(Dataset):
    """Dataset of Quora question pairs for duplicate detection.

    Each item is a dict with:
      - 'x1': tensor of fastText word embeddings for question 1
      - 'x2': tensor of fastText word embeddings for question 2
      - 'y' : the is_duplicate label (0/1)

    NOTE(review): questions are whitespace-tokenized with a plain
    ``split(' ')`` — no lowercasing or punctuation handling; confirm this
    matches how the fastText vectors were trained.
    """

    def __init__(self, csv, transform=None, separator=','):
        """Load question pairs from a CSV file.

        Args:
            csv: path (or file-like object) passed to ``pd.read_csv``.
            transform: optional callable applied to each sample dict
                returned by ``__getitem__``.
            separator: CSV field delimiter.
        """
        df = pd.read_csv(csv, sep=separator)
        # astype(str) guards against NaN cells becoming floats.
        df['question1'] = df['question1'].astype(str)
        self.q_1 = [q.split(' ') for q in df['question1'].values.tolist()]
        df['question2'] = df['question2'].astype(str)
        # BUG FIX: original read from an undefined name `train` here.
        self.q_2 = [q.split(' ') for q in df['question2'].values.tolist()]
        self.label = df['is_duplicate'].values.tolist()
        self.length = len(df)
        self.transform = transform

    def __len__(self):
        """Return the number of question pairs."""
        return self.length

    def __getitem__(self, index):
        """Return the embedded sample at ``index``.

        Sequences have variable length, so batching these through a
        DataLoader needs a custom ``collate_fn`` that pads per batch.
        """
        # One embedding vector per token; get_ft is the fastText lookup
        # defined elsewhere in the project.
        inputs1 = torch.tensor([get_ft(word) for word in self.q_1[index]])
        inputs2 = torch.tensor([get_ft(word) for word in self.q_2[index]])
        # BUG FIX: original returned the whole q_1/q_2 lists and discarded
        # the per-index tensors computed above.
        sample = {'x1': inputs1, 'x2': inputs2, 'y': self.label[index]}
        # BUG FIX: transform was stored but never applied.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample