Is converting categories to one-hot faster in NumPy than doing it in the DataLoader?

Hi,
Is it possible that plain Python loops are faster than a DataLoader?

Earlier:

import numpy as np
import torch
from datetime import datetime

def one_hot_vector(x_raw, n_uniq):
    # One-hot encode an integer array of shape (N, seq_len)
    # into an array of shape (N, seq_len, n_uniq)
    input_len = x_raw.shape[0]
    input_col_len = x_raw.shape[1]
    x = np.zeros((input_len * input_col_len, n_uniq), dtype=np.int8)
    x_flat = x_raw.reshape(-1, 1)
    # mark the column matching each class id, one class at a time
    for i in range(n_uniq):
        ind, _ = np.where(x_flat == i)
        x[ind, i] = 1
    x = x.reshape(input_len, input_col_len, n_uniq)
    return x

for epoch in range(num_epochs):
    epoch_time = datetime.now()
    # One-hot encode the data in chunks of 100k samples to bound memory usage
    for i in range(0, x_train.shape[0], 100000):
        strt_time = datetime.now()
        one_hot_x_train = one_hot_vector(x_train[i:i+100000], 2983)
        one_hot_x_train = torch.from_numpy(one_hot_x_train)
        y_train_ = torch.from_numpy(y_train[i:i+100000].astype(np.int32))
        for j in range(0, one_hot_x_train.shape[0], batch_size):
            outputs = model(one_hot_x_train[j:j+batch_size].to(device).float())
            loss = criterion(outputs, y_train_[j:j+batch_size].squeeze().to(device).long())

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch : {epoch}/{num_epochs} Train % : {(i+100000)/(x_train.shape[0])} Loss : {loss.item()} Loop Cost : {datetime.now()-strt_time}")
    print(f"Epoch time : {datetime.now()-epoch_time}")

And after switching to a DataLoader:

import torch
from torch.utils import data
import torch.nn.functional as F
import numpy as np


class Dataset_1(data.Dataset):
    'Characterizes a dataset for PyTorch'

    def __init__(self, list_IDs, labels, n_uniq):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_uniq = n_uniq

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load one sample, one-hot encode it, and fetch its label
        X = self.list_IDs[index]
        X = F.one_hot(torch.tensor(X).to(torch.int64), num_classes=self.n_uniq)
        y = self.labels[index]
        return X, y


params = {'batch_size': 50,
          'shuffle': True,
          'num_workers': 50}

training_set = Dataset_1(x_train, y_train, 2983)
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset_1(x_test, y_test, 2983)
validation_generator = data.DataLoader(validation_set, **params)


for epoch in range(num_epochs):
    epoch_time = datetime.now()
    # Training
    print('Training Start')
    counter = 0
    for local_batch, local_labels in training_generator:
        counter += 1
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device).float(), local_labels.to(device).long()

        outputs = model(local_batch)
        loss = criterion(outputs, local_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if counter % 10000 == 0:
            print(f"Counter : {counter} || Loss : {loss.item()}")
    print(f"Epoch time : {datetime.now()-epoch_time} || Loss : {loss.item()}")

I have a huge dataset (200M samples). I don't have exact timings, but the time to run 2 epochs almost doubled after switching to the DataLoader.

Also, the model loss doesn't seem to improve much: it stays between 4 and 5 if I use the Adam optimizer and between 2 and 3 if I use SGD. What could I try to improve the accuracy of the LSTM model?

Here are the hyper-parameters and the model I am using:

import torch.nn as nn

# Hyper-parameters
sequence_length = 10
input_size = 2983
hidden_size = 128
num_layers = 4
num_classes = 100
num_epochs = 2
learning_rate = 0.1

# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Initial hidden and cell states (re-randomized on every forward pass)
        h0 = torch.randn(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.randn(self.num_layers, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: (batch_size, seq_length, hidden_size)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).cuda()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

The manual loop might be faster, since you are just slicing the tensor, while your Dataset copies the data in every __getitem__ call (torch.tensor(X) creates a copy).
Try to use torch.from_numpy in your __getitem__ and compare the results again.
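For example, a minimal sketch of that change (assuming x_train and y_train are NumPy arrays, so each indexed row can be wrapped without a copy):

def __getitem__(self, index):
    'Generates one sample of data'
    # from_numpy shares memory with the underlying NumPy row instead of
    # copying it the way torch.tensor(...) does; F.one_hot needs int64
    # input, so storing x_train as int64 avoids the .long() conversion too
    x = torch.from_numpy(self.list_IDs[index]).long()
    X = F.one_hot(x, num_classes=self.n_uniq)
    y = self.labels[index]
    return X, y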