Training in PyTorch takes 14x longer than in Keras

I am training a model that takes multiple input sequences for binary classification. Below is my model.

import torch
import torch.nn as nn

class LSTM_model(nn.Module):
    def __init__(self, vocab_size, hidden_size, add_feats, embed_size):
        super(LSTM_model, self).__init__()
        # Model layers: one embedding + LSTM per input sequence
        self.embedding0 = nn.Embedding(vocab_size + 1, embed_size, padding_idx=def_val)
        self.embedding1 = nn.Embedding(vocab_size + 1, embed_size, padding_idx=def_val)
        self.embedding2 = nn.Embedding(vocab_size + 1, embed_size, padding_idx=def_val)
        self.embedding3 = nn.Embedding(vocab_size + 1, embed_size, padding_idx=def_val)
        # add_feats widens the LSTM input to accommodate the additional feature, i.e. ts
        self.lstm0 = nn.LSTM(embed_size + add_feats, hidden_size, batch_first=True)
        self.lstm1 = nn.LSTM(embed_size + add_feats, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(embed_size + add_feats, hidden_size, batch_first=True)
        self.lstm3 = nn.LSTM(embed_size + add_feats, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size * 4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.drop = nn.Dropout(0.3)
        # Activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, a, b, c, d, ts0, ts1, ts2, ts3):
        # Embed each of the four token sequences
        x0 = self.embedding0(a.long())
        x1 = self.embedding1(b.long())
        x2 = self.embedding2(c.long())
        x3 = self.embedding3(d.long())
        # Append the numerical ts feature along the feature dimension
        co0 = torch.cat([x0, ts0.unsqueeze(2)], 2)
        co1 = torch.cat([x1, ts1.unsqueeze(2)], 2)
        co2 = torch.cat([x2, ts2.unsqueeze(2)], 2)
        co3 = torch.cat([x3, ts3.unsqueeze(2)], 2)
        # Run 4 separate LSTMs
        x0, (h0, c0) = self.lstm0(co0)
        x1, (h1, c1) = self.lstm1(co1)
        x2, (h2, c2) = self.lstm2(co2)
        x3, (h3, c3) = self.lstm3(co3)
        # Keep the final hidden state of each LSTM
        h0 = h0[-1, :, :]
        h1 = h1[-1, :, :]
        h2 = h2[-1, :, :]
        h3 = h3[-1, :, :]
        # Concatenate the 4 hidden states and classify
        f = torch.cat([h0, h1, h2, h3], 1)
        x = self.relu(self.fc1(f))
        x = self.drop(x)
        x = self.relu(self.fc2(x))
        x = self.drop(x)
        x = self.sigmoid(self.fc3(x))

        return x
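
For reference, a quick shape check with made-up sizes (everything here is hypothetical; def_val is the padding index the class reads as a global):

def_val = 1000  # hypothetical: padding index, also reused as vocab_size below
m = LSTM_model(vocab_size=def_val, hidden_size=32, add_feats=1, embed_size=50)
a = torch.randint(0, def_val, (8, 20))  # batch of 8 sequences of length 20
t = torch.rand(8, 20)                   # one ts value per timestep
print(m(a, a, a, a, t, t, t, t).shape)  # torch.Size([8, 1])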

This model has the same number of parameters as the Keras model. The dataset is 100,000 rows and I am running on a GPU.
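
This is how the counts can be compared once the model is constructed (keras_model is just a stand-in name for the Keras version):

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(n_params)  # PyTorch trainable parameter count
# Keras side, for comparison: keras_model.count_params()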

Dataloader code:

import torch.utils.data as data_utils

dataset = data_utils.TensorDataset(
    padded_events[events[0]], padded_events[events[1]],
    padded_events[events[2]], padded_events[events[3]],
    padded_ts[ts[0]], padded_ts[ts[1]],
    padded_ts[ts[2]], padded_ts[ts[3]],
    Y)

loader = data_utils.DataLoader(
    dataset,
    batch_size=512,
    num_workers=0)
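
For context, a loader variant with worker processes and pinned host memory would look like this (values are illustrative, not what I benchmarked):

loader = data_utils.DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,      # reshuffle each epoch inside the loader
    num_workers=4,     # illustrative: prepare batches in parallel workers
    pin_memory=True)   # illustrative: speeds up host-to-GPU copies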

Model Training:

# Model
model = LSTM_model(vocab_size=def_val, hidden_size=32, add_feats=1, embed_size=50)
# Loss
loss_fn = nn.BCELoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
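
An alternative I have seen suggested (not what I benchmarked) is to return raw logits from forward() and fuse the sigmoid into the loss, which is numerically stabler:

# Illustrative alternative: remove self.sigmoid from forward() and use the
# fused sigmoid + binary cross-entropy loss instead of Sigmoid + BCELoss.
loss_fn = nn.BCEWithLogitsLoss()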

%%time
n_epochs = 1  # or whatever
losses = []

for epoch in range(n_epochs):

    for batch_idx, (a, b, c, d, ts0, ts1, ts2, ts3, y) in enumerate(loader):

        y = y.view(-1, 1).float()

        outputs = model(a, b, c, d, ts0, ts1, ts2, ts3)
        loss = loss_fn(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            # .item() detaches the scalar so the computation graph is not retained
            losses.append(loss.item())
            msg = "epoch {}.\tbatch {}.\tloss : {}".format(epoch, batch_idx, loss.item())
            print(msg)
            with open("/home/aksve/cs_experiment/data/write_output.txt", "a") as log:
                log.write(msg + "\n")
One epoch in Keras takes 58 seconds, whereas it takes 14 minutes in PyTorch. Is there anything inherently wrong here? If I increase the batch size to 8192, the PyTorch time drops to 68 seconds. I am unable to understand why this is happening.
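
In case the measurement itself is suspect: since CUDA kernels launch asynchronously, a GPU-safe timing sketch (instead of %%time) would be:

import time
import torch

torch.cuda.synchronize()   # finish any pending GPU work before starting the clock
start = time.time()
# ... run one training epoch here ...
torch.cuda.synchronize()   # wait for all queued kernels to complete
print("epoch time: {:.1f}s".format(time.time() - start))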