I am tuning a model with multiple input sequences to do binary classification. Below is my model.
import torch
import torch.nn as nn

class LSTM_model(nn.Module):
    def __init__(self, vocab_size, hidden_size, add_feats, embed_size):
        super(LSTM_model, self).__init__()
        # Model layers: one embedding per event sequence
        self.embedding0 = nn.Embedding(vocab_size+1, embed_size, padding_idx=def_val)
        self.embedding1 = nn.Embedding(vocab_size+1, embed_size, padding_idx=def_val)
        self.embedding2 = nn.Embedding(vocab_size+1, embed_size, padding_idx=def_val)
        self.embedding3 = nn.Embedding(vocab_size+1, embed_size, padding_idx=def_val)
        # add_feats widens the input to accommodate the additional feature, i.e. ts
        self.lstm0 = nn.LSTM(embed_size+add_feats, hidden_size, batch_first=True)
        self.lstm1 = nn.LSTM(embed_size+add_feats, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(embed_size+add_feats, hidden_size, batch_first=True)
        self.lstm3 = nn.LSTM(embed_size+add_feats, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size*4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.drop = nn.Dropout(0.3)
        # Activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, a, b, c, d, ts0, ts1, ts2, ts3):
        # Embed each event sequence
        x0 = self.embedding0(a.long())
        x1 = self.embedding1(b.long())
        x2 = self.embedding2(c.long())
        x3 = self.embedding3(d.long())
        # Append the numerical (ts) feature along the feature dimension
        co0 = torch.cat([x0, ts0.unsqueeze(2)], 2)
        co1 = torch.cat([x1, ts1.unsqueeze(2)], 2)
        co2 = torch.cat([x2, ts2.unsqueeze(2)], 2)
        co3 = torch.cat([x3, ts3.unsqueeze(2)], 2)
        # Run 4 separate LSTMs
        x0, (h0, c0) = self.lstm0(co0)
        x1, (h1, c1) = self.lstm1(co1)
        x2, (h2, c2) = self.lstm2(co2)
        x3, (h3, c3) = self.lstm3(co3)
        # Keep only the last layer's hidden state from each LSTM
        h0 = h0[-1, :, :]
        h1 = h1[-1, :, :]
        h2 = h2[-1, :, :]
        h3 = h3[-1, :, :]
        # Concatenate the 4 hidden states and classify
        f = torch.cat([h0, h1, h2, h3], 1)
        x = self.relu(self.fc1(f))
        x = self.drop(x)
        x = self.relu(self.fc2(x))
        x = self.drop(x)
        x = self.sigmoid(self.fc3(x))
        return x
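For context, a quick smoke test of the forward pass (the shapes, vocab size, and def_val below are made-up values for illustration, not from the actual data):

def_val = 0  # assumed padding index; the real value is defined elsewhere in my code
toy = LSTM_model(vocab_size=100, hidden_size=32, add_feats=1, embed_size=50)
a = torch.randint(0, 100, (2, 10))   # (batch, seq_len) event ids
ts = torch.rand(2, 10)               # (batch, seq_len) numerical feature
out = toy(a, a, a, a, ts, ts, ts, ts)
print(out.shape)                     # torch.Size([2, 1]) sigmoid outputs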
This model has the same number of parameters as the Keras model. The dataset is 100,000 rows and I am running on the GPU.
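Once model is instantiated (see the training section below), the count can be checked directly against Keras' model.summary() — a minimal check:

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("trainable parameters:", n_params)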
DataLoader code:
import torch.utils.data as data_utils

dataset = data_utils.TensorDataset(
    padded_events[events[0]], padded_events[events[1]],
    padded_events[events[2]], padded_events[events[3]],
    padded_ts[ts[0]], padded_ts[ts[1]],
    padded_ts[ts[2]], padded_ts[ts[3]],
    Y)

loader = data_utils.DataLoader(
    dataset,
    batch_size=512,
    num_workers=0)
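For reference, the loader above runs with num_workers=0 and no shuffling. A commonly tried variant looks like this (a sketch; these settings are assumptions, not what the timings below used):

loader = data_utils.DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,      # reshuffle batches every epoch
    num_workers=4,     # worker count is a guess; tune per machine
    pin_memory=True)   # can speed up host-to-GPU transfers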
Model Training:
# Model
model = LSTM_model(vocab_size=def_val, hidden_size=32, add_feats=1, embed_size=50)
# Loss
loss_fn = nn.BCELoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
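The snippet does not show device placement; the usual pattern (a sketch, assuming a single CUDA device) is:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# each batch tensor then needs the same move inside the loop, e.g.
# a, y = a.to(device), y.to(device)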
%%time
n_epochs = 1  # or whatever
losses = []
for epoch in range(n_epochs):
    permutation = torch.randperm(df.shape[0])  # unused; batching is handled by the DataLoader
    for batch_idx, (a, b, c, d, ts0, ts1, ts2, ts3, y) in enumerate(loader):
        y = y.view(-1, 1).float()
        a = a.clone().detach().requires_grad_(False)
        # in case you wanted a semi-full example
        outputs = model(a, b, c, d, ts0, ts1, ts2, ts3)
        loss = loss_fn(outputs, y)
        print(outputs.requires_grad, y.requires_grad)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            losses.append(loss.item())  # .item() avoids holding on to the graph
            log = open("/home/aksve/cs_experiment/data/write_output.txt", "a")
            log.write("epoch {}.\tbatch {}.\tloss : {}\n".format(epoch, batch_idx, loss.item()))
            log.flush()
            log.close()
            print("epoch {}.\tbatch {}.\tloss : {}".format(epoch, batch_idx, loss.item()))
One epoch in Keras takes 58 seconds, whereas it takes 14 minutes in PyTorch. Is there anything inherently wrong here? If I increase the batch size to 8192, the PyTorch time drops to 68 seconds. I am unable to understand why this is happening.
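One note on the measurements: CUDA kernels launch asynchronously, so wall-clock timings should bracket the work with a synchronize. A minimal sketch:

import time

torch.cuda.synchronize()   # wait for all pending GPU work
start = time.time()
# ... run one epoch of the training loop above ...
torch.cuda.synchronize()
print("epoch time: {:.1f}s".format(time.time() - start))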