Here are the main parts of my code:
class conv(nn.Module):
    """Pretrained VGG16 wrapper that returns the activation of its ``features`` node.

    The feature extractor is built once at construction time. Calling
    ``create_feature_extractor`` inside ``forward`` re-traces the whole
    network on every call, which is pure overhead because the traced graph
    never changes.
    """

    def __init__(self):
        super().__init__()
        self.model_vgg = models.vgg16(pretrained=True)
        # ``return_nodes`` maps a graph node name to the key used in the
        # returned dict: the node is 'features' and '9' is only its label.
        # NOTE(review): this does not select layer index 9 of ``features``;
        # that node would be named 'features.9'. Kept as written so the
        # output stays unchanged -- confirm which layer the paper intends.
        self.output = create_feature_extractor(
            self.model_vgg, return_nodes={'features': '9'}
        )

    def forward(self, x):
        """Run ``x`` through the extractor and return the tensor stored under '9'."""
        # Returned directly instead of being cached on the module, so the
        # module does not keep the last batch's activation alive.
        return self.output(x)['9']
class lstm(nn.Module):
    """LSTM followed by a linear classification layer applied at every time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        n_class: Number of output classes.
    """

    def __init__(self, input_size=25088, hidden_size=512, num_layers=2,
                 bidirectional=True, n_class=2):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.model_lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            bidirectional=self.bidirectional,
            batch_first=True,
        )
        # Nothing in this class assigns a non-None value here, so every
        # forward pass starts the LSTM from its default initial state.
        self.hidden_state = None
        # A bidirectional LSTM concatenates both directions, doubling the
        # feature width that reaches the linear layer.
        lstm_out_features = 2 * self.hidden_size if self.bidirectional else self.hidden_size
        self.output_layer = nn.Sequential(
            nn.Linear(lstm_out_features, n_class)
        )

    def reset_hidden_state(self):
        """Set the stored hidden state back to ``None``."""
        self.hidden_state = None

    def forward(self, x):
        """Return per-time-step class scores for ``x``.

        ``x`` is laid out as (batch, seq, input_size) because the LSTM is
        built with ``batch_first=True``; the result is (batch, seq, n_class).
        """
        lstm_output, _ = self.model_lstm(x, self.hidden_state)
        return self.output_layer(lstm_output)
# The feature extractor stays on the CPU. Only modelb's parameters are handed
# to the optimizer, so gradients for modela are never used: freeze it so that
# autograd does not build (and loss.backward() does not walk) a graph through
# the CNN.
modela = conv().to('cpu')
modela.eval()
for frozen_param in modela.parameters():
    frozen_param.requires_grad_(False)
modelb = lstm().to(device)

## Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelb.parameters(), lr=0.001)
modelb.train()
batch_size = 10
frames = 20  # per video

for epoch in range(50):
    loss_epoch = 0
    # NOTE(review): the dataloaders are iterated only to obtain batch_idx; the
    # batches they yield are discarded and batch_creator builds the real
    # batch. If iterating them is expensive, consider looping over an index
    # range instead -- confirm against batch_creator first.
    for batch_idx, (inputs, labels) in enumerate(zip(features_train, labels_train)):
        del inputs, labels
        # batch_creator returns a tuple with two lists, each of size 10
        # (per the original notes: inputs[0].shape = 20x25088).
        inputs, labels = batch_creator(features_train, labels_train, batch_idx + 1)
        optimizer.zero_grad()
        modelb.reset_hidden_state()
        # detach(): cut any autograd history attached to the precomputed
        # features so backward stops at the LSTM input.
        batch = torch.stack(inputs).view(batch_size, frames, -1).detach().to(device)
        scores = modelb(batch)
        # NOTE(review): CrossEntropyLoss expects class-index targets to be an
        # integer (long) tensor; a float target is treated as class
        # probabilities. Left unchanged because the label shape is not
        # visible here -- confirm and drop the float cast if these are indices.
        target_batch = torch.stack(labels).max(1)[1].type(torch.FloatTensor).to(device)
        # Only the last time step's scores are used for classification.
        loss = criterion(scores[:, -1, :], target_batch)
        loss.backward()
        optimizer.step()
        loss_epoch += loss.item()
        del inputs, labels
    print(f"Loss in epoch {epoch} :::: {loss_epoch/len(features_train)}")
So to explain some bits. Because the dataset was large and I could not create it in one go, I created the function batch_creator(features_train, labels_train, batch)
to use the datasets (features_train, labels_train are dataloaders) and create the required batch each time. To save GPU RAM (otherwise it would have run out of memory), the convolutional model (modela) runs on the CPU. Although it takes some time to run, it is not as bad as loss.backward().
As you can tell, my issue is with loss.backward(),
which takes a long time to run (2-2.5 minutes). Is it because of my data?
I use frames from videos: for each video I use 20 frames, and from each frame I have 25088 features (I am trying to replicate this paper).
I get the feeling though from this paragraph in the paper, that it should take some time to train:
Regarding the explained performance results, the most remarkable contribution
of our work is the considerable improvement on the action recognition task on
the KTH and UCF-11 datasets, and the evaluation of different numbers of LSTM
units. KTH and UCF-11 datasets have lower sizes than HMDB-51, and perhaps
neural models simpler than very deep or intricated architectures as transformers can
provide good results, without the need of compute-intensive and time-consuming
training and testing cycles.
Did anyone do anything similar?