I am working on action recognition where my inputs are frames of a video. Each video have different number of frames and I wish to create my custom collate_fn function to deal with different number of frames each video have.
Here is my model:
class MLP(nn.Module):
def __init__(self, num_classes, rnn_layers, hidden_size, fc_size):
super(MLP, self).__init__()
self.hidden_size = hidden_size
self.num_classes = num_classes
self.fc_size = fc_size
self.apply(self._init_weights)
self.fc_pre= nn.Sequential(
nn.Linear(201, fc_size),
#nn.ReLU(),
#nn.Dropout(p=0.5),
# nn. Linear(750, 500),
# #nn.Dropout(p=0.5),
# #nn.ReLU(),
# nn.Linear(500, fc_size),
#nn.Dropout(p=0.5),
nn.ReLU())
#Defines the number of features that define each element (time-stamp) of the input sequence
self.rnn = nn.GRU(input_size = fc_size,
hidden_size = hidden_size,
num_layers = rnn_layers,
batch_first = True)
self.fc = nn.Linear(hidden_size, num_classes)
#initialize hidden state for RNN
def init_hidden(self, num_layers, batch_size):
return (torch.zeros(num_layers, batch_size, self.hidden_size).cuda(),
torch.zeros(num_layers, batch_size, self.hidden_size).cuda())
#inital hidden state made of zeros
def forward(self, inputs, hidden=None, steps=0):
length = len(inputs)
# print('inputs size')
# print(inputs[3].size())
#Input data: RNN should have 3 dimensions. (Batch Size, Sequence Length and Input Dimension (the number of expected features which is 201)
fs = torch.zeros(inputs[0].size(0), length, self.rnn.input_size).cuda()
for i in range(length):
f = inputs[i]
#flattens the tensor
f = f.view(f.size(0), -1)
#print(f.size())
f = self.fc_pre(f)
# stores the tensor f in the sha pe [fs.size(0), fs.size(2)] to each “row” in fs to prepare for learning the sequence in RNN
fs[:, i, :] = f
# print('fs')
# print(fs.size())
#outputs : batch size, seq legnth, hidden size
outputs, hidden = self.rnn(fs, hidden)
#print(outputs.size())
#the training code applies crossentropyloss as criterion which also applies softmax to the output so I don't have to use softmax here
outputs = self.fc(outputs)
#print(outputs)
#print(outputs.size())
return outputs
def _init_weights(self, module):
if isinstance(module, nn.Linear):
module.weight.data.uniform_(mean=0.0, std=1.0)
if module.bias is not None:
module.bias.data.zero_()
I have tried to define my own collate_fn like this in my main:
def my_collate(batch):
for classes, videos, frames in os.walk(traindir):
frames_list = pad_sequence(frames, batch_first=True, padding_value=0)
return frames_list
but I get this error:
return torch._C._nn.pad_sequence(sequences, batch_first, padding_value)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: received an empty list of sequences
Any insights or suggestions for me is appreciated. Thank you in advance