Behavioral classifier using skeletal features

I am using Keypoint R-CNN to predict keypoints. From those keypoints I am able to build a skeleton for each detected object (please see the example in the image below). What I want to do now is collect the keypoints of a detected object over a number of frames, and then classify that sequence of frames/keypoints according to whether the pedestrian is going to cross the road or not. I hope my explanation makes sense.

I am thinking to feed in only the keypoints into the model without the corresponding images. I want the network to be dependent on just the keypoints for prediction. Would this be something that could work?


The issue I am having is how to feed the list of keypoints into the model. I have some experience working with image classification, but not with 3D inputs (i.e. the list of keypoints). Do I use torch.nn.Conv3d for this task?

Any advice would be greatly appreciated.

I made a dummy model just to play around with:

class Net(nn.Module):
    """Toy 3-D CNN followed by a three-layer MLP head with a single output.

    Every convolution and pooling kernel has depth 1, so the first
    non-channel dimension of the input is never mixed or shrunk by the
    convolutional part; only the last two dimensions are down-sampled.
    The flattened feature vector must contain exactly 320 values per
    sample, because that is the input width of ``fc1``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional part: conv -> batch norm -> pool -> conv -> pool.
        self.conv1 = nn.Conv3d(
            in_channels=1,
            out_channels=64,
            kernel_size=(1, 7, 7),
            stride=(1, 2, 2),
            padding=(0, 3, 3),
            bias=False,
        )
        self.bn1 = nn.BatchNorm3d(64, track_running_stats=True)
        self.relu = nn.ReLU(inplace=True)
        # The same pooling module is applied after each convolution.
        self.pool = nn.MaxPool3d(
            kernel_size=(1, 3, 3),
            stride=(1, 2, 2),
            padding=(0, 1, 1),
        )
        self.conv2 = nn.Conv3d(
            in_channels=64,
            out_channels=32,
            kernel_size=(1, 7, 7),
            stride=(1, 2, 2),
            padding=(0, 3, 3),
            bias=False,
        )
        # Fully connected head: 320 -> 120 -> 84 -> 1.
        self.fc1 = nn.Linear(320, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)

    def forward(self, x):
        """Return one raw score per sample, shaped ``(batch, 1)``."""
        # ReLU is applied both before and after batch normalisation.
        stem = self.bn1(F.relu(self.conv1(x)))
        stem = self.pool(self.relu(stem))

        feats = self.pool(F.relu(self.conv2(stem)))

        # Flatten everything except the batch dimension for the linear head.
        flat = feats.reshape(feats.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# Instantiate the model and move its parameters to the GPU.
net = Net().cuda()

import torch.optim as optim

# Cross-entropy loss, optimised with SGD plus momentum over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Two optimisation steps, each on the same stacked keypoint sequence.
for epoch in range(2): 

    running_loss = 0.0  # NOTE(review): assigned but never read inside this loop
    optimizer.zero_grad()
    # Stack the tensors in kypnts_list along a new dim 1, add a leading dim of
    # size 1, and move the result to the GPU.
    inputs = torch.stack(kypnts_list, dim=1).unsqueeze(0).to("cuda")
    outputs = net(inputs)
    # NOTE(review): the criterion receives `inputs`, not `outputs`, so the model's
    # prediction never reaches the loss. This is the call quoted in the traceback.
    loss = criterion(inputs.squeeze(0), intentions.cuda())
    loss.backward()
    optimizer.step()

But when I try to train the model, I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-184-78c208c097a2> in <module>
     45 
     46     outputs = net(inputs)
---> 47     loss = criterion(inputs.squeeze(0), intentions.cuda())
     48     loss.backward()
     49     optimizer.step()

~\miniconda3\envs\torch_env\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~\miniconda3\envs\torch_env\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
    914     def forward(self, input, target):
    915         return F.cross_entropy(input, target, weight=self.weight,
--> 916                                ignore_index=self.ignore_index, reduction=self.reduction)
    917 
    918 

~\miniconda3\envs\torch_env\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2019     if size_average is not None or reduce is not None:
   2020         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2021     return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
   2022 
   2023 

~\miniconda3\envs\torch_env\lib\site-packages\torch\nn\functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   1838         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   1839     elif dim == 4:
-> 1840         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   1841     else:
   1842         # dim == 3 or dim > 4

RuntimeError: 1only batches of spatial targets supported (non-empty 3D tensors) but got targets of size: : [1]

The label that I am feeding in here is 0, where 0 means the pedestrian will not cross and 1 means the pedestrian will cross.

For a binary classification use case with nn.CrossEntropyLoss the number of output values should be 2, so you would need to use self.fc3 = nn.Linear(84, 2).
Alternatively, you could keep the single output unit and use nn.BCEWithLogitsLoss as the criterion.

@ptrblck thank you very much for your suggestion. I will have a play around with it and see what sort of results I get.

I have tried to implement the suggestions that @ptrblck made in the previous post, however, the loss for my model is 0.00 from start to finish and doesn’t change at all during the training phase.

This is my model and training phase:

class Net(nn.Module):
    """Toy 3-D CNN followed by a three-layer MLP head with two outputs.

    Every convolution and pooling kernel has depth 1, so the first
    non-channel dimension of the input is never mixed or shrunk by the
    convolutional part; only the last two dimensions are down-sampled.
    The flattened feature vector must contain exactly 64000 values per
    sample, because that is the input width of ``fc1``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional part: conv -> batch norm -> pool -> conv -> pool.
        self.conv1 = nn.Conv3d(
            in_channels=1,
            out_channels=64,
            kernel_size=(1, 7, 7),
            stride=(1, 2, 2),
            padding=(0, 3, 3),
            bias=False,
        )
        self.bn1 = nn.BatchNorm3d(64, track_running_stats=True)
        self.relu = nn.ReLU(inplace=True)
        # The same pooling module is applied after each convolution.
        self.pool = nn.MaxPool3d(
            kernel_size=(1, 3, 3),
            stride=(1, 2, 2),
            padding=(0, 1, 1),
        )
        self.conv2 = nn.Conv3d(
            in_channels=64,
            out_channels=32,
            kernel_size=(1, 7, 7),
            stride=(1, 2, 2),
            padding=(0, 3, 3),
            bias=False,
        )
        # Fully connected head: 64000 -> 120 -> 84 -> 2 (one logit per class).
        self.fc1 = nn.Linear(64000, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        """Return the two class logits per sample, shaped ``(batch, 1, 2)``."""
        # ReLU is applied both before and after batch normalisation.
        stem = self.bn1(F.relu(self.conv1(x)))
        stem = self.pool(self.relu(stem))

        feats = self.pool(F.relu(self.conv2(stem)))

        # Flatten everything except the batch dimension for the linear head.
        flat = feats.reshape(feats.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)

        # A singleton dimension is inserted at index 1 of the result.
        return logits.unsqueeze(dim=1)
    
# Instantiate the model on the GPU, with cross-entropy loss and SGD + momentum.
net = Net().to("cuda")

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# The single training sample is identical on every epoch, so build it (and copy
# it to the GPU) once instead of repeating the work inside the loop.
# Average each entry of kypnts_list over its dim 0, then stack along a new dim 0.
frames = torch.stack([torch.mean(kp, 0) for kp in kypnts_list], dim=0)
# Append 300 zero slices at the end of the third-from-last dimension
# (the stacking dimension, assuming each averaged entry is 2-D — TODO confirm).
frames = F.pad(frames, pad=(0, 0, 0, 0, 0, 300))
# Add two leading singleton dimensions (batch and channel) for Conv3d.
x1 = frames.unsqueeze(dim=0).unsqueeze(1).to("cuda")

# CrossEntropyLoss expects integer class indices, hence the cast to long.
labels = torch.tensor(intentions_data[0]['intent'][0]).to("cuda").long()

for epoch in range(10):
    optimizer.zero_grad()

    outputs = net(x1)
    # The model output carries a singleton dim at index 1; drop it so the
    # criterion receives (batch, classes).
    loss = criterion(outputs.squeeze(dim=1), labels)
    loss.backward()
    optimizer.step()

    # One optimisation step per epoch: report the raw loss. Dividing by 2000 (an
    # averaging window for 2000 mini-batches) made any loss below ~1 print as 0.000,
    # and the mini-batch index that was printed does not exist in this loop.
    print('[%d] loss: %.3f' % (epoch + 1, loss.item()))

print('Finished Training')

I have a feeling it’s due to my generic model that I made, but I thought I would see some attempt at the model “learning” (i.e. some changes in loss values)

I just noticed my mistake: I had left self.fc3 = nn.Linear(84, 1) in the model and forgot to change it to self.fc3 = nn.Linear(84, 2). With that fixed, it seems to be training now.