As a dataset I am using npz files. One npz file represents one video and contains keypoints (x, y, confidence score). I am trying to build a 1D CNN that classifies each npz sample into one of the classes. I noticed that no matter what I try, I always get roughly the same result: the loss decreases only a little and the validation accuracy stays constant. When checking the outputs, I saw that in the end the network classifies all samples as class 2. I assume the reason is that class 2 has the largest number of samples.
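For context, one sample looks roughly like this (the key name and the 17 keypoints per frame are placeholders I use here; the relevant part is that every frame flattens to 51 values, which matches the number of input channels of the first conv layer):

import numpy as np

# rough sketch of one npz video sample (key name "keypoints" and 17 keypoints
# per frame are placeholders); each frame flattens to 17 * 3 = 51 features
sample = np.load("some_video.npz")
kps = sample["keypoints"]                  # assumed shape: (frames, 17, 3) = (x, y, confidence)
features = kps.reshape(kps.shape[0], -1)   # (frames, 51)
print(features.shape)                      # e.g. (45, 51)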
I tried several things to improve the model, but without success:
a) two different models (both contain 1D CNN layers, but have different receptive fields)
b) class weights in the cross-entropy loss, because my dataset is imbalanced (a sketch of how such weights can be computed follows after this list)
c) adding an LSTM
All of this led only to slight changes; all in all, my training loss never goes below 1.7.
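Regarding b), the weights are meant to be inverse class frequencies. A minimal sketch of that idea (the example label tensor is made up and does not necessarily reproduce the exact numbers further down):

import torch

# minimal sketch of inverse-frequency ("balanced") class weights for CrossEntropyLoss;
# the label tensor here is made up, in practice it is one label per npz file
train_labels = torch.tensor([2, 2, 1, 0, 2, 5, 2, 7, 3, 2])
num_classes = 8
counts = torch.bincount(train_labels, minlength=num_classes).float()
weights = counts.sum() / (num_classes * counts.clamp(min=1))  # rare classes get larger weights
print(weights)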
Can someone give me a tip as to what could be wrong here?
import torch
import torch.nn as nn
import torch.nn.functional as F

n_outputs = 8  # number of classes


class Dilated_blocks(nn.Module):
    def __init__(self, in_feat, out_feat, stride, dilation):
        super(Dilated_blocks, self).__init__()
        # strided dilated conv followed by a plain conv, both with kernel size 3
        self.dilated_conv = nn.Conv1d(in_feat, out_feat, kernel_size=3, stride=stride, dilation=dilation, padding=1)
        self.conv_transform = nn.Conv1d(out_feat, out_feat, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.dilated_conv(x)
        x = self.conv_transform(x)
        return x


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=51, out_channels=64, kernel_size=3)  # , dilation=2)
        self.batch1 = nn.BatchNorm1d(64)
        self.conv_block2 = Dilated_blocks(in_feat=64, out_feat=128, stride=2, dilation=2)
        self.batch2 = nn.BatchNorm1d(128)
        self.conv_block3 = Dilated_blocks(in_feat=128, out_feat=256, stride=2, dilation=2)
        self.batch3 = nn.BatchNorm1d(256)
        self.lstm_extractor = nn.LSTM(input_size=256, hidden_size=512, num_layers=10, dropout=0.2, batch_first=True)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.flat = nn.Flatten()
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, n_outputs)

    def forward(self, x):
        # x: (batch, 51, frames)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.batch1(x)
        x = self.conv_block2(x)
        x = self.relu(x)
        x = self.batch2(x)
        x = self.conv_block3(x)
        x = self.relu(x)
        x = self.batch3(x)
        out = x.permute(0, 2, 1)                  # (batch, time, channels) for the LSTM
        out, (ht, ct) = self.lstm_extractor(out)
        out = ht[-1]                              # final hidden state of the last LSTM layer: (batch, 512)
        out = self.flat(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return F.softmax(out, dim=1)
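As a sanity check on the shapes, a forward pass with a dummy batch goes through without errors (48 clips, 51 input features, 45 frames, matching the comments in the training loop below):

# dummy batch: 48 clips, 51 input features (17 keypoints * 3), 45 frames
model = Net()
dummy = torch.randn(48, 51, 45)
out = model(dummy)
print(out.shape)  # torch.Size([48, 8]): one probability vector per clip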
weights = torch.tensor([6.8684, 4.1429, 11.8636, 20.0769, 5.4375, 7.4571, 10.4400, 15.3529])
criterion = nn.CrossEntropyLoss(weight=weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
import torchmetrics

running_loss, accuracy = 0.0, 0.0  # accumulators, reset at the start of every epoch

for iterat, data in enumerate(train_dataloader):
    model.zero_grad()
    model.train()
    poses = data.type(torch.float).permute(0, 2, 1).to(device)  # shape [48, 51, 45] = [batch size, input features, sample size]
    labels = data.type(torch.long).to(device)                   # shape [48, 45] = [batch size, sample size]
    optimizer.zero_grad()

    # getting the labels into the right format for the loss function:
    # one label for each npz file in the batch
    most_frequent_values = torch.mode(labels, dim=1).values     # shape [batch_size]

    outputs = model(poses)                                      # shape [48, 8] = [batch size, num of classes]
    loss = criterion(outputs, most_frequent_values)
    running_loss += loss.item()
    loss.backward()
    optimizer.step()

    # accuracy
    acc = torchmetrics.functional.accuracy(outputs, most_frequent_values, task='multiclass', num_classes=8)
    accuracy += acc.item()
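And this is roughly how I see that everything collapses onto class 2: counting the predicted classes on the validation set (a rough sketch; val_dataloader is assumed to yield batches in the same format as train_dataloader):

# rough sketch: how often each class is predicted on the validation set
# (val_dataloader is assumed to yield batches in the same format as train_dataloader)
model.eval()
pred_counts = torch.zeros(8, dtype=torch.long)
with torch.no_grad():
    for data in val_dataloader:
        poses = data.type(torch.float).permute(0, 2, 1).to(device)
        preds = model(poses).argmax(dim=1)          # predicted class per clip
        pred_counts += torch.bincount(preds.cpu(), minlength=8)
print(pred_counts)  # one dominant entry means the model collapsed onto a single class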