NN returning only one label

Hello, I am tryig to create neural network for audio recognition. I want to recognize 6 types of speech. I have several problems maybe they could relate. I copied and a bit modified my nn from here. I want to put in neural network 8000 samples and get number from 0 to 5.

  1. Problem is that I cant put there array of size [64, 8000] and I need to put there [64, 1, 8000]. Why is this? I think that I undestand that 64 is my batch size and 8000 is number of samples but why i need 3rd dim?
  2. NN returns array (predicted) has size [64, 6] (64 is batch and 6 is full of zeros). Second dim should be only 1 and there should be number from 0 to 5 (my categories).

I am new in pytorch before that I used Matlab where it is from my view much easier.

# My model
class NeuralNetwork(nn.Module):
    def __init__(self, n_input=1, n_output=6, stride=16, n_channel=32): # nchalnnesl 32
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=2)
# My training
criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    n_iterations = int(len(train_dataset)/batch_size)
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (i+1) % 5 == 0:
                print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_iterations}, input {inputs.shape}')
correct = 0
    total = 0
    model = NeuralNetwork()
    model.load_state_dict(torch.load("./model.pth"))
    correct_pred = {classname: 0 for classname in v.LABELS}
    total_pred = {classname: 0 for classname in v.LABELS}
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            # calculate outputs by running images through the network
            outputs = model(inputs)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted[:, 0] == labels).sum().item()

            for label, predict in zip(labels, predicted[:,0]):
                if label == predict:
                    correct_pred[v.LABELS[label]] += 1
                total_pred[v.LABELS[label]] += 1

    print('Accuracy: %d %%' % (100 * correct / total))
    # print accuracy for each class
    for classname, correct_count in correct_pred.items():
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
                                                    accuracy))

Hi @martin.hajek1

Problem is that I cant put there array of size [64, 8000] and I need to put there [64, 1, 8000]. Why is this?

So your model uses nn.Conv1d layers that requires input_channel and output_channel arguments. 64 is the batch size, 8000 is the number of audio samples in your batch. The input channel is 1 as there is only one channel of audio. That’s why you need your input to be of size [64, 1, 8000].