Hello, I am tryig to create neural network for audio recognition. I want to recognize 6 types of speech. I have several problems maybe they could relate. I copied and a bit modified my nn from here. I want to put in neural network 8000 samples and get number from 0 to 5.
- Problem is that I cant put there array of size [64, 8000] and I need to put there [64, 1, 8000]. Why is this? I think that I undestand that 64 is my batch size and 8000 is number of samples but why i need 3rd dim?
- NN returns array (predicted) has size [64, 6] (64 is batch and 6 is full of zeros). Second dim should be only 1 and there should be number from 0 to 5 (my categories).
I am new in pytorch before that I used Matlab where it is from my view much easier.
# My model
class NeuralNetwork(nn.Module):
def __init__(self, n_input=1, n_output=6, stride=16, n_channel=32): # nchalnnesl 32
super().__init__()
self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
self.bn1 = nn.BatchNorm1d(n_channel)
self.pool1 = nn.MaxPool1d(4)
self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
self.bn2 = nn.BatchNorm1d(n_channel)
self.pool2 = nn.MaxPool1d(4)
self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
self.bn3 = nn.BatchNorm1d(2 * n_channel)
self.pool3 = nn.MaxPool1d(4)
self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
self.bn4 = nn.BatchNorm1d(2 * n_channel)
self.pool4 = nn.MaxPool1d(4)
self.fc1 = nn.Linear(2 * n_channel, n_output)
def forward(self, x):
x = self.conv1(x)
x = F.relu(self.bn1(x))
x = self.pool1(x)
x = self.conv2(x)
x = F.relu(self.bn2(x))
x = self.pool2(x)
x = self.conv3(x)
x = F.relu(self.bn3(x))
x = self.pool3(x)
x = self.conv4(x)
x = F.relu(self.bn4(x))
x = self.pool4(x)
x = F.avg_pool1d(x, x.shape[-1])
x = x.permute(0, 2, 1)
x = self.fc1(x)
return F.log_softmax(x, dim=2)
# My training
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
n_iterations = int(len(train_dataset)/batch_size)
for epoch in range(num_epochs):
running_loss = 0.0
for i, (inputs, labels) in enumerate(train_loader):
inputs, labels = inputs.cuda(), labels.cuda()
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs.squeeze(), labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
if (i+1) % 5 == 0:
print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_iterations}, input {inputs.shape}')
correct = 0
total = 0
model = NeuralNetwork()
model.load_state_dict(torch.load("./model.pth"))
correct_pred = {classname: 0 for classname in v.LABELS}
total_pred = {classname: 0 for classname in v.LABELS}
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
for data in test_loader:
inputs, labels = data
# calculate outputs by running images through the network
outputs = model(inputs)
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted[:, 0] == labels).sum().item()
for label, predict in zip(labels, predicted[:,0]):
if label == predict:
correct_pred[v.LABELS[label]] += 1
total_pred[v.LABELS[label]] += 1
print('Accuracy: %d %%' % (100 * correct / total))
# print accuracy for each class
for classname, correct_count in correct_pred.items():
accuracy = 100 * float(correct_count) / total_pred[classname]
print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
accuracy))