hey guys i wrote model to classification audio and use MFCC transform
but after model training i cant use model to predict class
i got this error:
x = F.relu(self.fc1(x.reshape(-1,x.shape[1] * x.shape[2]*x.shape[3])))
x = self.dropout5(x)
IndexError: tuple index out of range
I guess your x
input tensor doesn’t have 3 dimensions:
x = torch.randn(2, 3)
x.shape[0]
x.shape[1]
x.shape[2]
# IndexError: tuple index out of range
so check its shape and make sure you can index it.
i check the size
the size is : torch.size([1,16,3,8])
also i don’t have problem with model because i can train by data and can’t test the model by recording my own voice and take it to the model
This description sounds as if the error is in the test case which you might not have checked yet.
i use these code for the model
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class NN2D(nn.Module):
def __init__(self, num_class):
super(NN2D,self).__init__()
self.conv1 = nn.Conv2d(in_channels=1,out_channels=8,kernel_size=3,stride=1)
self.dropout1 = nn.Dropout(0.3)
self.conv2 = nn.Conv2d(in_channels=8,out_channels=16,kernel_size=3,stride=1)
self.dropout2 = nn.Dropout(0.3)
self.fc1 = nn.Linear(384, 256)
self.dropout5 = nn.Dropout(0.3)
self.fc2 = nn.Linear(256,128)
self.dropout6 = nn.Dropout(0.3)
self.fc3 = nn.Linear(128, num_class)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)),kernel_size=3)
x = self.dropout1(x)
x = F.max_pool2d(F.relu(self.conv2(x)),kernel_size=3)
x = self.dropout2(x)
#print(x.shape)
x = F.relu(self.fc1(x.reshape(-1,x.shape[1] * x.shape[2]*x.shape[3])))
x = self.dropout5(x)
x = F.relu(self.fc2(x))
x = self.dropout6(x)
x = self.fc3(x)
#print(x.shape)
return x
and also i use this for the data loader:
from torch.utils.data import DataLoader,random_split,Dataset
class SpeechDataLoader(Dataset):
def __init__(self,data,labels,list_dir,transform):
self.data = data
self.labels = labels
self.label_dict = list_dir
self.transform = transform
def __len__(self):
return len(self.data)
def __getitem__(self,idx):
waveform = self.data[idx]
waveform = self.transform(waveform)
if self.labels[idx] in self.label_dict:
out_labels = self.label_dict.index(self.labels[idx])
return waveform, out_labels
and its my train and test function:
import torch,os
from tqdm import tqdm
import torch.optim as optim
best_acc=0
def train(net,trainloader,optim,scheduler,criterion,epoch,device):
print("Training")
net.train()
train_loss = 0
total = 0
total_correct = 0
iterator = tqdm(trainloader)
for inputs,targets in iterator:
inputs,targets = inputs.to(device), targets.to(device)
optim.zero_grad()
outputs = net(inputs)
loss = criterion(outputs,targets)
loss.backward()
optim.step()
scheduler.step()
train_loss += loss.item()
_,predicted = torch.max(outputs.data,1)
total_correct += (predicted == targets).sum().item()
total += targets.size(0)
print("Epoch: [{}] loss: [{:.2f}] Accuracy [{:.2f}] ".format(epoch+1,train_loss/len(trainloader),
total_correct*100/total))
def test(net,testloader,optim,criterion,epoch,device,results_txt,model_name):
global best_acc
print("validation")
net.eval()
test_loss,total,total_correct = 0,0,0
iterator = tqdm(testloader)
for inputs, targets in iterator:
inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
test_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total += targets.size(0)
total_correct += (predicted == targets).sum().item()
# Save checkpoint when best model
acc = 100. * total_correct / total
print("\nValidation Epoch #%d\t\t\tLoss: %.4f Acc@1: %.2f%%" %(epoch+1, test_loss/len(testloader), acc))
f = open(results_txt+".txt","a+")
f.write("Validation Epoch #%d\t\t\tLoss: %.4f Acc@1: %.2f%% \n" %(epoch+1, test_loss/len(testloader), acc))
f.close()
if acc > best_acc:
if isinstance(net, torch.nn.DataParallel):
print("multiple GPU")
print('Saving Best model...\t\t\tTop1 = %.2f%%' %(acc))
state = {
'model':net.module.state_dict(),
'model1': net.state_dict(),
'model2': net,
'acc':acc,
'epoch':epoch,
}
else:
print("not multiple GPU")
state = {
'model':net,
'acc':acc,
'epoch':epoch,
}
if not os.path.isdir('checkpoint'):
os.mkdir('checkpoint')
save_point = './checkpoint/'
if not os.path.isdir(save_point):
os.mkdir(save_point)
torch.save(state, save_point+model_name+'.t7')
best_acc = acc
return best_acc
then i use this code to load wave and label
path_to_text = '/content/dataset_folder/'
dataset_generation_array = []
daataset = []
for n in os.listdir(path_to_text):
path1 = path_to_text+n+'/'
path_generate =glob.glob(path1+'*.WAV')+glob.glob(path1+'*.wav')
for b in path_generate:
dataset_generation_array.append(b)
random.shuffle(dataset_generation_array)
for i in dataset_generation_array:
make_label_with_wave = []
signalss, sample_raterr = torchaudio.load(i)
make_label_with_wave.append(signalss)
make_label_with_wave.append(sample_raterr)
make_label_with_wave.append(i[len(path_to_text):len(path_to_text)+1])
daataset.append(make_label_with_wave)
train_audio_path = '/content/dataset_folder/'
labels_dict=os.listdir(train_audio_path)
plt.plot(daataset[0][0].t())
plt.show()
print (labels_dict)
print (daataset[0][2])
Audio(daataset[0][0], rate = sample_raterr)
then use this to do zero padding to make all data same size
wave1 = []
labels = []
wave = []
max = 0
for i in daataset:
wave1.append(i[0])
labels.append(i[2])
for i in wave1:
if max < i.shape[1]:
max = i.shape[1]
#print (max)
for j in wave1:
wave.append(F.pad(input=j, pad=(max - j.shape[1], 0), mode='constant', value=0))
now select the model and transform and also data loader
train_audio_transforms = nn.Sequential(torchaudio.transforms.MFCC(log_mels=False))
net = NN2D(num_class=35)
dataset= SpeechDataLoader(wave,labels,labels_dict, train_audio_transforms)
traindata, testdata = random_split(dataset, [round(len(dataset)*.8), round(len(dataset)*.2)])
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1, shuffle=True)
testloader = torch.utils.data.DataLoader(testdata, batch_size=1, shuffle=True)
at the end use it to do train
num_epochs=15
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
result_file = 'results_txt'
net = net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=0.001)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
steps_per_epoch=int(len(trainloader)),
epochs=num_epochs,
anneal_strategy='linear')
for epoch in range(0, num_epochs):
train(net,trainloader,optimizer,scheduler,criterion,epoch,device)
best_acc = test(net,testloader,optimizer,criterion,epoch,device,result_file,'0')
until this step all things works good
but when i want to load the model and predict new data i get the error that i said.
i found the problem:
the normal shape of data is [16,3,8]
but batch make shape: [1,16,3,8] because batch size is: 1
so in test data need to use
inputs = inputs.unsqueeze(0)
to make shape [1,16,3,8] and solve the problem
thanks guys