I'm also stuck with CTCLoss and I don't know how to fix it. Please help me point out the problem.
Problem: ASR
Questions:
- These results are not correct; how do I correctly update the parameters?
- How do I decode the CTC output to calculate accuracy (Acc)?
Data input:
X: AllMFCCs, shape (48840, 247, 20): samples × mfcc_len × features (the training batch size will be 16, 32, or more)
Y: char_vec, shape (48840, 30), values in the range 0 … 6
Ylen: char_length, shape (48840,), values in the range 1 … 30
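As I understand from the PyTorch docs, nn.CTCLoss expects the shapes below. This is only a minimal sketch with dummy tensors and placeholder sizes, not my real data:

import torch
import torch.nn as nn

T, N, C = 61, 16, 8    # input (time) steps, batch size, classes incl. blank at index 0 (placeholders)
S = 30                 # max target length (placeholder)
log_probs = torch.randn(T, N, C).log_softmax(-1)                   # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)            # labels 1..C-1, 0 reserved for blank
input_lengths = torch.full((N,), T, dtype=torch.long)              # per-sample input length
target_lengths = torch.randint(1, S + 1, (N,), dtype=torch.long)   # per-sample target length
loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)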
Define Net to train:
'''ResNet in PyTorch.
For Pre-activation ResNet, see 'preact_resnet.py'.
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=256*8):
        super(ResNet, self).__init__()
        Nsize = 32
        self.in_planes = Nsize
        self.conv1 = nn.Conv2d(1, Nsize, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(Nsize)
        self.layer1 = self._make_layer(block, Nsize, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 64, num_blocks[1], stride=1)
        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2)
        self.linear = nn.Linear(3840, num_classes)
        self.Smax = nn.Softmax(dim=-1)

    def _make_layer(self, block, planes, num_blocks, stride):
        # print('_make_layer:', block, planes, num_blocks, stride)
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))   # print(1, out.size())
        out = self.layer1(out)                  # print(2, out.size())
        out = self.layer2(out)                  # print(3, out.size())
        out = self.layer3(out)                  # print(4, out.size())
        out = self.layer4(out)                  # print(5, out.size())
        out = F.avg_pool2d(out, 4)              # print(6, out.size())
        out = out.view(out.size(0), -1)         # print(7, out.size())
        out = self.linear(out)                  # print(8, out.size())
        # reshape each sample from (num_classes,) into (num_classes/8, 8): one row per output step, 8 classes
        MFs = []
        for mf in out:
            mf = mf.view(-1, 8)
            MFs.append(mf)
        out = torch.stack(MFs)
        # out = out.view(out.size(0), -1, 8)  # Train 3: torch.Size([32, 50, 8])
        # print(out)
        out = self.Smax(out)
        return out
def ResNet18(): return ResNet(BasicBlock, [2,2,2,2])
def test():
    net = ResNet18()
    # bat = AllMFCCs[0:BatchSize]
    # bat out: torch.Size([32, 1, 247, 20])
    y = net(bat)
    print(y.size())
    # print(y)
    # print(net)

test()
import time;print(time.asctime())
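If I traced the shapes correctly, the net maps a (batch, 1, 247, 20) input to a (batch, 256, 8) output (256 output steps, 8 classes). A quick standalone check with a dummy tensor (random values only, not real MFCCs):

dummy = torch.randn(2, 1, 247, 20)   # (batch, channel, mfcc_len, features), random placeholder data
out = ResNet18()(dummy)
print(out.shape)                      # expected (if I read the code right): torch.Size([2, 256, 8])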
Code in the main training loop:
import torch.backends.cudnn as cudnn
from torch import nn, autograd, optim

ctc_loss = nn.CTCLoss(reduction='elementwise_mean')
net = ResNet18()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = optim.SGD(net.parameters(), lr=0.2, momentum=0.9, weight_decay=5e-4)
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True
print(time.asctime())
net.train()
train_loss = 0
correct = 0
total = 0
k=0
BatchSize=16
for batch_idx in range(0, len(AllMFCCs), BatchSize):
    Batch_Input = AllMFCCs[batch_idx:BatchSize+batch_idx]
    target_lengths = char_len[batch_idx:BatchSize+batch_idx]
    targets = char_vec[batch_idx:BatchSize+batch_idx]
    targets = targets + 1                     # shift labels by 1 (CTCLoss reserves index 0 for the blank by default)
    targets = torch.Tensor(targets).long()
    target_lengths = torch.Tensor(target_lengths).long()
    optimizer.zero_grad()
    Batch_Input1 = ConvertNpArray3D_2Tensor4D(Batch_Input)
    # bat inp: (32, 247, 20)
    # bat out: torch.Size([32, 1, 247, 20])
    Batch_Input1 = autograd.Variable(Batch_Input1)
    targets = autograd.Variable(targets)
    Batch_Input1, targets = Batch_Input1.to(device), targets.to(device)
    log_probs = net(Batch_Input1)
    log_probs = log_probs.detach().requires_grad_()
    log_probs = log_probs.transpose(1, 0)     # 500,32,8
    input_lengths = torch.full((log_probs.shape[1],), log_probs.shape[0], dtype=torch.long)
    input_lengths = autograd.Variable(input_lengths)
    target_lengths = autograd.Variable(target_lengths)
    loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
    print(k, 'loss:', loss.item(), train_loss)
    k += 1
    if k == 100: break
    # _, predicted = log_probs.max(1)
    # total += targets.size(0)
    # correct += predicted.eq(targets).sum().item()
    # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
    #     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    # print('\rTrain:', batch_idx, '/', len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
    #     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total), end=' ', flush=True)
print('Done!')
# train()
Results:
0 loss: -8.53072738647461 -8.53072738647461
1 loss: -8.66711711883545 -17.19784450531006
2 loss: -8.59152889251709 -25.78937339782715
3 loss: -8.418535232543945 -34.207908630371094
...........................
97 loss: -8.594472885131836 -840.0682668685913
98 loss: -8.93405532836914 -849.0023221969604
99 loss: -8.47213363647461 -857.4744558334351
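For question 2 (decoding to calculate Acc), would a greedy (best-path) decode be enough: take the argmax class per frame, collapse repeats, and drop blanks? A minimal sketch of what I mean, assuming the blank is index 0:

def greedy_ctc_decode(log_probs, blank=0):
    # log_probs: (T, N, C) -> list of label sequences, one per batch item
    best = log_probs.argmax(dim=-1).transpose(0, 1)   # (N, T): best class per frame
    decoded = []
    for seq in best:
        prev = blank
        labels = []
        for p in seq.tolist():
            if p != blank and p != prev:              # collapse repeats, drop blanks
                labels.append(p)
            prev = p
        decoded.append(labels)
    return decoded

The idea would then be to compare the decoded sequences against the (shifted) char_vec targets, e.g. with an edit distance, to get an accuracy figure, but I am not sure this is the proper way.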