Hi,
I have just ported my first model to PyTorch; it was originally written in TensorFlow/Keras. However, training seems to be taking about 4x longer in PyTorch. Any suggestions would be a great help:
import math
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Env2Acl(nn.Module):
    def __init__(self, input_length, n_class, sr):
        super(Env2Acl, self).__init__()
        self.input_length = input_length

        stride1 = 2
        stride2 = 2
        channels = 8
        k_size = (3, 3)
        n_frames = (sr / 1000) * 10  # number of frames per 10 ms
        self.filter_bank_pool_size = int(n_frames / (stride1 * stride2))
        self.pool_size = (2, 2)

        self.conv1, self.bn1 = ConvLayer(1, channels, (1, 9), (1, stride1)).get()
        self.conv2, self.bn2 = ConvLayer(channels, channels * 8, (1, 5), (1, stride2)).get()
        self.conv3, self.bn3 = ConvLayer(1, channels * 4, k_size, padding=1).get()
        self.conv4, self.bn4 = ConvLayer(channels * 4, channels * 8, k_size, padding=1).get()
        self.conv5, self.bn5 = ConvLayer(channels * 8, channels * 8, k_size, padding=1).get()
        self.conv6, self.bn6 = ConvLayer(channels * 8, channels * 16, k_size, padding=1).get()
        self.conv7, self.bn7 = ConvLayer(channels * 16, channels * 16, k_size, padding=1).get()
        self.conv8, self.bn8 = ConvLayer(channels * 16, channels * 32, k_size, padding=1).get()
        self.conv9, self.bn9 = ConvLayer(channels * 32, channels * 32, k_size, padding=1).get()
        self.conv10, self.bn10 = ConvLayer(channels * 32, channels * 64, k_size, padding=1).get()
        self.conv11, self.bn11 = ConvLayer(channels * 64, channels * 64, k_size, padding=1).get()
        self.conv12, self.bn12 = ConvLayer(channels * 64, n_class, (1, 1)).get()

        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, self.filter_bank_pool_size))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2))
        self.avgpool = nn.AvgPool2d(kernel_size=(2, 4))
        # Register dropout/flatten here so that net.eval() actually disables dropout;
        # an nn.Dropout constructed inside forward() stays permanently in training mode.
        self.dropout = nn.Dropout(0.2)
        self.flatten = nn.Flatten()
        self.fcn = nn.Linear(n_class, n_class)
        nn.init.kaiming_normal_(self.fcn.weight, nonlinearity='relu')
    def forward(self, x):
        # Start: filter bank
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.maxpool1(x)
        # End: filter bank

        # Swap axes so the learned filter-bank channels become the height dimension
        x = x.permute((0, 2, 1, 3))

        x = self.maxpool2(F.relu(self.bn3(self.conv3(x))))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.maxpool2(F.relu(self.bn5(self.conv5(x))))
        x = F.relu(self.bn6(self.conv6(x)))
        x = self.maxpool2(F.relu(self.bn7(self.conv7(x))))
        x = F.relu(self.bn8(self.conv8(x)))
        x = self.maxpool2(F.relu(self.bn9(self.conv9(x))))
        x = F.relu(self.bn10(self.conv10(x)))
        x = self.maxpool2(F.relu(self.bn11(self.conv11(x))))
        x = self.dropout(x)
        x = self.avgpool(F.relu(self.bn12(self.conv12(x))))
        x = self.flatten(x)
        # KLDivLoss expects log-probabilities as input, so return log_softmax
        # (argmax is unchanged, so the accuracy computation below still works)
        y = F.log_softmax(self.fcn(x), dim=1)
        return y
class ConvLayer:
    def __init__(self, in_channels, out_channels, kernel_size, stride=(1, 1), padding=0, bias=False):
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                              kernel_size=kernel_size, stride=stride, padding=padding, bias=bias)
        nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')
        self.bn = nn.BatchNorm2d(out_channels)

    def get(self):
        return self.conv, self.bn
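For reference, a minimal shape smoke test of the model above (the sizes are hypothetical, just to illustrate the expected input/output shapes, not my real config):

smoke_net = Env2Acl(input_length=66650, n_class=10, sr=44100)  # hypothetical sizes
dummy = torch.zeros(4, 1, 1, 66650)    # (batch, channels, height=1, samples)
print(smoke_net(dummy).shape)          # torch.Size([4, 10]) of log-probabilities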
net = model.GetModel()
lossFunc = torch.nn.KLDivLoss()
optimizer = optim.SGD(net.parameters(), lr=self.opt.LR, weight_decay=self.opt.weightDecay,
                      momentum=self.opt.momentum, nesterov=True)

testData = np.load(os.path.join(self.opt.data, self.opt.dataset,
                                'aug-data/sp-{}/{}.npz'.format(self.split, 'test/test4000')),
                   allow_pickle=True)
testX = torch.tensor(np.moveaxis(testData['x'], 3, 1))  # NHWC (Keras) -> NCHW (PyTorch)
testY = torch.tensor(testData['y'])
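The loss is the part of the port I am least sure about; a small sketch of the KLDivLoss contract as I understand it (input must be log-probabilities, target plain probabilities):

log_p = F.log_softmax(torch.randn(2, 5), dim=1)   # shaped like the model output
q = F.softmax(torch.randn(2, 5), dim=1)           # shaped like my (soft) targets
print(torch.nn.KLDivLoss(reduction='batchmean')(log_p, q))  # finite, non-negative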
for epochIdx in range(self.opt.nEpochs):
    epoch_start_time = time.time()
    optimizer.param_groups[0]['lr'] = self.__get_lr(epochIdx + 1)
    data = np.load(os.path.join(self.opt.data, self.opt.dataset,
                                'aug-data/sp-{}/{}.npz'.format(self.split, 'train/train{}'.format(epochIdx))),
                   allow_pickle=True)
    trainX = torch.tensor(np.moveaxis(data['x'], 3, 1))  # NHWC -> NCHW
    trainY = torch.tensor(data['y'])

    running_loss = 0.0
    running_acc = 0.0
    n_batches = math.ceil(len(trainX) / self.opt.batchSize)
    for batchIdx in range(n_batches):
        x = trainX[batchIdx * self.opt.batchSize : (batchIdx + 1) * self.opt.batchSize]
        y = trainY[batchIdx * self.opt.batchSize : (batchIdx + 1) * self.opt.batchSize]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x)
        running_acc += (outputs.detach().argmax(dim=1) == y.argmax(dim=1)).float().mean().item()
        loss = lossFunc(outputs, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    tr_loss = running_loss / n_batches
    # Epoch-wise validation
    net.eval()
    with torch.no_grad():
        y_pred = None
        batch_size = (self.opt.batchSize // self.opt.nCrops) * self.opt.nCrops
        for idx in range(math.ceil(len(testX) / batch_size)):
            x = testX[idx * batch_size : (idx + 1) * batch_size]
            scores = net(x)
            y_pred = scores.detach() if y_pred is None else torch.cat((y_pred, scores.detach()))
        val_acc, val_loss = self.__compute_accuracy(y_pred, testY)
    net.train()

    print('Epoch: {}/{} | Train: Loss {:.3f} | Val: Acc(top1) {:.3f}%'.format(
        epochIdx + 1, self.opt.nEpochs, tr_loss, val_acc))
    running_loss = 0
    running_acc = 0
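To narrow down where the extra time goes, I am planning to time the forward and backward passes separately inside the batch loop, roughly like this (a sketch; the synchronize calls only matter when running on a GPU):

def timed(fn, *args):
    # Wall-clock a single call; synchronize so pending GPU kernels are included
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.time()
    result = fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return result, time.time() - t0

outputs, t_fwd = timed(net, x)
loss = lossFunc(outputs, y)
_, t_bwd = timed(loss.backward)
print('forward: {:.4f}s | backward: {:.4f}s'.format(t_fwd, t_bwd))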
I have tried to paste the simplified code here. Can anybody help me identify where the bottleneck is that makes training take 4x longer? My understanding is that PyTorch should be at least as fast as Keras, perhaps up to ~2x faster, so a 4x slowdown suggests I am doing something wrong.
Kind Regards,
Mohaimen