Performance difference between TensorFlow and PyTorch implementations

Recently, I decided to move from TensorFlow to PyTorch, so I tried to convert my TensorFlow CNN code to PyTorch and wrote a training loop following the tutorial. However, the model's performance drops dramatically: training accuracy stays below 50% after 15 epochs, while the TensorFlow version reaches almost 93%.
I would like to confirm whether my implementation is correct.

This is my model definition in both TensorFlow and PyTorch:

TensorFlow (with the fit method)

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv2D, BatchNormalization, Activation,
                                     MaxPooling2D, Flatten, Dropout, Dense)
from tensorflow.keras.models import Model

def TSNet(Chans = 19, Samples = 1000, output_classes = 4):
    
    input1   = Input(shape = (Chans, Samples, 1),name='input')
    block1       = Conv2D(32, (1, 20), strides = (1,1),input_shape = (Chans, Samples, 1), name='conv1')(input1)
    block1       = Conv2D(32, (3, 1),strides = (1,1),name='conv2')(block1)
    block1       = BatchNormalization(name='bn1')(block1)
    block1       = Activation('relu',name='relu1')(block1)
    block1       = MaxPooling2D((1, 5), strides = (1,2),name='pool1')(block1)
    
    block2      = Conv2D(64, (1, 20),name='conv3')(block1)
    block2      = BatchNormalization(name='bn2')(block2)
    block2      = Activation('relu',name='relu2')(block2)
    block2      = MaxPooling2D((1, 7), strides = (1,2),name='pool2')(block2)
    
    block3      = Conv2D(64, (1,10),strides = (1,1), name='conv4')(block2)
    block3      = BatchNormalization(name='bn3')(block3)
    block3      = Activation('relu',name='relu3')(block3)
    block3      = MaxPooling2D((1, 5), strides = (1,2),name='pool3')(block3)
    
    flatten     = Flatten(name='f1')(block3)
    dp1         = Dropout(0.5,name='drop1')(flatten)
    dense1      = Dense(32,name='ful1')(dp1)
    nrm         = BatchNormalization(name='bn7')(dense1)
    act         = Activation('relu',name='relu7')(nrm)
    
    dp2         = Dropout(0.3, name='drop3')(act)
    dense2      = Dense(output_classes,name='out')(dp2)
    out         = Activation('softmax',name='soft1_out')(dense2)
    
    return Model(inputs=input1, outputs=out)

# TRAIN LOOP------------------------------------------------------------------------------------
x, y = load()  # load 540 instances of shape 19x1000; labels are one-hot encoded
x = np.expand_dims(x, 3)  #change shape to 540x19x1000x1

classifier_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
classifier_loss = tf.keras.losses.CategoricalCrossentropy()
clf = TSNet(Chans = 19, Samples = 1000, output_classes = 4)
clf.compile(optimizer = classifier_optimizer, loss= classifier_loss , metrics=['accuracy'])
clf.fit(x, y,
        batch_size=8, 
        epochs = 15, 
        verbose = True)

PyTorch

import numpy as np
import torch
from math import floor

class PTNet(torch.nn.Module):
    def __init__(self, channels = 19, samples = 1000, outputs = 4):
        super(PTNet, self).__init__()
        #Sequential 1
        
        self.seq1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 1, out_channels = 32, 
                            kernel_size = (1,20), stride = 1),
            torch.nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3,1), stride = 1),
            torch.nn.BatchNorm2d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
        )
        #calculate the output sample length after each operation
        samples = (samples - 20) + 1
        samples = (samples - 1) + 1
        channels = channels - 3 + 1
        samples = floor((samples - 5) / 2 + 1)


        #Sequential 2
        self.seq2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (1,20)),
            torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99), #eps/momentum copied from the TensorFlow layer
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,7], stride = [1,2])
        )
        samples = (samples - 20) + 1
        samples = floor((samples- 7) / 2 + 1)


        #Sequential 3
        self.seq3 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = (1,10)),
            torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
        ) 
        samples = (samples - 10) + 1
        samples = floor((samples - 5) / 2 + 1)


        #fully connect
        self.fc = torch.nn.Sequential(
            torch.nn.Dropout1d(p = 0.5),
            #in_features = remaining channels * feature maps of the last conv * remaining samples
            torch.nn.Linear(in_features = channels * 64 * samples, out_features = 32),
            torch.nn.BatchNorm1d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.Dropout1d(p = 0.3),
            torch.nn.Linear(in_features = 32, out_features = outputs),
            torch.nn.Softmax()
        )

    def forward(self, x):
        x = self.seq1(x)
        x = self.seq2(x)
        x = self.seq3(x)
        x = torch.flatten(x, start_dim = 1, end_dim = -1)
        x = self.fc(x)

        return x

# TRAIN LOOP------------------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = PTNet()
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0.0001)
criterion = torch.nn.CrossEntropyLoss()
criterion.to(device)

x, y = load()  # load 540 instances of shape 19x1000; labels are one-hot encoded
x = np.expand_dims(x, 1) #Change shape to 540 x 1 x 19 x 1000 for PyTorch

train_dat    = torch.utils.data.TensorDataset(torch.tensor(x).to(device), torch.tensor(y).to(device))
train_loader = torch.utils.data.DataLoader(train_dat, batch_size = 8, shuffle = True)

for epoch in range(15):
    res = 0
    #train loop
    for i, data in enumerate(train_loader ,0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        #zero grad
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, predicted = outputs.max(1)
        _, gt = labels.max(1)
        
        res += predicted.eq(gt).sum().item()

    print(f'epoch {epoch + 1} | acc {100 * res / len(train_dat):.2f}')

The noticeable difference is the input shape: TensorFlow gets 540 x 19 x 1000 x 1 (channels-last), while PyTorch gets 540 x 1 x 19 x 1000 (channels-first).
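
For clarity, a minimal sketch of how the two layouts relate (assuming x is the 540 x 19 x 1000 array returned by load() above):

import numpy as np

x, y = load()                  # x: (540, 19, 1000)
x_tf = np.expand_dims(x, 3)    # (540, 19, 1000, 1), channels-last, fed to TensorFlow
x_pt = np.expand_dims(x, 1)    # (540, 1, 19, 1000), channels-first, fed to PyTorch

# both layouts hold the same values, only the axis order differs
assert np.array_equal(np.transpose(x_tf, (0, 3, 1, 2)), x_pt)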

I printed out the model summaries, and the output shapes of each layer are almost identical (except for the batch-normalization layers). I expected some performance difference, but not one this large (~40%).
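
For reference, a minimal sketch of such a shape check (assuming the PTNet class above), running a dummy batch through each block:

import torch

model = PTNet()
model.eval()                            # untrained BatchNorm stats are fine for a shape check
with torch.no_grad():
    t = torch.zeros(2, 1, 19, 1000)     # dummy channels-first batch
    for name in ['seq1', 'seq2', 'seq3']:
        t = getattr(model, name)(t)
        print(name, tuple(t.shape))     # seq1 (2, 32, 17, 489), seq2 (2, 64, 17, 232), seq3 (2, 64, 17, 110)
    t = torch.flatten(t, start_dim = 1)
    print('flatten', tuple(t.shape))    # (2, 119680), matching channels * 64 * samples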

The general architecture looks alright (at least I cannot spot anything obviously wrong). The issue I'm seeing is in the usage of the nn.Softmax layer, since nn.CrossEntropyLoss expects raw logits. Remove this layer and see if this improves the accuracy.
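
Roughly like this, assuming the PTNet definition above (only the final Sequential changes):

        self.fc = torch.nn.Sequential(
            torch.nn.Dropout1d(p = 0.5),
            torch.nn.Linear(in_features = channels * 64 * samples, out_features = 32),
            torch.nn.BatchNorm1d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.Dropout1d(p = 0.3),
            torch.nn.Linear(in_features = 32, out_features = outputs),
            #no Softmax here: nn.CrossEntropyLoss applies log_softmax internally,
            #so the model should return raw logits; use torch.softmax(logits, dim=1)
            #only when probabilities are needed at inference time
        )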


Thank you for your response. Although I removed nn.Softmax, the result still has not changed.