Performance difference between TensorFlow and PyTorch implementations

Recently, I decided to move from TensorFlow to PyTorch, so I tried to convert my TensorFlow CNN code to PyTorch and wrote a training loop following the tutorial. However, the model's performance drops dramatically: training accuracy stays below 50% after 15 epochs, while the TensorFlow version reaches almost 93%.
I would like to confirm whether my implementation is correct.

This is my model definition in both TensorFlow and PyTorch:

TensorFlow (with the fit method)

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv2D, BatchNormalization, Activation,
                                     MaxPooling2D, Flatten, Dropout, Dense)
from tensorflow.keras.models import Model

def TSNet(Chans = 19, Samples = 1000, output_classes = 4):
    
    input1   = Input(shape = (Chans, Samples, 1),name='input')
    block1       = Conv2D(32, (1, 20), strides = (1,1),input_shape = (Chans, Samples, 1), name='conv1')(input1)
    block1       = Conv2D(32, (3, 1),strides = (1,1),name='conv2')(block1)
    block1       = BatchNormalization(name='bn1')(block1)
    block1       = Activation('relu',name='relu1')(block1)
    block1       = MaxPooling2D((1, 5), strides = (1,2),name='pool1')(block1)
    
    block2      = Conv2D(64, (1, 20),name='conv3')(block1)
    block2      = BatchNormalization(name='bn2')(block2)
    block2      = Activation('relu',name='relu2')(block2)
    block2      = MaxPooling2D((1, 7), strides = (1,2),name='pool2')(block2)
    
    block3      = Conv2D(64, (1,10),strides = (1,1), name='conv4')(block2)
    block3      = BatchNormalization(name='bn3')(block3)
    block3      = Activation('relu',name='relu3')(block3)
    block3      = MaxPooling2D((1, 5), strides = (1,2),name='pool3')(block3)
    
    flatten     = Flatten(name='f1')(block3)
    dp1         = Dropout(0.5,name='drop1')(flatten)
    dense1      = Dense(32,name='ful1')(dp1)
    nrm         = BatchNormalization(name='bn7')(dense1)
    act         = Activation('relu',name='relu7')(nrm)
    
    dp2         = Dropout(0.3, name='drop3')(act)
    dense2      = Dense(output_classes,name='out')(dp2)
    out         = Activation('softmax',name='soft1_out')(dense2)
    
    return Model(inputs=input1, outputs=out)

# TRAIN LOOP------------------------------------------------------------------------------------
x, y = load()  # load 540 instances of shape 19x1000; labels are one-hot encoded
x = np.expand_dims(x, 3)  #change shape to 540x19x1000x1

classifier_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
classifier_loss = tf.keras.losses.CategoricalCrossentropy()
clf = TSNet(Chans = 19, Samples = 1000, output_classes = 4)
clf.compile(optimizer = classifier_optimizer, loss= classifier_loss , metrics=['accuracy'])
clf.fit(x, y,
        batch_size=8, 
        epochs = 15, 
        verbose = True)

PyTorch

import numpy as np
import torch
from math import floor

class PTNet(torch.nn.Module):
    def __init__(self, channels = 19, samples = 1000, outputs = 4):
        super(PTNet, self).__init__()
        #Sequential 1
        
        self.seq1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 1, out_channels = 32, 
                            kernel_size = (1,20), stride = 1),
            torch.nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3,1), stride = 1),
            torch.nn.BatchNorm2d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
        )
        #calculate the output sample length after each operation
        samples = (samples - 20) + 1
        samples = (samples - 1) + 1
        channels = channels - 3 + 1
        samples = floor((samples - 5) / 2 + 1)


        #Sequential 2
        self.seq2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (1,20)),
            torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99), #eps/momentum copied from the TensorFlow layer
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,7], stride = [1,2])
        )
        samples = (samples - 20) + 1
        samples = floor((samples- 7) / 2 + 1)


        #Sequential 3
        self.seq3 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = (1,10)),
            torch.nn.BatchNorm2d(64, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size = [1,5], stride = [1,2])
        ) 
        samples = (samples - 10) + 1
        samples = floor((samples - 5) / 2 + 1)


        #fully connect
        self.fc = torch.nn.Sequential(
            torch.nn.Dropout1d(p = 0.5),
            #in_features = remaining channels * feature maps of the last conv * remaining samples
            torch.nn.Linear(in_features = channels * 64 * samples, out_features = 32),
            torch.nn.BatchNorm1d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.Dropout1d(p = 0.3),
            torch.nn.Linear(in_features = 32, out_features = outputs),
            torch.nn.Softmax()
        )

    def forward(self, x):
        x = self.seq1(x)
        x = self.seq2(x)
        x = self.seq3(x)
        x = torch.flatten(x, start_dim = 1, end_dim = -1)
        x = self.fc(x)

        return x

# TRAIN LOOP------------------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = PTNet()
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 0.0001)
criterion = torch.nn.CrossEntropyLoss()
criterion.to(device)

x, y = load()  # load 540 instances of shape 19x1000; labels are one-hot encoded
x = np.expand_dims(x, 1) #Change shape to 540 x 1 x 19 x 1000 for PyTorch

train_dat    = torch.utils.data.TensorDataset(torch.tensor(x).to(device), torch.tensor(y).to(device))
train_loader = torch.utils.data.DataLoader(train_dat, batch_size = 8, shuffle = True)

for epoch in range(15):
    res = 0
    #train loop
    for i, data in enumerate(train_loader ,0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        #zero grad
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, predicted = outputs.max(1)
        _, gt = labels.max(1)
        
        res += predicted.eq(gt).sum().item()

    print(f'epoch {epoch + 1} | acc {100 * res / len(train_dat):.2f}')

The noticeable difference is the input shape: TensorFlow gets 540 x 19 x 1000 x 1 (channels-last), while PyTorch gets 540 x 1 x 19 x 1000 (channels-first).
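
For clarity, a minimal sketch of how the two layouts relate (assuming x is the 540 x 19 x 1000 array returned by load() above):

import numpy as np

x, y = load()                  # x: (540, 19, 1000)
x_tf = np.expand_dims(x, 3)    # (540, 19, 1000, 1), channels-last, fed to TensorFlow
x_pt = np.expand_dims(x, 1)    # (540, 1, 19, 1000), channels-first, fed to PyTorch

# both layouts hold the same values, only the axis order differs
assert np.array_equal(np.transpose(x_tf, (0, 3, 1, 2)), x_pt)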

I printed out the model summaries, and the output shapes of each layer are almost identical (except for the batch-normalization layers). I expected some performance difference, but not one this large (~40%).
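
For reference, a minimal sketch of such a shape check (assuming the PTNet class above), running a dummy batch through each block:

import torch

model = PTNet()
model.eval()                            # untrained BatchNorm stats are fine for a shape check
with torch.no_grad():
    t = torch.zeros(2, 1, 19, 1000)     # dummy channels-first batch
    for name in ['seq1', 'seq2', 'seq3']:
        t = getattr(model, name)(t)
        print(name, tuple(t.shape))     # seq1 (2, 32, 17, 489), seq2 (2, 64, 17, 232), seq3 (2, 64, 17, 110)
    t = torch.flatten(t, start_dim = 1)
    print('flatten', tuple(t.shape))    # (2, 119680), matching channels * 64 * samples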

The general architecture looks alright (at least I cannot spot anything obviously wrong). The issue I'm seeing is in the usage of the nn.Softmax layer, since nn.CrossEntropyLoss expects raw logits. Remove this layer and see if this improves the accuracy.
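
Roughly like this, assuming the PTNet definition above (only the final Sequential changes):

        self.fc = torch.nn.Sequential(
            torch.nn.Dropout1d(p = 0.5),
            torch.nn.Linear(in_features = channels * 64 * samples, out_features = 32),
            torch.nn.BatchNorm1d(32, eps = 0.001, momentum = 0.99),
            torch.nn.ReLU(),
            torch.nn.Dropout1d(p = 0.3),
            torch.nn.Linear(in_features = 32, out_features = outputs),
            #no Softmax here: nn.CrossEntropyLoss applies log_softmax internally,
            #so the model should return raw logits; use torch.softmax(logits, dim=1)
            #only when probabilities are needed at inference time
        )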


Thank you for your response. Although I removed nn.Softmax, the result still has not changed.