Difference in convergence between pytorch and tensorflow models

Hi All,

I implemented a basic 4-layer fully connected NN using both pytorch and tensorflow. However, the tensorflow model is converging much faster and better when compared to the pytorch model even under same weight initialisations. The entire pipeline is identical (only difference in gradient computation). I would be grateful if anyone could help me in identifying the cause.

Pytorch Model

def denoise_loss_mse(denoise, clean):    
    loss = nn.MSELoss(reduction = 'mean')
    return loss(denoise, clean)

def train(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val):
        print("=============Training Start================")
        optimizer = torch.optim.RMSprop(model.parameters(), lr=0.00005, alpha=0.9)
        current_step = 0
        for epoch in range(10):
            print("-" * 50 + str(epoch) + "-" * 50)
            train_loss, current_step = __train_on_epoch(model, noiseEEG_train, EEG_train, optimizer, current_step)
            valid_loss = __val_on_epoch(model, noiseEEG_val, EEG_val)

def __train_on_epoch(model, noiseEEG, EEG, optimizer, current_step):
        model.train()
        start = time.time()
        batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
        print(noiseEEG.shape, batch_num)
        train_loss = 0
        with tqdm(total=batch_num, position=0, leave=True) as pbar:
            for n_batch in range(batch_num):
                current_step += 1
                if n_batch == batch_num:
                    noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
                else:
                    noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
                
                # if current_step % 300 == 0 and current_step > 0:
                #     print("-" * 50 + str(current_step) + "-" * 50)
                #     print(noiseEEG_batch.min(), noiseEEG_batch.max(), np.mean(noiseEEG_batch), np.std(noiseEEG_batch))
                #     print(EEG_batch.min(), EEG_batch.max(), np.mean(EEG_batch), np.std(EEG_batch))

                noiseEEG_batch, EEG_batch = torch.FloatTensor(noiseEEG_batch), torch.FloatTensor(EEG_batch)
                with torch.set_grad_enabled(True):
                    optimizer.zero_grad()
                    model(noiseEEG_batch)
                    denoiseout = model.predicted
                    mse_loss = denoise_loss_mse(denoiseout, EEG_batch)
                    loss = mse_loss
                    # if current_step % 300 == 0 and current_step > 0:
                    #     print(f"Loss value: {loss.data}")
                    assert not torch.isnan(loss.data), "Loss is NaN"
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.data / float(batch_num)

                    pbar.update()
            pbar.close()
        end = time.time()
        print(f"Train loss: {train_loss}, time = {end-start}/per epoch")
        return {"epoch_train_loss": train_loss}, current_step

def __val_on_epoch(model, noiseEEG, EEG):
    model.eval()
    batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
    valid_loss = 0
    with tqdm(total=batch_num, position=0, leave=True) as pbar:
        for n_batch in range(batch_num):
            if n_batch == batch_num:
                noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
            else:
                noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
            noiseEEG_batch, EEG_batch = torch.FloatTensor(noiseEEG_batch), torch.FloatTensor(EEG_batch)
            with torch.no_grad():
                model(noiseEEG_batch)
                denoiseout = model.predicted
                mse_loss = denoise_loss_mse(denoiseout, EEG_batch)
                loss = mse_loss 
                assert not torch.isnan(loss.data), "Loss is NaN"

                valid_loss += loss.data / float(batch_num)

                pbar.update()
        pbar.close()
    print(f"Validation loss: {valid_loss}")
    return {"epoch_valid_loss": valid_loss}

class FcNN(nn.Module):
    def __init__(self):
        super(FcNN, self).__init__()
        self.fc1 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 512)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        hf = F.relu(self.fc1(x))
        hf = self.dropout(hf)

        hf = F.relu(self.fc2(hf))
        hf = self.dropout(hf)

        hf = F.relu(self.fc3(hf))
        hf = self.dropout(hf)

        self.predicted = self.fc4(hf)
    
model = FcNN()
train(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val)

Tensorflow Model

def denoise_loss_mse_tf(denoise, clean):      
  loss = tf.losses.mean_squared_error(denoise, clean)
  return tf.reduce_mean(loss)

def test_step(model, noiseEEG_test, EEG_test):

  denoiseoutput_test = model(noiseEEG_test)
  loss = denoise_loss_mse_tf(EEG_test, denoiseoutput_test)
  #loss_rrmset = denoise_loss_rrmset(denoiseoutput_test, EEG_test)

  return denoiseoutput_test, loss

def train_step(model, noiseEEG_batch, EEG_batch, optimizer, batch_size, datanum):

    mse_grads = 0
    m_loss = 0 
    with tf.GradientTape() as loss_tape:
        batch_size = noiseEEG_batch.shape[0]
        noiseeeg_batch = tf.reshape(noiseEEG_batch, [batch_size,datanum])
        eeg_batch=tf.reshape(EEG_batch, [batch_size,datanum,1])
        denoiseoutput = model(noiseeeg_batch)
        denoiseoutput = tf.reshape(denoiseoutput, [batch_size,datanum,1]) 
        M_loss = denoise_loss_mse_tf(denoiseoutput,eeg_batch)
        mse_grads = loss_tape.gradient(M_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(mse_grads, model.trainable_variables))

    return  M_loss,  mse_grads[0]

def trainTF(model, noiseEEG, EEG, noiseEEG_val, EEG_val, epochs, batch_size, optimizer):

    batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
    datanum = noiseEEG.shape[1]
    current_step = 0
    for epoch in range(10):
        print("-" * 50 + str(epoch) + "-" * 50)
        start = time.time()

        # initialize  loss value for every epoch
        mse_grads , train_mse = 0, 0

        with tqdm(total=batch_num, position=0, leave=True) as pbar:
    
            for n_batch in range(batch_num):

                if n_batch == batch_num:
                    noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
                else:
                    noiseEEG_batch,EEG_batch =  noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
                
                mse_loss_batch, mse_grads_batch = train_step(model, noiseEEG_batch, EEG_batch, optimizer, batch_size, datanum)

                # convert variables to usable format
                mse_grads_batch = tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(mse_grads_batch)))).numpy()
                mse_loss_batch = mse_loss_batch.numpy()

                # store history 
                train_mse += mse_loss_batch/float(batch_num)
                mse_grads += mse_grads_batch/float(batch_num)

                pbar.update()
            pbar.close()

        # calculate mse loss for validation set
        #denoiseoutput, val_mse, loss_rrmset = test_step(model, noiseEEG_val, EEG_val)
        denoiseoutput, val_mse = test_step(model, noiseEEG_val, EEG_val)

        print ('Epoch #: {}/{}, Time taken: {} secs,\n Grads: mse= {},\n Losses: train_mse= {}, val_mse={}'\
                     .format(epoch+1,epochs,time.time()-start , mse_grads,  train_mse, val_mse))
def fcNN(datanum):

    model = tf.keras.Sequential()
    model.add(Input(shape=(datanum,)))
    model.add(layers.Dense(datanum, activation=tf.nn.relu))
    model.add(layers.Dropout(0.3))



    model.add(layers.Dense(datanum))
    model.add(layers.ReLU())
    model.add(layers.Dropout(0.3))


    model.add(layers.Dense(datanum))
    model.add(layers.ReLU())
    model.add(layers.Dropout(0.3))


    model.add(layers.Dense(datanum))
    model.summary()
    return model


model = fcNN(512)
optimizer = tf.optimizers.RMSprop(lr=0.00005, rho=0.9)
trainTF(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val, 10, 40, optimizer)

Loss Metric: MSE Loss
After 10 epochs, the tensorflow model is able to converge to training loss 0.056 and validation loss 0.076, where as the pytorch model is converging to training loss 0.137 and validation loss 0.119.

The colab notebook for the above experiment can be found here. The dataset can found here.