Hi All,
I implemented a basic 4-layer fully connected NN using both pytorch and tensorflow. However, the tensorflow model is converging much faster and better when compared to the pytorch model even under same weight initialisations. The entire pipeline is identical (only difference in gradient computation). I would be grateful if anyone could help me in identifying the cause.
Pytorch Model
def denoise_loss_mse(denoise, clean):
loss = nn.MSELoss(reduction = 'mean')
return loss(denoise, clean)
def train(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val):
print("=============Training Start================")
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.00005, alpha=0.9)
current_step = 0
for epoch in range(10):
print("-" * 50 + str(epoch) + "-" * 50)
train_loss, current_step = __train_on_epoch(model, noiseEEG_train, EEG_train, optimizer, current_step)
valid_loss = __val_on_epoch(model, noiseEEG_val, EEG_val)
def __train_on_epoch(model, noiseEEG, EEG, optimizer, current_step):
model.train()
start = time.time()
batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
print(noiseEEG.shape, batch_num)
train_loss = 0
with tqdm(total=batch_num, position=0, leave=True) as pbar:
for n_batch in range(batch_num):
current_step += 1
if n_batch == batch_num:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
else:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
# if current_step % 300 == 0 and current_step > 0:
# print("-" * 50 + str(current_step) + "-" * 50)
# print(noiseEEG_batch.min(), noiseEEG_batch.max(), np.mean(noiseEEG_batch), np.std(noiseEEG_batch))
# print(EEG_batch.min(), EEG_batch.max(), np.mean(EEG_batch), np.std(EEG_batch))
noiseEEG_batch, EEG_batch = torch.FloatTensor(noiseEEG_batch), torch.FloatTensor(EEG_batch)
with torch.set_grad_enabled(True):
optimizer.zero_grad()
model(noiseEEG_batch)
denoiseout = model.predicted
mse_loss = denoise_loss_mse(denoiseout, EEG_batch)
loss = mse_loss
# if current_step % 300 == 0 and current_step > 0:
# print(f"Loss value: {loss.data}")
assert not torch.isnan(loss.data), "Loss is NaN"
loss.backward()
optimizer.step()
train_loss += loss.data / float(batch_num)
pbar.update()
pbar.close()
end = time.time()
print(f"Train loss: {train_loss}, time = {end-start}/per epoch")
return {"epoch_train_loss": train_loss}, current_step
def __val_on_epoch(model, noiseEEG, EEG):
model.eval()
batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
valid_loss = 0
with tqdm(total=batch_num, position=0, leave=True) as pbar:
for n_batch in range(batch_num):
if n_batch == batch_num:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
else:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
noiseEEG_batch, EEG_batch = torch.FloatTensor(noiseEEG_batch), torch.FloatTensor(EEG_batch)
with torch.no_grad():
model(noiseEEG_batch)
denoiseout = model.predicted
mse_loss = denoise_loss_mse(denoiseout, EEG_batch)
loss = mse_loss
assert not torch.isnan(loss.data), "Loss is NaN"
valid_loss += loss.data / float(batch_num)
pbar.update()
pbar.close()
print(f"Validation loss: {valid_loss}")
return {"epoch_valid_loss": valid_loss}
class FcNN(nn.Module):
def __init__(self):
super(FcNN, self).__init__()
self.fc1 = nn.Linear(512, 512)
self.fc2 = nn.Linear(512, 512)
self.fc3 = nn.Linear(512, 512)
self.fc4 = nn.Linear(512, 512)
self.dropout = nn.Dropout(0.3)
def forward(self, x):
hf = F.relu(self.fc1(x))
hf = self.dropout(hf)
hf = F.relu(self.fc2(hf))
hf = self.dropout(hf)
hf = F.relu(self.fc3(hf))
hf = self.dropout(hf)
self.predicted = self.fc4(hf)
model = FcNN()
train(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val)
Tensorflow Model
def denoise_loss_mse_tf(denoise, clean):
loss = tf.losses.mean_squared_error(denoise, clean)
return tf.reduce_mean(loss)
def test_step(model, noiseEEG_test, EEG_test):
denoiseoutput_test = model(noiseEEG_test)
loss = denoise_loss_mse_tf(EEG_test, denoiseoutput_test)
#loss_rrmset = denoise_loss_rrmset(denoiseoutput_test, EEG_test)
return denoiseoutput_test, loss
def train_step(model, noiseEEG_batch, EEG_batch, optimizer, batch_size, datanum):
mse_grads = 0
m_loss = 0
with tf.GradientTape() as loss_tape:
batch_size = noiseEEG_batch.shape[0]
noiseeeg_batch = tf.reshape(noiseEEG_batch, [batch_size,datanum])
eeg_batch=tf.reshape(EEG_batch, [batch_size,datanum,1])
denoiseoutput = model(noiseeeg_batch)
denoiseoutput = tf.reshape(denoiseoutput, [batch_size,datanum,1])
M_loss = denoise_loss_mse_tf(denoiseoutput,eeg_batch)
mse_grads = loss_tape.gradient(M_loss, model.trainable_variables)
optimizer.apply_gradients(zip(mse_grads, model.trainable_variables))
return M_loss, mse_grads[0]
def trainTF(model, noiseEEG, EEG, noiseEEG_val, EEG_val, epochs, batch_size, optimizer):
batch_num = math.ceil(noiseEEG.shape[0]/batch_size)
datanum = noiseEEG.shape[1]
current_step = 0
for epoch in range(10):
print("-" * 50 + str(epoch) + "-" * 50)
start = time.time()
# initialize loss value for every epoch
mse_grads , train_mse = 0, 0
with tqdm(total=batch_num, position=0, leave=True) as pbar:
for n_batch in range(batch_num):
if n_batch == batch_num:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch :] , EEG[batch_size*n_batch :]
else:
noiseEEG_batch,EEG_batch = noiseEEG[batch_size*n_batch : batch_size*(n_batch+1)] , EEG[batch_size*n_batch : batch_size*(n_batch+1)]
mse_loss_batch, mse_grads_batch = train_step(model, noiseEEG_batch, EEG_batch, optimizer, batch_size, datanum)
# convert variables to usable format
mse_grads_batch = tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(mse_grads_batch)))).numpy()
mse_loss_batch = mse_loss_batch.numpy()
# store history
train_mse += mse_loss_batch/float(batch_num)
mse_grads += mse_grads_batch/float(batch_num)
pbar.update()
pbar.close()
# calculate mse loss for validation set
#denoiseoutput, val_mse, loss_rrmset = test_step(model, noiseEEG_val, EEG_val)
denoiseoutput, val_mse = test_step(model, noiseEEG_val, EEG_val)
print ('Epoch #: {}/{}, Time taken: {} secs,\n Grads: mse= {},\n Losses: train_mse= {}, val_mse={}'\
.format(epoch+1,epochs,time.time()-start , mse_grads, train_mse, val_mse))
def fcNN(datanum):
model = tf.keras.Sequential()
model.add(Input(shape=(datanum,)))
model.add(layers.Dense(datanum, activation=tf.nn.relu))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(datanum))
model.add(layers.ReLU())
model.add(layers.Dropout(0.3))
model.add(layers.Dense(datanum))
model.add(layers.ReLU())
model.add(layers.Dropout(0.3))
model.add(layers.Dense(datanum))
model.summary()
return model
model = fcNN(512)
optimizer = tf.optimizers.RMSprop(lr=0.00005, rho=0.9)
trainTF(model, noiseEEG_train, EEG_train, noiseEEG_val, EEG_val, 10, 40, optimizer)
Loss Metric: MSE Loss
After 10 epochs, the tensorflow model is able to converge to training loss 0.056 and validation loss 0.076, where as the pytorch model is converging to training loss 0.137 and validation loss 0.119.
The colab notebook for the above experiment can be found here. The dataset can found here.