I have been trying to replicate in PyTorch a model that I built in TensorFlow/Keras. I saw that the performance worsened a lot after training the model with my PyTorch implementation. So I tried replicating a simpler model and figured out that the problem depends on the optimizer I use: I get different results with Adam (and some of the other optimizers I have tried), but the same results with SGD. Can someone help me fix this?

Below is the code showing that the results are the same for SGD (except for rounding errors) but differ for Adam:

import torch

from torch import nn

from torch.nn import Dropout, SiLU, Linear

import torch.optim as optim

import numpy as np

import tensorflow as tf

from tensorflow import keras

from tensorflow.keras import regularizers

class NN_torch(nn.Module):
    """Small feed-forward network: Linear -> SiLU -> Linear.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
        nodes: Number of units in the hidden layer.
    """

    def __init__(self, input_dim, output_dim, nodes):
        super().__init__()
        self.dense_0 = Linear(input_dim, nodes)
        self.swish_0 = SiLU()
        self.out = Linear(nodes, output_dim)

    def forward(self, x):
        """Apply the hidden layer, the SiLU activation and the output layer to ``x``."""
        out = self.dense_0(x)
        out = self.swish_0(out)
        out = self.out(out)
        return out

def init_keras_model(nodes, optimizer):
    """Build and compile a Keras model: Dense -> swish -> Dense(1).

    Args:
        nodes: Number of units in the hidden Dense layer.
        optimizer: Keras optimizer instance handed to ``model.compile``.

    Returns:
        The compiled ``keras.Sequential`` model, using a mean-squared-error
        loss and a single linear output unit.
    """
    model = keras.Sequential()
    model.add(keras.layers.Dense(nodes, activation="linear"))
    model.add(keras.layers.Activation(tf.keras.activations.swish))
    model.add(keras.layers.Dense(1, activation="linear"))
    model.compile(loss="mse", optimizer=optimizer)
    return model

def compare_keras_torch(Adam=True, lr=0.01):
    """Run one identical training step in Keras and PyTorch and print the prediction gap.

    Both models start from the same weights and see the same full batch once.
    The element-wise difference between their predictions is printed before
    and after the optimizer step.

    Args:
        Adam: If true, both frameworks use Adam; otherwise both use SGD.
        lr: Learning rate passed to both optimizers.
    """
    # Set seeds and initialize data
    torch.manual_seed(42)
    np.random.seed(42)
    # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the dtype it aliased.
    x_sample = np.array(np.random.uniform(size=[10, 10]), dtype=np.float64)
    y_sample = np.array(np.random.uniform(size=[10, 1]), dtype=np.float64)

    # One hidden width for both models, so they match by construction rather
    # than because the sample count happens to equal the layer size.
    hidden_nodes = 10
    # Keras' Adam defaults to epsilon=1e-7 while PyTorch's defaults to eps=1e-8.
    # Pin the same value on both sides so the update rules are identical.
    adam_epsilon = 1e-7

    # Initialize models
    model_torch = NN_torch(x_sample.shape[1], 1, hidden_nodes)
    if Adam:
        optimizer_keras = keras.optimizers.Adam(lr, epsilon=adam_epsilon)
        optimizer_torch = optim.Adam(
            model_torch.parameters(), lr=lr, eps=adam_epsilon
        )
    else:
        optimizer_keras = keras.optimizers.SGD(lr)
        optimizer_torch = optim.SGD(model_torch.parameters(), lr=lr)
    model_keras = init_keras_model(hidden_nodes, optimizer_keras)
    model_keras.build(input_shape=[None, x_sample.shape[1]])

    # Set same weights (torch stores Linear weights as (out, in); Keras Dense
    # kernels are (in, out), hence the transpose)
    weight_0 = np.transpose(model_torch.dense_0.weight.detach().numpy())
    bias_0 = np.transpose(model_torch.dense_0.bias.detach().numpy())
    model_keras.get_layer(index=0).set_weights([weight_0, bias_0])
    weight_1 = np.transpose(model_torch.out.weight.detach().numpy())
    bias_1 = np.transpose(model_torch.out.bias.detach().numpy())
    model_keras.get_layer(index=2).set_weights([weight_1, bias_1])

    # Confirm models create same output
    keras_pred = model_keras.predict(x_sample)
    torch_pred = model_torch(torch.tensor(x_sample).float())
    print((keras_pred - torch_pred.detach().numpy()))

    # One epoch with same data (a single full-batch step in each framework)
    model_keras.fit(
        x_sample,
        y_sample,
        batch_size=x_sample.shape[0],
        shuffle=False,
        epochs=1,
    )
    criterion = nn.MSELoss()
    outputs = model_torch(torch.tensor(x_sample).float())
    loss = criterion(outputs, torch.tensor(y_sample.reshape(-1, 1)).float())
    loss.backward()
    optimizer_torch.step()
    optimizer_torch.zero_grad()

    # Confirm models create same output
    keras_pred = model_keras.predict(x_sample)
    torch_pred = model_torch(torch.tensor(x_sample).float())
    print((keras_pred - torch_pred.detach().numpy()))

# Run the comparison first with Adam, then with SGD, at the same learning rate.
for use_adam in (True, False):
    compare_keras_torch(Adam=use_adam, lr=0.1)