Keras vs Torch implementation. Same results for SGD, different results for Adam

I have been trying to replicate in PyTorch a model that I built in TensorFlow/Keras. The performance worsened a lot after training the model with my PyTorch implementation. So I tried replicating a simpler model and found that the problem depends on the optimizer: I get different results when using Adam (and some of the other optimizers I have tried), but the same results for SGD. Can someone help me fix this?

Below is the code showing that the results are the same for SGD (except for rounding errors) but differ for Adam:

import torch
from torch import nn
from torch.nn import Dropout, SiLU, Linear
import torch.optim as optim
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

class NN_torch(nn.Module):
    """Single-hidden-layer network: Linear -> SiLU (swish) -> Linear.

    Args:
        input_dim: Number of input features of the first linear layer.
        output_dim: Number of output features of the last linear layer.
        nodes: Width of the hidden layer.
    """

    def __init__(self, input_dim, output_dim, nodes):
        # The constructor must be the dunder ``__init__``; a plain ``init``
        # method is never called, so no layers would be registered.
        super().__init__()
        self.dense_0 = Linear(input_dim, nodes)
        self.swish_0 = SiLU()
        self.out = Linear(nodes, output_dim)

    def forward(self, x):
        """Apply the hidden layer, the SiLU activation and the output layer.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor whose last dimension equals ``output_dim``.
        """
        out = self.dense_0(x)
        out = self.swish_0(out)
        out = self.out(out)
        return out

def init_keras_model(nodes, optimizer):
    """Build and compile a small Keras regression model.

    The model is a ``Sequential`` stack of three layers, in this order:
    a linear ``Dense`` layer with ``nodes`` units (index 0), a swish
    ``Activation`` layer (index 1) and a linear ``Dense`` layer with a
    single output unit (index 2).

    Args:
        nodes: Number of units in the hidden ``Dense`` layer.
        optimizer: Keras optimizer instance (or identifier) passed to
            ``model.compile``.

    Returns:
        The compiled ``keras.Sequential`` model, using MSE as its loss.
    """
    model = keras.Sequential()
    # Plain ASCII quotes are required here; typographic quotes are a
    # syntax error in Python.
    model.add(keras.layers.Dense(nodes, activation="linear"))
    model.add(keras.layers.Activation(tf.keras.activations.swish))
    model.add(keras.layers.Dense(1, activation="linear"))
    model.compile(loss="mse", optimizer=optimizer)
    return model

def compare_keras_torch(Adam=True, lr=0.01, adam_eps=1e-7):
    """Train identical Keras and PyTorch models for one step and compare.

    Both models start from the same weights and see the same single
    full-batch update. The element-wise difference between their
    predictions is printed before and after the update.

    Args:
        Adam: If true, both frameworks use Adam; otherwise both use SGD.
        lr: Learning rate handed to both optimizers.
        adam_eps: Epsilon handed to both Adam optimizers. The frameworks
            ship different defaults (Keras 1e-7, PyTorch 1e-8), so leaving
            it implicit makes the two Adam updates differ. Ignored for SGD.

    Returns:
        None. The prediction differences are printed.
    """
    # Set seeds and initialize data
    torch.manual_seed(42)
    np.random.seed(42)
    # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the dtype it
    # used to alias.
    x_sample = np.array(np.random.uniform(size=[10, 10]), dtype=np.float64)
    y_sample = np.array(np.random.uniform(size=[10, 1]), dtype=np.float64)
    x_tensor = torch.tensor(x_sample).float()
    y_tensor = torch.tensor(y_sample.reshape(-1, 1)).float()

    # Use one hidden width for both models instead of borrowing the sample
    # count, which only matched the torch width by coincidence.
    nodes = 10

    # Initialize models
    model_torch = NN_torch(x_sample.shape[1], 1, nodes)

    if Adam:
        # Pass epsilon explicitly so both frameworks use the same value.
        # NOTE(review): Keras appears to add epsilon before the bias
        # correction of the second moment, so a small residual difference
        # may remain even with equal epsilons - confirm against the
        # installed Keras version.
        optimizer_keras = keras.optimizers.Adam(learning_rate=lr, epsilon=adam_eps)
        optimizer_torch = optim.Adam(model_torch.parameters(), lr=lr, eps=adam_eps)
    else:
        optimizer_keras = keras.optimizers.SGD(learning_rate=lr)
        optimizer_torch = optim.SGD(model_torch.parameters(), lr=lr)

    model_keras = init_keras_model(nodes, optimizer_keras)
    model_keras.build(input_shape=[None, x_sample.shape[1]])

    # Set same weights. torch stores Linear weights as (out, in) while Keras
    # Dense expects (in, out), hence the transpose.
    weight_0 = np.transpose(model_torch.dense_0.weight.detach().numpy())
    bias_0 = np.transpose(model_torch.dense_0.bias.detach().numpy())
    model_keras.get_layer(index=0).set_weights([weight_0, bias_0])
    weight_1 = np.transpose(model_torch.out.weight.detach().numpy())
    bias_1 = np.transpose(model_torch.out.bias.detach().numpy())
    model_keras.get_layer(index=2).set_weights([weight_1, bias_1])

    # Confirm models create same output
    keras_pred = model_keras.predict(x_sample)
    torch_pred = model_torch(x_tensor)
    print((keras_pred - torch_pred.detach().numpy()))

    # One epoch with same data (a single full-batch step in both frameworks)
    model_keras.fit(
        x_sample,
        y_sample,
        batch_size=x_sample.shape[0],
        shuffle=False,
        epochs=1,
    )

    criterion = nn.MSELoss()
    outputs = model_torch(x_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer_torch.step()
    optimizer_torch.zero_grad()

    # Confirm models create same output
    keras_pred = model_keras.predict(x_sample)
    torch_pred = model_torch(x_tensor)
    print((keras_pred - torch_pred.detach().numpy()))

# Run the comparison with Adam first, then with plain SGD, at the same
# learning rate.
for use_adam in (True, False):
    compare_keras_torch(Adam=use_adam, lr=0.1)