I’m recreating in PyTorch a model that I originally built in Keras, where it trained without any problems. Interestingly, the PyTorch version gets stuck in roughly 9 out of 10 runs; only about one run in ten trains properly, and then it reaches almost exactly the same performance as the Keras model.

My model looks like this:

```
class Network(nn.Module):
    """Stack of six 1-D convolutions: five ReLU layers and a tanh output layer.

    Every convolution uses ``padding = kernel_size // 2``, so for an odd
    ``kernel_size`` the sequence length of the input is preserved.

    Args:
        in_channels: Number of channels in the input (default 41).
        hidden_channels: Number of channels produced by each of the first
            five convolutions (default 500).
        out_channels: Number of channels produced by the last convolution
            (default 6).
        kernel_size: Kernel width shared by all six convolutions (default 13).

    The defaults reproduce the original hard-coded architecture. The layers
    keep the attribute names ``conv_1`` ... ``conv_6`` and are created in that
    order.
    """

    def __init__(self, in_channels=41, hidden_channels=500, out_channels=6,
                 kernel_size=13):
        super(Network, self).__init__()
        # 13 // 2 == 6, i.e. the padding the original layers were given.
        padding = kernel_size // 2
        self.conv_1 = nn.Conv1d(in_channels=in_channels,
                                out_channels=hidden_channels,
                                kernel_size=kernel_size,
                                padding=padding)
        self.conv_2 = nn.Conv1d(in_channels=hidden_channels,
                                out_channels=hidden_channels,
                                kernel_size=kernel_size,
                                padding=padding)
        self.conv_3 = nn.Conv1d(in_channels=hidden_channels,
                                out_channels=hidden_channels,
                                kernel_size=kernel_size,
                                padding=padding)
        self.conv_4 = nn.Conv1d(in_channels=hidden_channels,
                                out_channels=hidden_channels,
                                kernel_size=kernel_size,
                                padding=padding)
        self.conv_5 = nn.Conv1d(in_channels=hidden_channels,
                                out_channels=hidden_channels,
                                kernel_size=kernel_size,
                                padding=padding)
        self.conv_6 = nn.Conv1d(in_channels=hidden_channels,
                                out_channels=out_channels,
                                kernel_size=kernel_size,
                                padding=padding)

    def forward(self, t):
        """Run ``t`` through the six convolutions.

        Args:
            t: Input tensor in the layout ``nn.Conv1d`` expects, with
                ``in_channels`` channels.

        Returns:
            Tensor with ``out_channels`` channels; values lie in [-1, 1]
            because the last activation is ``tanh``.
        """
        hidden_layers = (self.conv_1, self.conv_2, self.conv_3,
                         self.conv_4, self.conv_5)
        for conv in hidden_layers:
            t = F.relu(conv(t))
        return torch.tanh(self.conv_6(t))
```

The layers in my Keras model are initialized with the Glorot uniform initializer, so I apply the same initialization to the convolution weights of my PyTorch model with the weights_init(m) function below.

```
# Build the network and move its parameters to the GPU in one expression
# (Module.cuda() returns the module itself).
model = Network().cuda()
# Initializing weights
def weights_init(m):
    """Initialise a ``Conv1d`` layer with Keras' default ``Conv1D`` scheme.

    Keras' ``Conv1D`` defaults to a Glorot (Xavier) uniform kernel and an
    all-zero bias. PyTorch's ``Conv1d`` draws the bias from a uniform
    distribution by default, so the bias has to be reset explicitly to get the
    same starting point. Modules that are not ``nn.Conv1d`` are left untouched.

    Intended to be passed to ``nn.Module.apply``.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if isinstance(m, nn.Conv1d):
        # The nn.init functions already run without gradient tracking, so the
        # legacy ``.data`` attribute is not needed.
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Module.apply calls weights_init on every submodule of the model and on the
# model itself.
model.apply(weights_init)
```

I’m using custom loss functions that compute the MAE and the MSE. The only difference from the standard versions is that the labels carry an extra mask channel, which is multiplied into the element-wise loss inside the loss function. This works as intended in Keras, and also in PyTorch on the occasional run where the model does train.

```
# Creating custom loss functions
def MAE_masked(y_pred, y_true):
    """Mean absolute error with each element weighted by a mask.

    Args:
        y_pred: Predictions; must broadcast against ``y_true[:, :6, :]``.
        y_true: 3-D tensor whose channels 0-5 (dim 1) hold the targets and
            whose channel 6 holds the mask. The mask is presumably 1 for valid
            positions and 0 for masked ones; confirm against the data
            pipeline.

    Returns:
        Scalar tensor: the mean over *all* elements of
        ``|y_pred - target| * mask``. Positions with a zero mask contribute
        nothing to the sum but still count in the denominator.
    """
    y_true_no_mask = y_true[:, :6, :]
    # Indexing channel 6 drops dim 1; put it back so the mask broadcasts over
    # the six target channels.
    mask = y_true[:, 6, :].unsqueeze(dim=1)
    return torch.mean(torch.abs(y_pred - y_true_no_mask) * mask)
def MSE_masked(y_pred, y_true):
    """Mean squared error with each element weighted by a mask.

    Args:
        y_pred: Predictions; must broadcast against ``y_true[:, :6, :]``.
        y_true: 3-D tensor whose channels 0-5 (dim 1) hold the targets and
            whose channel 6 holds the mask. The mask is presumably 1 for valid
            positions and 0 for masked ones; confirm against the data
            pipeline.

    Returns:
        Scalar tensor: the mean over *all* elements of
        ``(y_pred - target) ** 2 * mask``. Positions with a zero mask
        contribute nothing to the sum but still count in the denominator.
    """
    y_true_no_mask = y_true[:, :6, :]
    # Indexing channel 6 drops dim 1; put it back so the mask broadcasts over
    # the six target channels.
    mask = y_true[:, 6, :].unsqueeze(dim=1)
    # Squaring makes an explicit abs() redundant for real-valued tensors.
    squared_error = (y_pred - y_true_no_mask) ** 2
    return torch.mean(squared_error * mask)
```

I then create the optimizer as follows:

```
# Adam over every parameter of the model, with a fixed step size.
learning_rate = 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
```

And then I create my training loop:

```
# Per-epoch average losses; each entry is a 0-dim tensor.
train_loss_list = []
val_loss_list = []
epochs = 3
start = time.time()
for epoch in range(epochs):
    # ---- Training: one optimizer step per batch ----
    # train()/eval() only change behaviour for layers such as dropout or
    # batch norm, but setting the mode explicitly keeps the loop correct if
    # such layers are ever added to the model.
    model.train()
    train_loss_batch = []
    for train_sample in train_loader:
        # backward() accumulates into .grad, so clear the gradients of the
        # previous batch before computing new ones.
        optimizer.zero_grad()
        # Each batch stacks inputs and labels along dim 1: the first 41
        # channels are the model input, the remaining ones the labels
        # (including the mask channel consumed by MSE_masked).
        X_train = train_sample[:, :41, :]
        y_train = train_sample[:, 41:, :]
        train_output = model(X_train)
        train_loss = MSE_masked(train_output, y_train)
        train_loss.backward()
        optimizer.step()
        # Keep only the value; detach() drops the reference to the autograd
        # graph so it can be freed.
        train_loss_batch.append(train_loss.detach())
    ave_train_loss = torch.mean(torch.stack(train_loss_batch))
    train_loss_list.append(ave_train_loss)

    # ---- Evaluating ----
    model.eval()
    val_loss_batch = []
    # No parameter update happens here, so skip building autograd graphs for
    # the validation forward passes.
    with torch.no_grad():
        for val_sample in val_loader:
            X_val = val_sample[:, :41, :]
            y_val = val_sample[:, 41:, :]
            val_output = model(X_val)
            val_loss = MSE_masked(val_output, y_val)
            val_loss_batch.append(val_loss)
    ave_val_loss = torch.mean(torch.stack(val_loss_batch))
    val_loss_list.append(ave_val_loss)

    # Print the epoch and its average losses
    print('epoch: {}, train_loss: {}, val_loss: {}'.format(epoch, ave_train_loss, ave_val_loss))
end = time.time()
training_time = end - start
```

What could be leading to this behavior?