Converting tensorflow model to pytorch: the traininning isn't the same

Greetings,

My data consists of time-series samples with 925 steps, each containing 2 features. In other words, my data is shaped as (samples, steps, features)

The model I’m currently implementing works in TensorFlow, but I’m having trouble properly implementing it in PyTorch class.

For the model in Pytorch it is difficult to converge.

Keras Model

def sampling(samp_args):
    z_mean, z_log_sigma = samp_args

    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_sigma) * epsilon

# encoder
encoder = Bidirectional(GRU(gru_size, name="encoder1", return_sequences=True))(
    main_input
)
encoder = Dropout(dropout_val, name="drop_encoder1")(encoder)
encoder = Bidirectional(GRU(gru_size, name="encoder2", return_sequences=False))(encoder)
encoder = Dropout(dropout_val, name="drop_encoder2")(encoder)
codings_mean = Dense(units=output_size, name="encoding_mean", activation="linear")(
    encoder
)
codings_log_var = Dense(
    units=output_size, name="encoding_log_var", activation="linear"
)(encoder)
codings = Lambda(sampling, output_shape=(output_size,))([codings_mean, codings_log_var])


# decoder
decoder = RepeatVector(lcs_scaled.shape[1], name="repeat")(codings)
decoder = merge.concatenate([aux_input, decoder])
decoder = GRU(gru_size, name="decoder1", return_sequences=True)(decoder)
decoder = Dropout(dropout_val, name="drop_decoder1")(decoder)
decoder = GRU(gru_size, name="decoder2", return_sequences=True)(decoder)
decoder = TimeDistributed(Dense(1, activation="linear"), name="time_dist")(decoder)

# VAE
model = Model(model_input, decoder)

Pytorch

class TimeDistributedLayer(nn.Module):
    def __init__(self, module, batch_first=False):
        super(TimeDistributedLayer, self).__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):

        if len(x.size()) <= 2:
            return self.module(x)

        # Squash samples and timesteps into a single axis
        x_reshape = x.contiguous().view(-1, x.size(-1))  # (samples * timesteps, input_size)
        y = self.module(x_reshape)
        # We have to reshape Y
        if self.batch_first:
            y = y.contiguous().view(x.size(0), -1, y.size(-1))  # (samples, timesteps, output_size)
        else:
            y = y.transpose(0,1).contiguous()  # transpose to (timesteps, samples, output_size)
        return y




class VariationalRecurrentAutoEncoder(nn.Module):
    """
    Model replicated from the paper 2021 but implemented in Pytorch
    """
    def __init__(self, input_size_main=2,input_size_aux=1, gru_size=32,linear_size=16,dim_seq = 925, drop_out=0.25, use_gpu=False):
        super(VariationalRecurrentAutoEncoder, self).__init__()

        # Configuration
        self.dropout = nn.Dropout(drop_out)
        self.use_gpu = use_gpu

        # Encoder
        self.gru_encoder = nn.GRU(input_size=input_size_main,
                                hidden_size=gru_size,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=True)

        self.gru_encoder2 = nn.GRU(input_size=gru_size*2,
                                hidden_size=gru_size,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=True)
        # Latent space
        self.linear_mean = nn.Linear(gru_size*2, linear_size)
        self.linear_logvar= nn.Linear(gru_size*2, linear_size)

        # Decoder
        self.gru_decoder= nn.GRU(input_size=linear_size+input_size_aux,
                                hidden_size=gru_size,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=False)

        self.gru_decoder2 = nn.GRU(input_size=gru_size,
                                hidden_size=gru_size,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=False)

        self.timedis = TimeDistributedLayer(nn.Linear(gru_size,1),True)

    def encode(self, x):
        x, _ = self.gru_encoder(x)
        x = self.dropout(x)
        x, _ = self.gru_encoder2(x)
        x = x[:,-1]
        x = self.dropout(x)
        mean, log_var = self.linear_mean(x), self.linear_logvar(x)
        return mean, log_var
    def decode(self, z, aux):

        latente = torch.clone(z)
        z = torch.unsqueeze(z, dim=1).repeat(1,self.dim_seq,1)
        
        cat = torch.cat((aux, z),dim=2)
        out, _ = self.gru_decoder(cat)
        out = self.dropout(out)
        out, _ = self.gru_decoder2(out)
        out = self.timedis(out)
        return out, latente
    def forward(self, x):

        self.dim_seq = x.shape[1]
        main, aux = x, torch.unsqueeze(x[:,:,0], dim=2)
        mean, log_var = self.encode(main)
        std = torch.exp(0.5*log_var)
        
        #epsilon = torch.empty(mean.shape[1]).normal_(mean=0, std=1)
        epsilon = torch.randn_like(std)
        if self.use_gpu:
            epsilon = epsilon.cuda()
                    
        z = mean + epsilon*std
        
        out,latente = self.decode(z, aux)
        return out, mean, log_var,latente

Thanks in advance

I would try to break down the code into smaller parts and verify that small modules are behaving identically. E.g. the more interested modules would be the bidirectional GRU layers, which are using different values for return_sequences (I’m unsure what this would mean).