Problem converting Keras code to PyTorch code

My code works well in Keras but not in PyTorch.

My code in Keras:

combined = concatenate([left.output, right.output])
combined = Conv3D(128, (3, 3, 3), activation='relu', strides=1, kernel_initializer='he_uniform', padding='same')(combined)
combined = BatchNormalization()(combined)
combined = Conv3D(64, (3, 3, 3), activation='relu', strides=1, kernel_initializer='he_uniform', padding='same')(combined)
combined = BatchNormalization()(combined)
combined = MaxPooling3D(pool_size=(2, 2, 2))(combined)
combined = Flatten()(combined)

# apply a FC layer and then a regression prediction on the combined outputs
z = Dense(64, activation="relu")(combined)
z = BatchNormalization()(z)
z = Dropout(0.5)(z)
z = Dense(32, activation="relu")(z)
z = Dense(1, activation="linear")(z)

model = Model(inputs=[left.input, right.input], outputs=z)

My optimizer:

optimizer = SGD(momentum=0.9, nesterov=True)
self.model.compile(loss='mean_absolute_error', optimizer=optimizer, metrics=metrics)
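For comparison, a minimal PyTorch equivalent of this optimizer and loss setup (just a sketch, assuming the learning rate of 0.01 mentioned below and an already-constructed model) would be:

import torch
import torch.nn as nn

# Keras SGD(momentum=0.9, nesterov=True) with lr=0.01 (also the Keras default lr)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Keras 'mean_absolute_error' corresponds to L1 loss in PyTorch;
# reduction='none' keeps per-element losses, which matches the loss.mean()
# calls in the training loop below
criterion = nn.L1Loss(reduction='none')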

My code in PyTorch:

def train_epoch(epoch, data_loader, model, criterion, criterion2, optimizer, opt,
                epoch_logger, batch_logger):
    print('train at epoch {}'.format(epoch))

    model.train()
    model = model.cuda()
    # model = nn.DataParallel(model, device_ids=None)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses2 = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end_time = time.time()
    for i, (inputs, targets) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        if not opt.no_cuda:
            targets = targets.cuda()

        inputs = inputs.squeeze()
        # move the last (channel) dimension in front of the spatial dims;
        # permute reorders the axes, whereas reshape/np.reshape would keep the
        # memory order and scramble the values
        inputs = inputs.permute(0, 1, 5, 2, 3, 4)
        inputs = inputs.cuda()

        outputs = model(inputs[0].float(), inputs[1].float())

        targets = torch.transpose(targets, 0, 1).float()
        targets = targets.squeeze()
        outputs = outputs.squeeze()

        loss = criterion(outputs, targets)
        loss2 = criterion2(outputs, targets)

        losses.update(loss.mean().item(), inputs.size(0))
        losses2.update(loss2.item(), inputs.size(0))
        # prec1, prec5 = calculate_accuracy(outputs.data, targets.data, topk=(1, 5))
        # top1.update(prec1, inputs.size(0))
        # top5.update(prec5, inputs.size(0))

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

And my model in PyTorch:

class combineNet(nn.Module):
    def __init__(self, modelA, modelB):
        super(combineNet, self).__init__()
        self.modelA = modelA
        self.modelB = modelB

        # conv
        self.conv1 = nn.Conv3d(2048, 128, kernel_size=(3, 3, 3), stride=1, padding=(1, 1, 1))
        nn.init.xavier_uniform_(self.conv1.weight)  # , gain=nn.init.calculate_gain('relu')
        nn.init.zeros_(self.conv1.bias)
        self.BN1 = nn.BatchNorm3d(128, eps=0.001)  # , momentum=0.99
        self.conv2 = nn.Conv3d(128, 64, kernel_size=(3, 3, 3), stride=1, padding=(1, 1, 1))
        nn.init.xavier_uniform_(self.conv2.weight)  # , gain=nn.init.calculate_gain('relu')
        nn.init.zeros_(self.conv2.bias)
        self.BN2 = nn.BatchNorm3d(64, eps=0.001)  # , momentum=0.99
        self.MP = nn.MaxPool3d(2)
        self.fc1 = nn.Linear(128, 64)
        self.BN = nn.BatchNorm1d(64, eps=0.001)
        self.act = nn.ReLU()
        self.dr = nn.Dropout(0.5)
        self.classifier1 = nn.Linear(64, 32)
        self.classifier2 = nn.Linear(32, 1)

    def forward(self, x1, x2):
        x1 = self.modelA(x1)
        x2 = self.modelB(x2)
        x = torch.cat((x1, x2), 1)
        x = x.view(x.size(0), 2048, 4, 2, 2)
        x = self.act(self.conv1(x))
        x = self.BN1(x)
        x = self.act(self.conv2(x))
        x = self.BN2(x)
        x = self.MP(x)
        x = x.view(x.size(0), -1)
        x = self.BN(self.act(self.fc1(x)))
        x = self.dr(x)
        x = self.act(self.classifier1(x))
        x = self.classifier2(x)
        return x

My training loss in Keras reaches 0.2 (my objective), but in PyTorch it only gets down to 0.35. The validation loss also does not decrease below 0.4. Both learning rates are 0.01 and the datasets are the same.

Actually, the training loss decreases in both Keras and PyTorch, but in the PyTorch code the validation loss drops to 0.35 and then stops decreasing.

Any help?

One difference would be the explicit initialization of the first conv layers: he_uniform in Keras versus xavier_uniform (also known as Glorot) in PyTorch, so you might want to use the same parameter initialization in both.

I don't know how to do he_uniform initialization in PyTorch; I couldn't find anything.

Could you please help with that?
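For what it's worth, Keras' he_uniform samples from U(-limit, limit) with limit = sqrt(6 / fan_in), which is exactly what nn.init.kaiming_uniform_ produces in its default fan-in mode with the ReLU gain. So inside __init__, the xavier calls could be replaced with something like:

# He/Kaiming uniform, fan-in mode with ReLU gain: bound = sqrt(6 / fan_in),
# the same distribution Keras' he_uniform samples from
nn.init.kaiming_uniform_(self.conv1.weight, nonlinearity='relu')
nn.init.zeros_(self.conv1.bias)
nn.init.kaiming_uniform_(self.conv2.weight, nonlinearity='relu')
nn.init.zeros_(self.conv2.bias)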

I want to add that Keras and PyTorch training behave the same, but their validation losses differ.
I am fairly sure the code in general is OK, because I run this code on another PC (with another dataset) and it works there. All versions and libraries are the same on both PCs.
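Since training matches but validation diverges, another framework default worth ruling out (just a hypothesis to test, not a confirmed cause) is the batch norm running-statistics momentum: Keras BatchNormalization defaults to momentum=0.99, meaning 99% of the old running statistics are kept, while PyTorch's momentum is the weight given to the new batch statistic, so the matching PyTorch value is 1 - 0.99 = 0.01. Also make sure model.eval() is called before validation so the running statistics are used. A sketch of matched settings:

# Keras BatchNormalization(momentum=0.99, epsilon=0.001) keeps 99% of the old
# running mean/var; PyTorch's momentum is the fraction taken from the NEW batch,
# so the equivalent value is 1 - 0.99 = 0.01
self.BN1 = nn.BatchNorm3d(128, eps=0.001, momentum=0.01)
self.BN2 = nn.BatchNorm3d(64, eps=0.001, momentum=0.01)
self.BN = nn.BatchNorm1d(64, eps=0.001, momentum=0.01)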