Why does a different concatenation and slicing order affect the grads?

Hi everyone, I am running a unit test using the code below. I found that ‘test2’ and ‘test3’ give different results for the grads of x, and for x itself, after some iterations. ‘test2’ gives exactly the same results as ‘test1’, but the results of ‘test3’ differ from them. I wonder whether the slicing is causing a problem in the computational graph? Any suggestions are appreciated.

result of ‘test2’:

iter: 0, x: 0.001123535679653287
x_after: 0.14787426590919495
iter: 1, x: 0.001123535679653287
x_after: 0.14719876646995544
iter: 2, x: 0.0011176818516105413
x_after: 0.1464969515800476
iter: 3, x: 0.0011090511688962579
x_after: 0.14573749899864197
iter: 4, x: 0.0010979033540934324
x_after: 0.14489953219890594
iter: 5, x: 0.0010852295672520995
x_after: 0.14401070773601532
iter: 6, x: 0.00107166962698102
x_after: 0.14312008023262024
iter: 7, x: 0.0010575008345767856
x_after: 0.14225336909294128
iter: 8, x: 0.001042835763655603
x_after: 0.14140984416007996
iter: 9, x: 0.001027711434289813
x_after: 0.1405780017375946
iter: 10, x: 0.0010121342493221164
x_after: 0.1397448629140854
iter: 11, x: 0.0009961080504581332
x_after: 0.13889974355697632
iter: 12, x: 0.000979656120762229
x_after: 0.13803447782993317
iter: 13, x: 0.0009628300904296339
x_after: 0.1371440291404724
iter: 14, x: 0.0009457063279114664
x_after: 0.13622605800628662
iter: 15, x: 0.0009283693507313728
x_after: 0.13528119027614594
iter: 16, x: 0.0009108937229029834
x_after: 0.13431242108345032
iter: 17, x: 0.000893332646228373
x_after: 0.13332459330558777
iter: 18, x: 0.0008757139439694583
x_after: 0.1323237121105194
iter: 19, x: 0.0008580424473620951
x_after: 0.1313161700963974
iter: 20, x: 0.0008403086685575545
x_after: 0.13030840456485748
iter: 21, x: 0.0008224951452575624
x_after: 0.12930648028850555
iter: 22, x: 0.000804582261480391
x_after: 0.12831594049930573
iter: 23, x: 0.0007865555817261338
x_after: 0.1273418664932251
iter: 24, x: 0.000768406200222671
x_after: 0.12638860940933228
iter: 25, x: 0.0007501322543248534
x_after: 0.12546001374721527
iter: 26, x: 0.0007317414856515825
x_after: 0.12455948442220688
iter: 27, x: 0.0007132487371563911
x_after: 0.12368995696306229
iter: 28, x: 0.0006946753128431737
x_after: 0.12285391241312027
iter: 29, x: 0.000676048279274255
x_after: 0.12205348908901215

result of ‘test3’

iter: 0, x: 0.001123535679653287
x_after: 0.14787426590919495
iter: 1, x: 0.001123535679653287
x_after: 0.14719876646995544
iter: 2, x: 0.0011176818516105413
x_after: 0.1464969515800476
iter: 3, x: 0.0011090511688962579
x_after: 0.14573749899864197
iter: 4, x: 0.0010979033540934324
x_after: 0.14489953219890594
iter: 5, x: 0.0010852295672520995
x_after: 0.14401070773601532
iter: 6, x: 0.00107166962698102
x_after: 0.14312008023262024
iter: 7, x: 0.0010575008345767856
x_after: 0.14225336909294128
iter: 8, x: 0.001042835763655603
x_after: 0.14140984416007996
iter: 9, x: 0.001027711434289813
x_after: 0.1405780017375946
iter: 10, x: 0.0010121342493221164
x_after: 0.1397448629140854
iter: 11, x: 0.0009961080504581332
x_after: 0.13889974355697632
iter: 12, x: 0.000979656120762229
x_after: 0.13803447782993317
iter: 13, x: 0.0009628300322219729
x_after: 0.1371440291404724
iter: 14, x: 0.0009457063279114664
x_after: 0.13622605800628662
iter: 15, x: 0.0009283691761083901
x_after: 0.13528119027614594
iter: 16, x: 0.0009108937229029834
x_after: 0.13431242108345032
iter: 17, x: 0.000893332646228373
x_after: 0.13332459330558777
iter: 18, x: 0.0008757137693464756
x_after: 0.1323237121105194
iter: 19, x: 0.0008580424473620951
x_after: 0.1313161700963974
iter: 20, x: 0.0008403086685575545
x_after: 0.13030840456485748
iter: 21, x: 0.0008224949124269187
x_after: 0.12930648028850555
iter: 22, x: 0.000804582261480391
x_after: 0.12831594049930573
iter: 23, x: 0.0007865555817261338
x_after: 0.1273418664932251
iter: 24, x: 0.000768406200222671
x_after: 0.12638860940933228
iter: 25, x: 0.0007501322543248534
x_after: 0.12546001374721527
iter: 26, x: 0.0007317413692362607
x_after: 0.12455948442220688
iter: 27, x: 0.0007132486207410693
x_after: 0.12368995696306229
iter: 28, x: 0.0006946754292584956
x_after: 0.12285391241312027
iter: 29, x: 0.0006760482210665941
x_after: 0.12205347418785095
import torch
import torch.nn as nn
import random
import os
import numpy  as np
# Fix the seeds of Python's `random`, NumPy and PyTorch (CPU and CUDA) to 0.
random.seed(0)
# NOTE(review): assigning PYTHONHASHSEED here probably does not change hashing
# in the already-running interpreter — confirm against the Python docs.
os.environ['PYTHONHASHSEED'] = str(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)

class Model(nn.Module):
    """Parameter-free module whose forward pass is an elementwise product."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        """Return ``x * y``; any broadcasting is whatever ``*`` does for the operands."""
        return x * y

class test:
    """Hold two trainable tensors, a ``Model`` instance and one Adam optimizer."""

    def __init__(self):
        # x is drawn before y, so the random stream is consumed in a fixed order.
        self.x = torch.randn(30, 3, requires_grad=True)
        self.y = torch.randn(30, 1, requires_grad=True)
        self.model = Model()

        # One parameter group per tensor plus one for the model; every group
        # sets its own learning rate next to the optimizer-level lr=0.0 below.
        group_lr = 0.01
        param_groups = [
            {"params": [self.x], "lr": group_lr, "name": "x"},
            {"params": [self.y], "lr": group_lr, "name": "y"},
            {"params": self.model.parameters(), "lr": group_lr, "name": "model"},
        ]
        self.optimizer = torch.optim.Adam(param_groups, lr=0.0, eps=1e-15)

# Experiment state: trainable x / y, the model and the Adam optimizer.
gs = test()
# Fixed (30, 3) target, created without gradient tracking.
gt = torch.randn(30, 3, requires_grad=False)
for i in range(50):
    # Both names refer to the same tensor object, gs.x.
    means3D = gs.x
    means3D_next= gs.x  # not referenced again in this snippet

    # Three variants follow; only the uncommented one runs. The bare string
    # literals serve as section labels.

    # Variant 1: x alone, full output kept.
    '''test 1'''
    # features = torch.cat([means3D],dim=1)#1,2
    # out = gs.model(features,gs.y) #1,2
    # out_slice1 = out[:,:]

    # Variant 2: x followed by its first two columns; the slice keeps the
    # first three output columns, i.e. the ones that came from x.
    '''test 2'''
    features = torch.cat([means3D,means3D[:,:2]],dim=1)#1,2
    out = gs.model(features,gs.y) #1,2
    out_slice1 = out[:,:3]

    # Variant 3: first two columns of x placed before x; the slice drops
    # those two leading columns.
    '''test 3'''
    # features = torch.cat([means3D[:,:2],means3D],dim=1)#1,2
    # out = gs.model(features,gs.y) #1,2
    # out_slice1 = out[:,2:]

    # Loss is the mean of the signed elementwise difference to the target.
    loss = (out_slice1 - gt).mean()
    loss.backward()
    # Mean of x's gradient for this iteration.
    print(f"iter: {i}, x: {gs.x.grad.mean()}") 
    gs.optimizer.step()
    gs.optimizer.zero_grad()
    # Mean of x after the optimizer step.
    print(f"x_after: {gs.x.mean()}") 

The relative error is ~1e-7 ((0.12205348908901215 - 0.12205347418785095) / 0.12205348908901215 = 1.2208713822986592e-07) so it looks like an expected numerical mismatch caused by the limited floating point precision and potentially a different algorithm selection.

I just did another test: the results of out1 and out2 differ, but out3 and out4 are the same after converting to numpy. Is this expected behavior, and can I ask why numpy is more accurate?

# 3x3 input and 3x1 per-row factor, both with gradient tracking enabled.
x = torch.tensor([[-0.0700,  0.8622,  0.7565],
    [ 0.1642,  0.6630,  0.6686],
    [ 0.4033,  0.8380, -0.7193]],requires_grad=True)
y = torch.tensor([[-0.4033],
    [-0.5966],
    [ 0.1820]], requires_grad=True)

# Case 1 (torch): x concatenated with itself along dim 1, multiplied by y,
# then only the columns from index 3 onward are averaged.
other = [x, x]
features = torch.cat(other, dim=1) 
out1 = features * y
out1_slice = out1[ : , 3 : ]
print(f"out1: {out1_slice.mean()}")

# Case 2 (torch): x alone, multiplied by y, all columns averaged.
other2 = [x]    
features2 = torch.cat(other2, dim=1) 
out2 = features2 * y
out2_slice = out2[:,:]
print(f"out2: {out2_slice.mean()}")

# Case 3 (numpy): same as case 1, on detached copies converted to arrays.
other3 = [x.detach().numpy(),x.detach().numpy()]    
features3 = np.concatenate(other3, axis=1) 
out3 = features3 * y.detach().numpy()
out3_slice = out3[ : , 3 : ]
print(f"out3: {out3_slice.mean()}")

# Case 4 (numpy): same as case 2, on detached copies converted to arrays.
other4 = [x.detach().numpy()]    
features4 = np.concatenate(other4, axis=1) 
out4 = features4 * y.detach().numpy()
out4_slice = out4[:,:]
print(f"out4: {out4_slice.mean()}")
out1: -0.15799790620803833
out2: -0.15799789130687714
out3: -0.15799787640571594
out4: -0.15799787640571594

Numpy is not more accurate as all these results show the same error against a broader dtype (float64 in my example) and are thus equally “wrong” (these results are of course not wrong as it’s expected to see numerical mismatches given a limited floating point precision):

# Reference: the duplicated-x computation repeated after casting x and y to
# double precision before the numpy conversion.
other5 = [x.detach().double().numpy(),x.detach().double().numpy()]    
features5 = np.concatenate(other5, axis=1) 
out5 = features5 * y.detach().double().numpy()
out5_slice = out5[ : , 3 : ]
print(f"out5: {out5_slice.mean()}")

# Largest absolute deviation of each earlier result from the double-precision
# reference. out2 and out4 have three columns, so they are compared against
# the last three columns of out5.
# NOTE(review): these are bare expressions with no print; presumably meant to
# be evaluated in an interactive session where the value is echoed.
np.abs(out1.double().detach().numpy() - out5).max()
np.abs(out2.double().detach().numpy() - out5[:, 3:]).max()
np.abs(out3 - out5).max()
np.abs(out4 - out5[:, 3:]).max()