Why does a different concatenation and slicing order affect the grads?

ss2un · October 10, 2024, 2:04pm

Hi everyone, I am running a unit test using the code below. I found that the code for ‘test2’ and ‘test3’ gives different results for the grads of x, and for x itself, after some iterations. ‘test2’ gives exactly the same results as ‘test1’, but the results of ‘test3’ differ from them. I wonder whether the slicing is causing a problem in the computational graph? Any suggestions are appreciated.

result of ‘test2’:

iter: 0, x: 0.001123535679653287
x_after: 0.14787426590919495
iter: 1, x: 0.001123535679653287
x_after: 0.14719876646995544
iter: 2, x: 0.0011176818516105413
x_after: 0.1464969515800476
iter: 3, x: 0.0011090511688962579
x_after: 0.14573749899864197
iter: 4, x: 0.0010979033540934324
x_after: 0.14489953219890594
iter: 5, x: 0.0010852295672520995
x_after: 0.14401070773601532
iter: 6, x: 0.00107166962698102
x_after: 0.14312008023262024
iter: 7, x: 0.0010575008345767856
x_after: 0.14225336909294128
iter: 8, x: 0.001042835763655603
x_after: 0.14140984416007996
iter: 9, x: 0.001027711434289813
x_after: 0.1405780017375946
iter: 10, x: 0.0010121342493221164
x_after: 0.1397448629140854
iter: 11, x: 0.0009961080504581332
x_after: 0.13889974355697632
iter: 12, x: 0.000979656120762229
x_after: 0.13803447782993317
iter: 13, x: 0.0009628300904296339
x_after: 0.1371440291404724
iter: 14, x: 0.0009457063279114664
x_after: 0.13622605800628662
iter: 15, x: 0.0009283693507313728
x_after: 0.13528119027614594
iter: 16, x: 0.0009108937229029834
x_after: 0.13431242108345032
iter: 17, x: 0.000893332646228373
x_after: 0.13332459330558777
iter: 18, x: 0.0008757139439694583
x_after: 0.1323237121105194
iter: 19, x: 0.0008580424473620951
x_after: 0.1313161700963974
iter: 20, x: 0.0008403086685575545
x_after: 0.13030840456485748
iter: 21, x: 0.0008224951452575624
x_after: 0.12930648028850555
iter: 22, x: 0.000804582261480391
x_after: 0.12831594049930573
iter: 23, x: 0.0007865555817261338
x_after: 0.1273418664932251
iter: 24, x: 0.000768406200222671
x_after: 0.12638860940933228
iter: 25, x: 0.0007501322543248534
x_after: 0.12546001374721527
iter: 26, x: 0.0007317414856515825
x_after: 0.12455948442220688
iter: 27, x: 0.0007132487371563911
x_after: 0.12368995696306229
iter: 28, x: 0.0006946753128431737
x_after: 0.12285391241312027
iter: 29, x: 0.000676048279274255
x_after: 0.12205348908901215

result of ‘test3’:

iter: 0, x: 0.001123535679653287
x_after: 0.14787426590919495
iter: 1, x: 0.001123535679653287
x_after: 0.14719876646995544
iter: 2, x: 0.0011176818516105413
x_after: 0.1464969515800476
iter: 3, x: 0.0011090511688962579
x_after: 0.14573749899864197
iter: 4, x: 0.0010979033540934324
x_after: 0.14489953219890594
iter: 5, x: 0.0010852295672520995
x_after: 0.14401070773601532
iter: 6, x: 0.00107166962698102
x_after: 0.14312008023262024
iter: 7, x: 0.0010575008345767856
x_after: 0.14225336909294128
iter: 8, x: 0.001042835763655603
x_after: 0.14140984416007996
iter: 9, x: 0.001027711434289813
x_after: 0.1405780017375946
iter: 10, x: 0.0010121342493221164
x_after: 0.1397448629140854
iter: 11, x: 0.0009961080504581332
x_after: 0.13889974355697632
iter: 12, x: 0.000979656120762229
x_after: 0.13803447782993317
iter: 13, x: 0.0009628300322219729
x_after: 0.1371440291404724
iter: 14, x: 0.0009457063279114664
x_after: 0.13622605800628662
iter: 15, x: 0.0009283691761083901
x_after: 0.13528119027614594
iter: 16, x: 0.0009108937229029834
x_after: 0.13431242108345032
iter: 17, x: 0.000893332646228373
x_after: 0.13332459330558777
iter: 18, x: 0.0008757137693464756
x_after: 0.1323237121105194
iter: 19, x: 0.0008580424473620951
x_after: 0.1313161700963974
iter: 20, x: 0.0008403086685575545
x_after: 0.13030840456485748
iter: 21, x: 0.0008224949124269187
x_after: 0.12930648028850555
iter: 22, x: 0.000804582261480391
x_after: 0.12831594049930573
iter: 23, x: 0.0007865555817261338
x_after: 0.1273418664932251
iter: 24, x: 0.000768406200222671
x_after: 0.12638860940933228
iter: 25, x: 0.0007501322543248534
x_after: 0.12546001374721527
iter: 26, x: 0.0007317413692362607
x_after: 0.12455948442220688
iter: 27, x: 0.0007132486207410693
x_after: 0.12368995696306229
iter: 28, x: 0.0006946754292584956
x_after: 0.12285391241312027
iter: 29, x: 0.0006760482210665941
x_after: 0.12205347418785095

import torch
import torch.nn as nn
import random
import os
import numpy  as np
# Seed every RNG the script touches with one shared value.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
# NOTE: PYTHONHASHSEED is read at interpreter startup, so setting it here
# only affects subprocesses launched afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

class Model(nn.Module):
    """Module with no parameters of its own; forward returns ``x * y``."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        """Return the product ``x * y`` of the two inputs."""
        return x * y

class test:
    """Bundle two trainable leaf tensors, a ``Model``, and an Adam optimizer.

    Attributes are set in ``__init__``:
      x         -- random (30, 3) tensor with requires_grad=True
      y         -- random (30, 1) tensor with requires_grad=True
      model     -- a ``Model`` instance
      optimizer -- Adam over three named param groups, each with lr=0.01
    """

    def __init__(self):
        # x is drawn before y; keep this order so the seeded RNG yields the
        # same values.
        self.x = torch.randn(30, 3, requires_grad=True)
        self.y = torch.randn(30, 1, requires_grad=True)
        self.model = Model()

        # (group name, parameters) pairs; every group uses the same lr.
        named_params = (
            ("x", [self.x]),
            ("y", [self.y]),
            ("model", self.model.parameters()),
        )
        param_groups = [
            {'params': params, 'lr': 0.01, "name": name}
            for name, params in named_params
        ]
        # The top-level lr=0.0 is only a default; each group overrides it.
        self.optimizer = torch.optim.Adam(param_groups, lr=0.0, eps=1e-15)

# Build the tensors/optimizer under test and a fixed random target.
gs = test()
gt = torch.randn(30, 3, requires_grad=False)

for i in range(50):
    means3D = gs.x
    means3D_next = gs.x  # second alias of gs.x; not used below

    # --- test 1: no extra columns, use the full product ---
    # features = torch.cat([means3D], dim=1)
    # out = gs.model(features, gs.y)
    # out_slice1 = out[:, :]

    # --- test 2: append the first two columns, keep the leading three ---
    features = torch.cat((means3D, means3D[:, :2]), dim=1)
    out = gs.model(features, gs.y)
    out_slice1 = out[:, :3]

    # --- test 3: prepend the first two columns, keep the trailing three ---
    # features = torch.cat([means3D[:, :2], means3D], dim=1)
    # out = gs.model(features, gs.y)
    # out_slice1 = out[:, 2:]

    # Mean signed difference to the target (not a squared/absolute error).
    loss = (out_slice1 - gt).mean()
    loss.backward()
    # Report the gradient before the step, and the parameter mean after it.
    print(f"iter: {i}, x: {gs.x.grad.mean()}")
    gs.optimizer.step()
    gs.optimizer.zero_grad()
    print(f"x_after: {gs.x.mean()}")

ptrblck · October 10, 2024, 9:14pm

The relative error is ~1e-7 ((0.12205348908901215 - 0.12205347418785095) / 0.12205348908901215 = 1.2208713822986592e-07) so it looks like an expected numerical mismatch caused by the limited floating point precision and potentially a different algorithm selection.

ss2un · October 10, 2024, 10:04pm

I just did another test: the results of out1 and out2 differ, but out3 and out4 are the same after converting to numpy. Is this expected behavior, and can I ask why numpy is more accurate?

# Fixed inputs: x is 3x3, y is 3x1 (broadcast across columns by `*`).
x = torch.tensor(
    [
        [-0.0700, 0.8622, 0.7565],
        [0.1642, 0.6630, 0.6686],
        [0.4033, 0.8380, -0.7193],
    ],
    requires_grad=True,
)
y = torch.tensor([[-0.4033], [-0.5966], [0.1820]], requires_grad=True)

# Case 1 (torch): duplicate x along dim 1, multiply, keep the trailing copy.
other = [x, x]
features = torch.cat(other, dim=1)
out1 = features * y
out1_slice = out1[:, 3:]
print(f"out1: {out1_slice.mean()}")

# Case 2 (torch): no duplication, take the whole product.
other2 = [x]
features2 = torch.cat(other2, dim=1)
out2 = features2 * y
out2_slice = out2[:, :]
print(f"out2: {out2_slice.mean()}")

# numpy copies of the same values, detached from autograd.
x_np = x.detach().numpy()
y_np = y.detach().numpy()

# Case 3 (numpy): same computation as case 1.
other3 = [x_np, x_np]
features3 = np.concatenate(other3, axis=1)
out3 = features3 * y_np
out3_slice = out3[:, 3:]
print(f"out3: {out3_slice.mean()}")

# Case 4 (numpy): same computation as case 2.
other4 = [x_np]
features4 = np.concatenate(other4, axis=1)
out4 = features4 * y_np
out4_slice = out4[:, :]
print(f"out4: {out4_slice.mean()}")

out1: -0.15799790620803833
out2: -0.15799789130687714
out3: -0.15799787640571594
out4: -0.15799787640571594

ptrblck · October 11, 2024, 1:32pm

Numpy is not more accurate as all these results show the same error against a broader dtype (float64 in my example) and are thus equally “wrong” (these results are of course not wrong as it’s expected to see numerical mismatches given a limited floating point precision):

# float64 reference: the same computation as out1/out3, in double precision.
x64 = x.detach().double().numpy()
y64 = y.detach().double().numpy()
other5 = [x64, x64]
features5 = np.concatenate(other5, axis=1)
out5 = features5 * y64
out5_slice = out5[:, 3:]
print(f"out5: {out5_slice.mean()}")

# Max absolute deviation of each float32 result from the float64 reference.
# These are bare expressions: the values display in an interactive session
# but are discarded when run as a script.
np.abs(out1.double().detach().numpy() - out5).max()
np.abs(out2.double().detach().numpy() - out5[:, 3:]).max()
np.abs(out3 - out5).max()
np.abs(out4 - out5[:, 3:]).max()