I have built an LSTM network, but I find that during prediction the same tensor value produces different outputs.
For example, when the output of the last hidden layer is
tensor([[0.0150],
[0.0150],
[0.0150],
[0.0150],
[0.0151],
[0.0151],
[0.0151],
[0.0151],
[0.0152],
[0.0152],
[0.0152],
[0.0152],
[0.0153],
[0.0153],
[0.0153],
[0.0153]])
The predicted result is
tensor([[-0.1035],
[-0.1035],
[-0.1035],
[-0.1035],
[-0.1035],
[-0.1035],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036],
[-0.1036]])
The sixth and seventh entries of the hidden output are both 0.0151, yet their predictions differ (-0.1035 vs. -0.1036).
I really don't know what causes the different outputs.
Any help would be appreciated.
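One thing worth ruling out first: the tensors above are printed at PyTorch's default four-decimal precision, so two entries that both display as 0.0151 may still differ in later digits. A minimal check (assuming h and pre are the hidden output and the prediction tensors from the code below):

torch.set_printoptions(precision=8)  # show more digits than the default 4
print(h)    # output of the last hidden layer
print(pre)  # predicted result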
The complete code is as follows.
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
timesteps = 10
L = 1
batch_size = 16
Epoch = 1
timespaces = np.linspace(0, 10, 50000)
data = np.sin(timespaces * 10)
data = np.reshape(data, (-1, 1))
plt.plot(timespaces, data)
plt.show()
train_len = int(len(data) * 0.6)
train_data = data[:train_len]  # (30000, 1)
test_data = data[train_len:]   # (20000, 1)
print(train_data.shape)
print(test_data.shape)
plt.plot(timespaces[:train_len], train_data)
plt.plot(timespaces[train_len:], test_data)
plt.show()
def divide(data, timesteps, L):
    X, Y = [], []
    for i in range(len(data) - timesteps):
        x = data[i:i + timesteps]
        y = data[i + timesteps + L - 1]
        X.append(x)
        Y.append(y)
    return X, Y
def divide_batch(data, batch_size):
    X = []
    for i in range(len(data)):
        if (i + 1) * batch_size - 1 > len(data) - 1:
            break
        x = data[i * batch_size:(i + 1) * batch_size]
        X.append(x)
    return X
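# A small illustration of divide_batch (hypothetical values, added for
# clarity): divide_batch(list(range(10)), 4) returns
# [[0, 1, 2, 3], [4, 5, 6, 7]]; the incomplete trailing batch [8, 9]
# is dropped.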
train_X, train_Y = divide(train_data, timesteps, L)
test_X, test_Y = divide(test_data, timesteps, L)
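# Sanity check (an addition for clarity, can be deleted): the first window
# should be the first `timesteps` samples and its target the sample
# L-1 steps after the window.
assert np.allclose(train_X[0], train_data[:timesteps])
assert np.allclose(train_Y[0], train_data[timesteps + L - 1])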
train_X = torch.tensor(train_X).to(torch.float32)
train_Y = torch.tensor(train_Y).to(torch.float32)
print("before:")
print("train_X.shape:")
print(train_X.shape)
print("train_Y.shape:")
print(train_Y.shape)
train_X = divide_batch(train_X, batch_size)
train_Y = divide_batch(train_Y, batch_size)
train_X = torch.tensor([item.detach().numpy() for item in train_X]).to(torch.float32)
train_Y = torch.tensor([item.detach().numpy() for item in train_Y]).to(torch.float32)
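# Shape check (an addition for clarity): after batching, X is
# (num_batches, batch_size, timesteps, input_size) and Y is
# (num_batches, batch_size, 1).
assert train_X.shape[1:] == (batch_size, timesteps, 1)
assert train_Y.shape[1:] == (batch_size, 1)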
print("after:");
print("train_X.shape:");
print(train_X.shape);
print("train_Y.shape:");
print(train_Y.shape);
test_X = torch.tensor(test_X).to(torch.float32);
test_Y = torch.tensor(test_Y).to(torch.float32);
print("before:");
print("test_X.shape:");
print(test_X.shape);
print("test_Y.shape:");
print(test_Y.shape);
test_X = divide_batch(test_X,batch_size);
test_Y = divide_batch(test_Y,batch_size);
test_X = torch.tensor([item.detach().numpy() for item in test_X] ).to(torch.float32);
test_Y = torch.tensor([item.detach().numpy() for item in test_Y]).to(torch.float32);
print("after:");
print("test_X.shape:");
print(test_X.shape);
print("test_Y.shape:");
print(test_Y.shape);
class LSTM(object):
    def __init__(self, timesteps, batch_size, input_size, hidden_size, output_size):
        self.times = 0
        self.timesteps = timesteps
        self.batch_size = batch_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Weights are hard-coded for reproducibility; the random
        # initialization further down is commented out.
        self.Wfh = torch.tensor([[0.0107]])   # forget gate
        self.Wfx = torch.tensor([[-0.0069]])
        self.bf = torch.tensor([0.0126])
        self.Wih = torch.tensor([[-0.0045]])  # input gate
        self.Wix = torch.tensor([[-0.0180]])
        self.bi = torch.tensor([1.0])         # float, to match the other parameters
        self.Woh = torch.tensor([[-0.0065]])  # output gate
        self.Wox = torch.tensor([[0.0002]])
        self.bo = torch.tensor([-0.0018])
        self.Wch = torch.tensor([[-0.0039]])  # candidate cell state
        self.Wcx = torch.tensor([[0.0162]])
        self.bc = torch.tensor([0.0202])
        self.Wp = torch.tensor([[-0.0563]])   # output projection
        self.bp = torch.tensor([-0.1027])
        # self.Wfh, self.Wfx, self.bf = self.Weight_bias(self.input_size, self.hidden_size)
        # self.Wih, self.Wix, self.bi = self.Weight_bias(self.input_size, self.hidden_size)
        # self.Woh, self.Wox, self.bo = self.Weight_bias(self.input_size, self.hidden_size)
        # self.Wch, self.Wcx, self.bc = self.Weight_bias(self.input_size, self.hidden_size)
        # self.Wp = torch.randn(self.hidden_size, self.output_size) * 0.01
        # self.bp = torch.randn(self.output_size) * 0.01
        self.f = torch.zeros(self.batch_size, self.hidden_size)
        self.i = torch.zeros(self.batch_size, self.hidden_size)
        self.o = torch.zeros(self.batch_size, self.hidden_size)
        self.ct = torch.zeros(self.batch_size, self.hidden_size)
        self.h = torch.zeros(self.batch_size, self.hidden_size)
        self.c = torch.zeros(self.batch_size, self.hidden_size)
        self.fList = []
        self.iList = []
        self.oList = []
        self.ctList = []
        self.hList = []
        self.cList = []
        self.preList = []
        self.fList.append(self.f)
        self.iList.append(self.i)
        self.oList.append(self.o)
        self.ctList.append(self.ct)
        self.hList.append(self.h)
        self.cList.append(self.c)
print("__init__:");
print("self.Wfh:");
print(self.Wfh.shape);
print("self.Wfx:");
print(self.Wfx.shape);
print("self.bf:");
print(self.bf.shape);
print("self.Wih:");
print(self.Wih.shape);
print("self.Wix:");
print(self.Wix.shape);
print("self.bi:");
print(self.bi.shape);
print("self.Woh:");
print(self.Woh.shape);
print("self.Wox:");
print(self.Wox.shape);
print("self.bo:");
print(self.bo.shape);
print("self.Wch:");
print(self.Wch.shape);
print("self.Wcx:");
print(self.Wcx.shape);
print("self.bc:");
print(self.bc.shape);
print("self.h:");
print(self.h.shape);
print("self.c:");
print(self.c.shape);
    def Weight_bias(self, input_size, hidden_size):
        # Returns (hidden-to-hidden weight, input-to-hidden weight, bias).
        return (torch.randn(hidden_size, hidden_size) * 0.01,
                torch.randn(input_size, hidden_size) * 0.01,
                torch.randn(hidden_size) * 0.01)
    def forward(self, x):
        # x has shape (timesteps, batch_size, input_size), so x[i] is the
        # batch of inputs at timestep i.
        for i in range(self.timesteps):
            self.times += 1
            self.f = self.Sigmoid_forward(self.hList[-1] @ self.Wfh + x[i] @ self.Wfx + self.bf)
            self.i = self.Sigmoid_forward(self.hList[-1] @ self.Wih + x[i] @ self.Wix + self.bi)
            self.o = self.Sigmoid_forward(self.hList[-1] @ self.Woh + x[i] @ self.Wox + self.bo)
            self.ct = self.Tanh_forward(self.hList[-1] @ self.Wch + x[i] @ self.Wcx + self.bc)
            self.c = self.f * self.cList[-1] + self.i * self.ct
            self.h = self.o * self.Tanh_forward(self.c)
            self.fList.append(self.f)
            self.iList.append(self.i)
            self.oList.append(self.o)
            self.ctList.append(self.ct)
            self.hList.append(self.h)
            self.cList.append(self.c)
        print("self.h:")
        print(self.h)
        return self.prediction()
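    # For reference, the forward pass above follows the standard LSTM
    # equations (sigma = sigmoid, * = elementwise product):
    #   f_t  = sigma(h_{t-1} Wfh + x_t Wfx + bf)
    #   i_t  = sigma(h_{t-1} Wih + x_t Wix + bi)
    #   o_t  = sigma(h_{t-1} Woh + x_t Wox + bo)
    #   ct_t = tanh(h_{t-1} Wch + x_t Wcx + bc)
    #   c_t  = f_t * c_{t-1} + i_t * ct_t
    #   h_t  = o_t * tanh(c_t)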
    def prediction(self):
        pre = self.hList[-1] @ self.Wp + self.bp
        self.preList.append(pre)
        return pre
    def backward(self, x, grad):
        # Gradient accumulators must start from zero before they are summed
        # over the timesteps in compute_Weight_bias_backward.
        self.delta_Wfh = torch.zeros(self.hidden_size, self.hidden_size)
        self.delta_Wfx = torch.zeros(self.input_size, self.hidden_size)
        self.delta_bf = torch.zeros(self.hidden_size)
        self.delta_Wih = torch.zeros(self.hidden_size, self.hidden_size)
        self.delta_Wix = torch.zeros(self.input_size, self.hidden_size)
        self.delta_bi = torch.zeros(self.hidden_size)
        self.delta_Woh = torch.zeros(self.hidden_size, self.hidden_size)
        self.delta_Wox = torch.zeros(self.input_size, self.hidden_size)
        self.delta_bo = torch.zeros(self.hidden_size)
        self.delta_Wch = torch.zeros(self.hidden_size, self.hidden_size)
        self.delta_Wcx = torch.zeros(self.input_size, self.hidden_size)
        self.delta_bc = torch.zeros(self.hidden_size)
        self.delta_hList = self.init_delta()
        self.delta_cList = self.init_delta()
        self.delta_fList = self.init_delta()
        self.delta_iList = self.init_delta()
        self.delta_oList = self.init_delta()
        self.delta_ctList = self.init_delta()
        self.delta_hList[-1] = grad
        for k in range(self.times, 0, -1):
            self.compute_gate_backward(x, k)
        self.compute_Weight_bias_backward(x)
    def init_delta(self):
        # One zero tensor per timestep, plus the initial state at index 0.
        X = []
        for i in range(self.times + 1):
            X.append(torch.zeros(self.batch_size, self.hidden_size))
        return X
    def compute_gate_backward(self, x, k):
        f = self.fList[k]
        i = self.iList[k]
        o = self.oList[k]
        ct = self.ctList[k]
        h_prev = self.hList[k - 1]  # hidden state that fed the gates at step k
        c = self.cList[k]
        c_pre = self.cList[k - 1]
        delta_hk = self.delta_hList[k]
        if k == self.times:
            # the forget gate of step k+1 does not exist at the last step
            delta_ck = delta_hk * o * self.Tanh_backward(c)
        else:
            f_for = self.fList[k + 1]
            delta_ck = delta_hk * o * self.Tanh_backward(c) + self.delta_cList[k + 1] * f_for
        delta_ctk = delta_ck * i
        delta_fk = delta_ck * c_pre
        delta_ik = delta_ck * ct
        delta_ok = delta_hk * self.Tanh_forward(c)
        delta_hkpre = (delta_fk * self.Sigmoid_backward(h_prev @ self.Wfh + x[k - 1] @ self.Wfx + self.bf) * self.Wfh
                       + delta_ik * self.Sigmoid_backward(h_prev @ self.Wih + x[k - 1] @ self.Wix + self.bi) * self.Wih
                       + delta_ok * self.Sigmoid_backward(h_prev @ self.Woh + x[k - 1] @ self.Wox + self.bo) * self.Woh
                       + delta_ctk * self.Tanh_backward(h_prev @ self.Wch + x[k - 1] @ self.Wcx + self.bc) * self.Wch)
        self.delta_hList[k - 1] = delta_hkpre
        self.delta_cList[k] = delta_ck
        self.delta_fList[k] = delta_fk
        self.delta_iList[k] = delta_ik
        self.delta_oList[k] = delta_ok
        self.delta_ctList[k] = delta_ctk
    def compute_Weight_bias_backward(self, x):
        for t in range(self.times, 0, -1):
            h_prev = self.hList[t - 1]  # hidden state that fed the gates at step t
            delta_Wfh = self.delta_fList[t] * self.Sigmoid_backward(h_prev @ self.Wfh + x[t - 1] @ self.Wfx + self.bf) * h_prev
            delta_Wfx = self.delta_fList[t] * self.Sigmoid_backward(h_prev @ self.Wfh + x[t - 1] @ self.Wfx + self.bf) * x[t - 1]
            delta_bf = self.delta_fList[t] * self.Sigmoid_backward(h_prev @ self.Wfh + x[t - 1] @ self.Wfx + self.bf)
            delta_Wih = self.delta_iList[t] * self.Sigmoid_backward(h_prev @ self.Wih + x[t - 1] @ self.Wix + self.bi) * h_prev
            delta_Wix = self.delta_iList[t] * self.Sigmoid_backward(h_prev @ self.Wih + x[t - 1] @ self.Wix + self.bi) * x[t - 1]
            delta_bi = self.delta_iList[t] * self.Sigmoid_backward(h_prev @ self.Wih + x[t - 1] @ self.Wix + self.bi)
            delta_Wch = self.delta_ctList[t] * self.Tanh_backward(h_prev @ self.Wch + x[t - 1] @ self.Wcx + self.bc) * h_prev
            delta_Wcx = self.delta_ctList[t] * self.Tanh_backward(h_prev @ self.Wch + x[t - 1] @ self.Wcx + self.bc) * x[t - 1]
            delta_bc = self.delta_ctList[t] * self.Tanh_backward(h_prev @ self.Wch + x[t - 1] @ self.Wcx + self.bc)
            delta_Woh = self.delta_oList[t] * self.Sigmoid_backward(h_prev @ self.Woh + x[t - 1] @ self.Wox + self.bo) * h_prev
            delta_Wox = self.delta_oList[t] * self.Sigmoid_backward(h_prev @ self.Woh + x[t - 1] @ self.Wox + self.bo) * x[t - 1]
            delta_bo = self.delta_oList[t] * self.Sigmoid_backward(h_prev @ self.Woh + x[t - 1] @ self.Wox + self.bo)
            self.delta_Wfh += delta_Wfh
            self.delta_Wfx += delta_Wfx
            self.delta_bf += delta_bf
            self.delta_Wih += delta_Wih
            self.delta_Wix += delta_Wix
            self.delta_bi += delta_bi
            self.delta_Wch += delta_Wch
            self.delta_Wcx += delta_Wcx
            self.delta_bc += delta_bc
            self.delta_Woh += delta_Woh
            self.delta_Wox += delta_Wox
            self.delta_bo += delta_bo
    def update(self, lr):
        self.Wfh -= self.delta_Wfh * lr
        self.Wfx -= self.delta_Wfx * lr
        self.bf -= self.delta_bf * lr
        self.Wih -= self.delta_Wih * lr
        self.Wix -= self.delta_Wix * lr
        self.bi -= self.delta_bi * lr
        self.Woh -= self.delta_Woh * lr
        self.Wox -= self.delta_Wox * lr
        self.bo -= self.delta_bo * lr
        self.Wch -= self.delta_Wch * lr
        self.Wcx -= self.delta_Wcx * lr
        self.bc -= self.delta_bc * lr
    def reset(self):
        # Note: only the hidden/cell state histories are cleared here; the
        # gate lists (fList, iList, ...) keep growing across calls.
        self.times = 0
        self.hList = [torch.zeros(self.batch_size, self.hidden_size)]
        self.cList = [torch.zeros(self.batch_size, self.hidden_size)]
    def Sigmoid_forward(self, x):
        return 1.0 / (1.0 + torch.exp(-x))
    def Sigmoid_backward(self, x):
        # Derivative w.r.t. the pre-activation x, matching how Tanh_backward
        # is defined below (the call sites pass pre-activations).
        s = self.Sigmoid_forward(x)
        return s * (1 - s)
    def Tanh_forward(self, x):
        return (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
    def Tanh_backward(self, x):
        return 1 - self.Tanh_forward(x) * self.Tanh_forward(x)
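# Optional numerical check of the hand-written derivative (an addition for
# verification, not part of the training loop): autograd should agree with
# the analytic sigmoid derivative at a sample pre-activation.
_z = torch.tensor([0.3], requires_grad=True)
torch.sigmoid(_z).backward()
_s = torch.sigmoid(torch.tensor([0.3]))
assert torch.allclose(_z.grad, _s * (1 - _s))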
l = LSTM(timesteps, batch_size, 1, 1, 1)
lr = 0.01
lossList = []
for epoch in range(Epoch):
    loss = 0
    for i in range(len(train_X)):
        # permute to (timesteps, batch_size, input_size) so x[t] indexes a timestep
        x = train_X[i].permute(1, 0, 2)
        pre = l.forward(x)  # forward() already returns the prediction
        print("pre:")
        print(pre)
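        # pre has shape (batch_size, output_size) = (16, 1):
        # one prediction per sequence in the current batch.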