Fluctuating loss function, and too small predictions

Michael_Mech · July 1, 2020, 9:48pm

Hi, I’m a noob at pytorch and just started using it 1 week ago. I am trying to train an RNN on some scaled, modified stock data, using inputs with multiple features to try and predict a single output(I’m interested in only 1 feature of the data). I do this by repeatedly, randomly sampling 3 timesteps to predict the next timestep, and compare it to the labels for the training data, to ultimately train the model, get the loss, and update the weights. The model predicts an output which has the same dimensions as the input, but I coded it to only get the single value at the index of the feature I’m interested in, hence outputting a a value of a single dimension.
However, as I try to train the model, the loss fluctuates greatly(indicated by the graph), and for some reason my predictions, when graphed against the actual targets, are very small. What could be accounting for this? Also, I don’t formally know comp sci, so my code might not be the best(I’m a self-learner).

##load data and modify data
pre_data=pd.read_csv('MSFTmod.csv')
pre_data=pre_data.drop(['date','Unnamed: 0','changeOverTime'],axis=1)
labels=pre_data.columns

scaler = MinMaxScaler(feature_range=(-1, 1))
standardizer=StandardScaler()
data=standardizer.fit_transform(pre_data)
#data=scaler.fit_transform(pre_data)
data=pd.DataFrame(data,columns=labels)

train_data= data[0:round(0.8*data.shape[0])]  #80% of first data will be for training
test_data= data[round(0.8*data.shape[0]):]
test_data=test_data.reset_index().drop('index',axis=1)
change_ind=test_data.columns.get_loc('change')
test_data.head()

#Get random sequence from the train_data
def rand_seq(seq_len=seq_len):
    #seq_len     = 3  # length of an input sequence
    start_index = random.randint(0, len(train_data) - seq_len) 
    end_index   = start_index+seq_len+1 #input takes first 3, target takes last 3
    rand_seq=train_data[start_index:end_index]
    return rand_seq

#Split into input and target
def Extract_Train(batch_size=batch_size):
    seq = rand_seq()
    train_inputs = torch.from_numpy(seq.iloc[:-1,:].values.astype(np.float32)) #input of shape (seq_len, batch, input_size)
    #train_targets = torch.from_numpy(seq.iloc[1:,:].values.astype(np.float32))
    train_targets= torch.from_numpy(seq.iloc[1:,change_ind].values.astype(np.float32))
    train_inputs=torch.reshape(train_inputs,(len(train_inputs), batch_size, input_size))
    train_targets=torch.reshape(train_targets,(-1, batch_size,change_size))
    return train_inputs.to(device), train_targets.to(device)

#get retraining data to retrain initialized hidden-state for predicting
retrain_inputs=torch.from_numpy(train_data.iloc[-5:,:].values.astype(np.float32)) #5 timesteps to iterate over, iterating over 3 times
retrain_inputs=torch.reshape(retrain_inputs,(retrain_inputs.shape[0], batch_size, input_size)) #get testing data from remainder of samples
test_inputs = torch.from_numpy(test_data.iloc[:-1,:].values.astype(np.float32))
#test_targets = torch.from_numpy(test_data.iloc[1:,:].values.astype(np.float32))
test_targets= torch.from_numpy(test_data.iloc[1:,change_ind].values.astype(np.float32))
test_inputs=torch.reshape(test_inputs,(test_inputs.shape[0], batch_size, input_size))
test_targets=torch.reshape(test_targets,(test_targets.shape[0], batch_size, change_size))
#test_targets=torch.reshape(test_targets,(test_targets.shape[0], batch_size, output_size))
test_inputs, test_targets, retrain_inputs= test_inputs.to(device), test_targets.to(device), retrain_inputs.to(device)type or paste code here

class Net(nn.Module):
    def __init__(self,input_size,hidden_size,output_size,batch_size):
        super(Net, self).__init__()
        self.hidden_size=hidden_size
        self.batch_size= batch_size
        self.output_size=output_size
        #self.input_size=input_size
        
        self.rnn= nn.RNNCell(input_size,hidden_size)#,nonlinearity= 'relu')
        self.linear=nn.Linear(self.hidden_size,self.output_size)
    
    def forward(self, inputs, hidden):
        #hidden=self.rnn(inputs,hidden)
        hidden= self.rnn(inputs.detach(),hidden.detach())
        output= self.linear(hidden)
        change=output[0][change_ind]
        return output, hidden, change
    
    def init_hidden(self):
        return torch.zeros(self.batch_size, self.hidden_size).to(device)
        
    
net=Net(input_size,hidden_size,output_size,batch_size)
net.to(device)

## train step that is to be iterated by number of epochs
def train_step(net,opt,loss_func):
    train_inputs, train_targets= Extract_Train()
    seq_len = train_inputs.shape[0]    # Get the sequence length of current input.
    hidden = net.init_hidden()  # Initial hidden state.
    net.zero_grad()             # Clear the gradient.
    loss = 0                    # Initial loss.

    for t in range(seq_len):    # For each one in the input sequence.
        output, hidden, change = net(train_inputs[t], hidden)
        #loss = loss + loss_func(output, train_targets[t])
        loss= loss+ loss_func(change,train_targets[t])
    
    loss.backward()             # Backward. 
    opt.step()                  # Update the weights.
    
    avg_loss= loss / seq_len
    return avg_loss          # Return the average loss w.r.t sequence length.

#Actual test inputs as inputs:
def eval_step(net, loss_func, retrain_inputs,seq_len):
    #seq_len= 3 
    eval_iter=len(test_targets)
    retrain_iter=len(retrain_inputs)
    eval_list=[]
    loss=0
    eval_loss=[]
    retrain_inputs=torch.cat((retrain_inputs,test_inputs[0:seq_len-1]),dim=0)
    net.eval() 
    with torch.no_grad():
        hidden=net.init_hidden()
        for i in range(retrain_iter): 
            retrain=retrain_inputs[i:i+seq_len,:]
            if retrain.shape[0]==seq_len:
                for t in range(seq_len):
                    output, _, change=net(retrain[t],hidden)
                    if i>=2:
                        loss=loss+loss_func(change,test_targets[t])
                    else:
                        continue
                if i>=2:
                    avg_loss=loss/seq_len
                    eval_list.append(change)
                    eval_loss.append(avg_loss)
                    loss=0
                else:
                    continue
            else:
                break
        for i in range(eval_iter):
            retest=test_inputs[i:i+seq_len,:]
            if retest.shape[0]==seq_len:
                for t in range(seq_len):
                    output,_, change= net(retest[t], hidden)
                    loss= loss_func(change, test_targets[t])
                eval_list.append(change)
                avg_loss= loss/seq_len
                eval_loss.append(avg_loss)
                loss=0
            else: 
                break
        return eval_list, eval_loss

train_losses=[]
training_iterations= 8000 #8000
loss_sum=0

for iteration in range(training_iterations):
    avg_loss=train_step(net, opt, loss_func)
    loss_sum= loss_sum + avg_loss
    if (iteration % 100) == 99:
        train_losses.append(loss_sum/100)
        loss_sum=0


eval_list, eval_loss=eval_step(net, loss_func, retrain_inputs,seq_len)

ptrblck · July 4, 2020, 6:10am

Could you check the shape of output before indexing it in this line of code?

change=output[0][change_ind]

I would assume that output should have the shape [batch_size, features], so output[0] would only use the first sample of the batch?

Michael_Mech · July 4, 2020, 9:51pm

It would seem that you’re correct.

How would I go about rectifying this?

ptrblck · July 6, 2020, 4:53am

Based on your current code it seems that train_input as well as train_targets have the temporal dimension in dim0, since you are indexing them as:

for t in range(seq_len):    # For each one in the input sequence.
        output, hidden, change = net(train_inputs[t], hidden)
        loss= loss+ loss_func(change,train_targets[t])

If that’s the case, then train_inputs[t] would have the shape [batch_size, features] after the indexing?

Are you using a batch size of 1, as the last screenshot prints the shape [1, 8]?

I’m not familiar with your use case and I would need to get more information about the input and target shapes as well as about the general use case.

Note that the majority of layers will accept an input in the shape [batch_size, *], so you would have to make sure to keep this shape. If you are indexing the batch dimension, you would only use a single sample instead of the complete batch.

Michael_Mech · July 10, 2020, 7:30am

Oh my bad. Yeah, I was using a batch size of one.

batch_size = 1
input_size= data.shape[1] #num of features
output_size= input_size #in RNN inputs are outputs
hidden_size = 3
seq_len=3

Does this explain why I’m getting such low values for the predictions?