How to make a single-row inference with a pretrained time series model in PyTorch

Kalyan.Banik · June 16, 2020, 3:59pm

Hello, I am very new to PyTorch and need some ideas about running a single inference on a pretrained model. Recently I implemented the DeepAR time series model for a point-of-interest use case in PyTorch, and it worked well for me, as described. What I need now is to run inference on the pretrained model. My idea is to pass a timestamp range, such as:

inference(start_time, end_time):

     return prediction data frame/dictionary/list

They basically use fixed train and test windows of 192 steps in the params.json file, and they predict the next 24 hours. But I want to infer the prediction for a specific window. My statement is very general here; I need some ideas on how to approach this. Here is the GitHub repo link: https://github.com/zhykoties/TimeSeries

ptrblck · June 17, 2020, 7:21am

Based on your description it seems you would have to change the inference method in the repository, which uses a fixed time window and try to use your time stamps instead.

Where are you stuck at the moment and what have you tried so far?

Kalyan.Banik · June 17, 2020, 9:07am

Thanks @ptrblck for your reply. So far I have created an inference module based on their evaluate.py, which evaluates a model using the saved trained weights.

#Workflow
[preprocess_iference_set.py]----->process and save inferwindowset–>[infer_dataset #folder]
^
|
|
|
load .npyfile
|
|
[inferencemoudule]<- call infer method----------------- [inferencetest.py]

I have just followed their code and modified some thing

preprocces

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def prep_data(data, covariates, data_start, train=True):
    """Cut every series into fixed-length windows and save them as ``.npy`` files.

    Reads the module-level settings ``window_size``, ``stride_size``,
    ``num_series``, ``num_covariates``, ``total_time``, ``save_path`` and
    ``save_name``, which are assigned in the ``__main__`` section.

    Args:
        data: 2-D array indexed as ``data[time, series]``.
        covariates: 2-D array indexed as ``covariates[time, covariate]``.
            Column 0 is recomputed here as the z-scored age of each series;
            the caller's array is left untouched.
        data_start: Per-series index of the first time step to use.
        train: When true, windows start at ``data_start`` of each series and
            the labels are scaled; otherwise windows start at index 0 and
            only the last ``len(data)`` covariate rows are used.

    Saves three arrays under ``save_path`` with the prefix ``train_`` or
    ``test_``: the model input, the per-window scale factors and the labels.
    """
    time_len = data.shape[0]
    input_size = window_size-stride_size
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    if train:
        # Series that start late contribute fewer windows.
        windows_per_series -= (data_start+stride_size-1) // stride_size
    total_windows = np.sum(windows_per_series)
    # Short summary only; dumping the full arrays floods the console.
    print("time_len:", time_len, "input_size:", input_size,
          "stride_size:", stride_size, "num_series:", num_series,
          "total_windows:", total_windows)

    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
    label = np.zeros((total_windows, window_size), dtype='float32')
    # Column 0 holds the per-window scale; column 1 is never written and stays 0.
    v_input = np.zeros((total_windows, 2), dtype='float32')
    #cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
    #cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series

    # Work on a private copy: the age column is overwritten per series below
    # and the caller's covariates must not change as a side effect.
    if train:
        covariates = covariates.copy()
    else:
        covariates = covariates[-time_len:].copy()

    count = 0
    for series in trange(num_series):
        cov_age = stats.zscore(np.arange(total_time-data_start[series]))
        if train:
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
        else:
            covariates[:, 0] = cov_age[-time_len:]
        for i in range(windows_per_series[series]):
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start+window_size

            # The target is shifted by one step: position 0 stays 0 (z_0 = 0).
            x_input[count, 1:, 0] = data[window_start:window_end-1, series]
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
            x_input[count, :, -1] = series
            label[count, :] = data[window_start:window_end, series]

            # Scale by the mean of the non-zero conditioning values, plus one.
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
            if nonzero_sum == 0:
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1

    prefix = os.path.join(save_path, 'train_' if train else 'test_')
    np.save(prefix+'data_'+save_name, x_input)
    np.save(prefix+'v_'+save_name, v_input)
    np.save(prefix+'label_'+save_name, label)

def gen_covariates(times, num_covariates):
    """Build the standardized time covariates for every time step.

    Column 0 is left at zero (the age covariate is filled in by
    ``prep_data``). Columns 1, 2 and 3 hold weekday, hour and month, as far
    as ``num_covariates`` leaves room for them. Every column from 1 onward
    is z-scored; a constant column becomes all zeros instead of NaN.

    Args:
        times: Sequence of datetime-like objects exposing ``shape``,
            ``weekday()``, ``hour`` and ``month`` (e.g. a DatetimeIndex).
        num_covariates: Number of covariate columns to produce.

    Returns:
        Array of shape ``(len(times), num_covariates)``.
    """
    covariates = np.zeros((times.shape[0], num_covariates))
    extractors = (
        lambda moment: moment.weekday(),
        lambda moment: moment.hour,
        lambda moment: moment.month,
    )
    # Only fill the columns that exist, so num_covariates < 4 no longer
    # raises IndexError.
    used = extractors[:max(num_covariates - 1, 0)]
    for i, input_time in enumerate(times):
        for col, extract in enumerate(used, start=1):
            covariates[i, col] = extract(input_time)
    for col in range(1, num_covariates):
        column = covariates[:, col]
        if column.size and column.min() == column.max():
            # zscore divides by a zero std here and would return NaN, e.g.
            # for the month column of a range that lies within one month.
            covariates[:, col] = 0.0
        else:
            covariates[:, col] = stats.zscore(column)
    return covariates

def visualize(data, week_start):
    """Plot one window of ``data`` starting at ``week_start`` into visual.png.

    The window length is the module-level ``window_size``.
    """
    week_end = week_start + window_size
    steps = np.arange(window_size)
    figure = plt.figure()
    plt.plot(steps, data[week_start:week_end], color='b')
    figure.savefig("visual.png")
    plt.close()
    
    





#global save_path
if __name__ == '__main__':
    # Where the inference windows are written; prep_data() reads these names
    # as module globals.
    save_name = 'infer_dataset'
    save_path = os.path.join('/Users/kalyan.admin/KalyanWork/DeepAR_demo/data', save_name)
    # np.save does not create missing directories.
    os.makedirs(save_path, exist_ok=True)

    window_size = 192
    stride_size = 24
    num_covariates = 4

    # Range to run inference on. It must span at least window_size hours,
    # otherwise no window can be built.
    infer_start = '2020-04-25 00:00:00'
    infer_end = '2020-05-12 23:00:00'

    data_frame_testbase = pd.read_csv('/Users/kalyan.admin/TimeseriesDataset/test_amount_12.csv')
    data_frame_testbase['date_TRXHOUR'] = pd.to_datetime(data_frame_testbase['date_TRXHOUR'], format='%Y-%m-%d %H:%M:%S')
    data_frame_testbase = data_frame_testbase.set_index('date_TRXHOUR')
    # Hourly totals, restricted to the inference range.
    data_frame_testbase = data_frame_testbase.resample('1H', label='left', closed='right').sum()[infer_start:infer_end]

    covariates = gen_covariates(data_frame_testbase.index, num_covariates)
    test_data = data_frame_testbase.values
    total_time = data_frame_testbase.shape[0]
    num_series = data_frame_testbase.shape[1]

    # With train=False, prep_data only uses data_start to size the age
    # covariate, which must cover all total_time rows. Deriving it from the
    # first non-zero value of the inference slice made that vector too short
    # (shape error) whenever a series begins with a zero, so start every
    # series at 0.
    # NOTE(review): the age covariate is therefore relative to the inference
    # range, not to the training history - confirm this is acceptable.
    data_start = np.zeros(num_series, dtype=int)

    prep_data(test_data, covariates, data_start, train=False)

This is actually what they did for preprocessing the data. I have just loaded a new CSV file and converted the data from this time range into .npy files. There is no training data here; it is purely for inference.

then

here is my inference class

Inferencemodule.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 16:53:31 2020

@author: kalyan.admin
"""

import os

import numpy as np
import torch
from torch.utils.data.sampler import RandomSampler
from tqdm import tqdm

import utils
import model.net as net
from dataloader import *

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd

logger = logging.getLogger('DeepAR.Eval')


class Inference:
    """Inference helpers adapted from the repository's evaluate.py.

    Both methods are static; call them as ``Inference.infer(...)`` and
    ``Inference.predict_plot_windows(...)``.
    """

    @staticmethod
    def predict_plot_windows(plot_dir,
                             predict_values,
                             predict_sigma,
                             labels,
                             window_size,
                             predict_start,
                             plot_num,
                             plot_metrics,
                             sampling=False):
        """Plot 20 windows, export each forecast to CSV and return the last one.

        Args:
            plot_dir: Directory the figure ``<plot_num>.png`` is saved to.
            predict_values: Array ``[window, time]`` of predicted means; at
                least 20 windows are read.
            predict_sigma: Array ``[window, time]`` of predicted deviations.
            labels: Array ``[window, time]`` of true values.
            window_size: Number of time steps on the x axis.
            predict_start: First time step of the forecast horizon.
            plot_num: Used as the figure file name.
            plot_metrics: Mapping with per-window ``"ND"`` and ``"RMSE"``
                (plus ``"rou90"``/``"rou50"`` when ``sampling`` is true).
            sampling: Whether the sampling metrics are shown in the titles.

        Returns:
            DataFrame with columns y_true, y_pred, y_lower, y_upper,
            y2_lower, y2_upper for the last plotted window. Every window is
            also written to its own CSV file.
        """
        x = np.arange(window_size)
        print("window size", x)
        f = plt.figure(figsize=(8, 42), constrained_layout=True)
        nrows = 21
        ncols = 1
        ax = f.subplots(nrows, ncols)
        summarydf = None
        for k in range(nrows):
            if k == 10:
                # Row 10 is only a visual separator between the two groups.
                ax[k].plot(x, x, color='g')
                ax[k].plot(x, x[::-1], color='g')
                ax[k].set_title('This separates top 10 and bottom 90', fontsize=10)
                continue
            m = k if k < 10 else k - 1
            print('iteration', k)

            forecast = predict_values[m, predict_start:]
            spread = predict_sigma[m, predict_start:]
            named_columns = [
                ('y_true', labels[m, predict_start:]),
                ('y_pred', forecast),
                ('y_lower', forecast - spread),
                ('y_upper', forecast + spread),
                ('y2_lower', forecast - 2 * spread),
                ('y2_upper', forecast + 2 * spread),
            ]
            summarydf = pd.concat(
                [pd.DataFrame(values, columns=[title]) for title, values in named_columns],
                axis=1)
            print(summarydf)
            summarydf.to_csv('/Users/kalyan.admin/KalyanWork/DeepAR_demo/experiments/csvfiles_inferenceset/pytorch_report_"%s".csv'%(k))

            # Full window: prediction, +/- 2 sigma band over the horizon, truth.
            ax[k].plot(x, predict_values[m], color='b')
            ax[k].fill_between(x[predict_start:], forecast - 2 * spread,
                               forecast + 2 * spread, color='blue',
                               alpha=0.2)
            ax[k].plot(x, labels[m, :], color='r')
            ax[k].axvline(predict_start, color='g', linestyle='dashed')

            plot_metrics_str = f'ND: {plot_metrics["ND"][m]: .3f} ' \
                f'RMSE: {plot_metrics["RMSE"][m]: .3f}'
            if sampling:
                plot_metrics_str += f' rou90: {plot_metrics["rou90"][m]: .3f} ' \
                                    f'rou50: {plot_metrics["rou50"][m]: .3f}'

            ax[k].set_title(plot_metrics_str, fontsize=10)

        f.savefig(os.path.join(plot_dir, str(plot_num) + '.png'))
        # Close before returning, otherwise the figure is never released.
        plt.close()
        return summarydf

    @staticmethod
    def infer(model, loss_fn, test_loader, params, plot_num, sample=False):
        """Run the model over ``test_loader`` and return one forecast frame.

        Args:
            model: Network exposing ``init_hidden``, ``init_cell``, ``test``
                and a step-wise forward call.
            loss_fn: Unused; kept for signature compatibility.
            test_loader: Yields ``(test_batch, id_batch, v, labels)`` with
                test_batch ``[batch_size, train_window, 1+cov_dim]``,
                id_batch ``[batch_size]``, v ``[batch_size, 2]`` and
                labels ``[batch_size, train_window]``.
            params: Needs ``device``, ``test_predict_start``,
                ``test_window`` and ``plot_dir``.
            plot_num: Passed through as the figure name.
            sample: Whether to draw samples in ``model.test``.

        Returns:
            The DataFrame produced by ``predict_plot_windows`` for one
            randomly chosen batch, or ``None`` if that batch was never
            reached. The summary metrics are only logged.
        """
        model.eval()
        final_df = None
        with torch.no_grad():
            print("In model .eval")
            plot_batch = np.random.randint(len(test_loader))
            print("plot_batch value in evaluation", plot_batch)

            summary_metric = {}
            raw_metrics = utils.init_metrics(sample=sample)

            for i, (test_batch, id_batch, v, labels) in enumerate(tqdm(test_loader)):
                test_batch = test_batch.permute(1, 0, 2).to(torch.float32).to(params.device)
                id_batch = id_batch.unsqueeze(0).to(params.device)
                v_batch = v.to(torch.float32).to(params.device)
                labels = labels.to(torch.float32).to(params.device)
                batch_size = test_batch.shape[1]
                input_mu = torch.zeros(batch_size, params.test_predict_start, device=params.device) # scaled
                input_sigma = torch.zeros(batch_size, params.test_predict_start, device=params.device) # scaled
                hidden = model.init_hidden(batch_size)
                cell = model.init_cell(batch_size)

                for t in range(params.test_predict_start):
                    # if z_t is missing, replace it by output mu from the last time step
                    zero_index = (test_batch[t, :, 0] == 0)
                    if t > 0 and torch.sum(zero_index) > 0:
                        test_batch[t, zero_index, 0] = mu[zero_index]

                    mu, sigma, hidden, cell = model(test_batch[t].unsqueeze(0), id_batch, hidden, cell)
                    input_mu[:, t] = v_batch[:, 0] * mu + v_batch[:, 1]
                    input_sigma[:, t] = v_batch[:, 0] * sigma

                if sample:
                    samples, sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell, sampling=True)
                    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, samples, relative = False)
                else:
                    sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell)
                    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, relative = False)

                if i == plot_batch:
                    if sample:
                        sample_metrics = utils.get_metrics(sample_mu, labels, params.test_predict_start, samples, relative = False)
                    else:
                        sample_metrics = utils.get_metrics(sample_mu, labels, params.test_predict_start, relative = False)
                    # select 10 from samples with highest error and 10 from the rest;
                    # keep at least one "top" window so batches under 10 still work
                    top_count = max(1, batch_size // 10)
                    top_10_nd_sample = (-sample_metrics['ND']).argsort()[:top_count]
                    chosen = set(top_10_nd_sample.tolist())
                    all_samples = set(range(batch_size))
                    not_chosen = np.asarray(list(all_samples - chosen))
                    if not_chosen.size == 0:
                        # Single-window batch: nothing is left over, reuse the chosen one.
                        not_chosen = np.asarray(list(chosen))
                    if batch_size < 100: # make sure there are enough unique samples to choose top 10 from
                        random_sample_10 = np.random.choice(top_10_nd_sample, size=10, replace=True)
                    else:
                        random_sample_10 = np.random.choice(top_10_nd_sample, size=10, replace=False)
                    if batch_size < 12: # make sure there are enough unique samples to choose bottom 90 from
                        random_sample_90 = np.random.choice(not_chosen, size=10, replace=True)
                    else:
                        random_sample_90 = np.random.choice(not_chosen, size=10, replace=False)
                    combined_sample = np.concatenate((random_sample_10, random_sample_90))

                    label_plot = labels[combined_sample].data.cpu().numpy()
                    predict_mu = sample_mu[combined_sample].data.cpu().numpy()
                    predict_sigma = sample_sigma[combined_sample].data.cpu().numpy()
                    plot_mu = np.concatenate((input_mu[combined_sample].data.cpu().numpy(), predict_mu), axis=1)
                    plot_sigma = np.concatenate((input_sigma[combined_sample].data.cpu().numpy(), predict_sigma), axis=1)
                    plot_metrics = {_k: _v[combined_sample] for _k, _v in sample_metrics.items()}
                    final_df = Inference.predict_plot_windows(params.plot_dir, plot_mu, plot_sigma, label_plot, params.test_window, params.test_predict_start, plot_num, plot_metrics, sample)

            summary_metric = utils.final_metrics(raw_metrics, sampling=sample)
            metrics_string = '; '.join('{}: {:05.3f}'.format(k, v) for k, v in summary_metric.items())
            logger.info('- Full test metrics: ' + metrics_string)
        return final_df

This infer method calls the static predict_plot_windows() function to return y_true, y_pred, y_lower … y2_upper as a consolidated data frame.

inferencetest.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 17:03:31 2020

@author: kalyan.admin
"""
import os

import numpy as np
import torch
from torch.utils.data.sampler import RandomSampler
from tqdm import tqdm

import utils
import model.net as net
from dataloader import *

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd

logger = logging.getLogger('DeepAR.Eval')
from inferencemodule import Inference

#Inference.Test('hello zntu')



def main():
    """Load the trained checkpoint and run ``Inference.infer`` on the inference set."""
    model_dir = os.path.join('experiments', 'base_model')
    json_path = os.path.join(model_dir, 'params.json')
    data_dir = os.path.join('data', 'infer_dataset')
    assert os.path.isfile(json_path), 'No json configuration file found at {}'.format(json_path)
    params = utils.Params(json_path)
    print(params)
    utils.set_logger(os.path.join(model_dir, 'eval.log'))

    params.sampling = False
    params.model_dir = model_dir
    params.plot_dir = os.path.join(model_dir, 'figures')

    # Use the GPU when one is available.
    if torch.cuda.is_available():
        params.device = torch.device('cuda')
        logger.info('Using Cuda...')
        model = net.Net(params).cuda()
    else:
        params.device = torch.device('cpu')
        logger.info('Not using cuda...')
        model = net.Net(params)

    logger.info('Loading the datasets...')
    test_set = TestDataset(data_dir, 'infer_dataset', params.num_class)
    test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
    print("testloader", type(test_loader))
    logger.info('- done.')

    print('model: ', model)
    loss_fn = net.loss_fn
    logger.info('Starting evaluation')

    utils.load_checkpoint(os.path.join(model_dir, 'best' + '.pkl'), model)
    inference_result = Inference.infer(model, loss_fn, test_loader, params, -1, params.sampling)

    if isinstance(inference_result, pd.DataFrame):
        # infer() hands back the forecast table, not a metrics dict, so it
        # is stored as CSV. NOTE(review): utils.save_dict_to_json presumably
        # expects a flat dict of scalar metrics - confirm against utils.
        inference_result.to_csv(os.path.join(model_dir, 'inference_{}.csv'.format('best')))
    else:
        save_path = os.path.join(model_dir, 'metrics_test_{}.json'.format('best'))
        utils.save_dict_to_json(inference_result, save_path)


# DataLoader workers (num_workers=4) re-import this module when processes are
# started with "spawn" (macOS/Windows); without this guard every worker would
# rerun the whole script.
if __name__ == '__main__':
    main()

This inferencetest script calls the infer method, which takes the test loader (built from the .npy files) and the model, and gets the result.

What I want, or better said, the idea, is
a customised inference function that takes a range of timestamps:

def custinfernce(fromtimestamp, twotimestamp):
return inference output from this range as a data frame
I am stuck at this point.

What I have observed is that the preprocessing part works on fixed windows. As a very novice PyTorch learner, I am very confused about how to modify it. inferencetest reads some specific parameters from the params.json file:

{
    "learning_rate": 1e-3,
    "batch_size": 64,
    "lstm_layers": 3,
    "num_epochs": 20,
    "train_window": 192,
    "test_window": 192,
    "predict_start": 168,
    "test_predict_start": 168,
    "predict_steps": 24,
    "num_class": 370,
    "cov_dim": 4,
    "lstm_hidden_dim": 40,
    "embedding_dim": 20,
    "sample_times": 200,
    "lstm_dropout": 0.1,
    "predict_batch": 256
}

I am sorry, I know it is a lot to ask for help with, but a proper guideline would help… I am confused here. I have done the POC, but I need to make a customised version. It would be very helpful if I could get some good guidelines on how to proceed. Thanks.

ptrblck · June 18, 2020, 2:14am

Based on your description, you could call the complete preprocessing pipeline for these particular time stamps and run the prediction on the input.
The code might need some refactoring e.g. to avoid storing the data first, but it should be doable.

One thing I’m a bit concerned about is the hard-coded test_window.
Do you know, if it needs to have a specific shape, e.g. for preprocessing purposes, or could it be dynamic?
In the former case, your passed time stamps would at least have to be extended to the minimal necessary time window.

Kalyan.Banik · June 18, 2020, 4:58am

@ptrblck In the repository the code is hard-coded. Actually, they mention it in their repository description:
The model is evaluated on the electricity dataset, which contains the electricity consumption of 370 households from 2011 to 2014. Under hourly frequency, we use the first week of September, 2014 as the test set and all time steps prior to that as the train set. Following the experiment design in DeepAR, the window size is chosen to be 192, where the last 24 is the forecasting horizon. History (number of time steps since the beginning of each household), month of the year, day of the week, and hour of the day are used as time covariates. Notice that some households started at different times, so we only use windows that contain non-missing values. For better understanding, the original preprocess.py was like this:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def prep_data(data, covariates, data_start, train=True):
    """Cut every series into fixed-length windows and save them as ``.npy`` files.

    Reads the module-level settings ``window_size``, ``stride_size``,
    ``num_series``, ``num_covariates``, ``total_time``, ``save_path`` and
    ``save_name``, which are assigned in the ``__main__`` section.

    Args:
        data: 2-D array indexed as ``data[time, series]``.
        covariates: 2-D array indexed as ``covariates[time, covariate]``.
            Column 0 is recomputed here as the z-scored age of each series;
            the caller's array is left untouched.
        data_start: Per-series index of the first non-missing time step.
        train: When true, windows start at ``data_start`` of each series and
            the labels are scaled; otherwise windows start at index 0 and
            only the last ``len(data)`` covariate rows are used.

    Saves three arrays under ``save_path`` with the prefix ``train_`` or
    ``test_``: the model input, the per-window scale factors and the labels.
    """
    time_len = data.shape[0]
    input_size = window_size-stride_size
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    if train:
        # Series that start late contribute fewer windows.
        windows_per_series -= (data_start+stride_size-1) // stride_size
    total_windows = np.sum(windows_per_series)
    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
    label = np.zeros((total_windows, window_size), dtype='float32')
    # Column 0 holds the per-window scale; column 1 is never written and stays 0.
    v_input = np.zeros((total_windows, 2), dtype='float32')
    #cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
    #cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series

    # Work on a private copy: the age column is overwritten per series below
    # and the caller's covariates must not change as a side effect.
    if train:
        covariates = covariates.copy()
    else:
        covariates = covariates[-time_len:].copy()

    count = 0
    for series in trange(num_series):
        cov_age = stats.zscore(np.arange(total_time-data_start[series]))
        if train:
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
        else:
            covariates[:, 0] = cov_age[-time_len:]
        for i in range(windows_per_series[series]):
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start+window_size

            # The target is shifted by one step: position 0 stays 0 (z_0 = 0).
            x_input[count, 1:, 0] = data[window_start:window_end-1, series]
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
            x_input[count, :, -1] = series
            label[count, :] = data[window_start:window_end, series]

            # Scale by the mean of the non-zero conditioning values, plus one.
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
            if nonzero_sum == 0:
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1

    prefix = os.path.join(save_path, 'train_' if train else 'test_')
    np.save(prefix+'data_'+save_name, x_input)
    np.save(prefix+'v_'+save_name, v_input)
    np.save(prefix+'label_'+save_name, label)

def _standardize(column):
    """Z-score ``column``; a constant column maps to zeros instead of NaN."""
    if column.size and column.min() == column.max():
        return np.zeros_like(column)
    return stats.zscore(column)


def gen_covariates(times, num_covariates):
    """Build the standardized time covariates for every time step.

    Column 0 is left at zero (the age covariate is filled in by
    ``prep_data``). Columns 1, 2 and 3 hold weekday, hour and month, as far
    as ``num_covariates`` leaves room for them, and are z-scored.

    Args:
        times: Sequence of datetime-like objects exposing ``shape``,
            ``weekday()``, ``hour`` and ``month`` (e.g. a DatetimeIndex).
        num_covariates: Number of covariate columns to produce.

    Returns:
        Array of shape ``(len(times), num_covariates)``.
    """
    covariates = np.zeros((times.shape[0], num_covariates))
    # Columns beyond num_covariates are skipped, so the cov = 3 layout works.
    has_hour = num_covariates > 2
    has_month = num_covariates > 3
    for i, input_time in enumerate(times):
        if num_covariates > 1:
            covariates[i, 1] = input_time.weekday()
        if has_hour:
            covariates[i, 2] = input_time.hour
        if has_month:
            covariates[i, 3] = input_time.month
    for i in range(1, num_covariates):
        covariates[:, i] = _standardize(covariates[:, i])
    return covariates

def visualize(data, week_start):
    """Save a line plot of one ``window_size``-long slice of ``data`` to visual.png."""
    window_stop = week_start + window_size
    time_axis = np.arange(window_size)
    canvas = plt.figure()
    plt.plot(time_axis, data[week_start:window_stop], color='b')
    canvas.savefig("visual.png")
    plt.close()

if __name__ == '__main__':
    # Dataset identity and output location (read as globals by prep_data).
    name = 'LD2011_2014.txt'
    save_name = 'elect'
    save_path = os.path.join('data', save_name)

    # Windowing configuration.
    window_size = 192
    stride_size = 24
    num_covariates = 4
    pred_days = 7
    given_days = 7

    # The test range starts 7 days before the end of the training range so
    # that every test window has its conditioning history.
    train_start = '2011-01-01 00:00:00'
    train_end = '2014-08-31 23:00:00'
    test_start = '2014-08-25 00:00:00'
    test_end = '2014-09-07 23:00:00'

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    csv_path = os.path.join(save_path, name)
    if not os.path.exists(csv_path):
        # Fetch and unpack the raw data set on first use.
        zipurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip'
        with urlopen(zipurl) as zipresp, ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(save_path)

    raw_frame = pd.read_csv(csv_path, sep=";", index_col=0, parse_dates=True, decimal=',')
    hourly_frame = raw_frame.resample('1H',label = 'left',closed = 'right').sum()
    data_frame = hourly_frame[train_start:test_end]
    data_frame = data_frame.fillna(0)

    covariates = gen_covariates(data_frame[train_start:test_end].index, num_covariates)
    train_data = data_frame[train_start:train_end].values
    test_data = data_frame[test_start:test_end].values
    # Index of the first nonzero value in each time series.
    data_start = (train_data!=0).argmax(axis=0)
    total_time = data_frame.shape[0] #32304
    num_series = data_frame.shape[1] #370

    prep_data(train_data, covariates, data_start)
    prep_data(test_data, covariates, data_start, train=False)

This stores both the train and test files, e.g. data/elect/train.npy and data/elect/test.npy; in total 6 files were stored.
Please correct me if I am wrong, but for my case I feel that it needs to be dynamic. Regarding test_window, I have seen it hard-coded in the params.json file. I also passed a different number instead of 192, and it can still predict,
but it always shows a 24-hour prediction in both cases. I expected that a test window of 5 would show only a 5-hour prediction.

the output is like this

      y_true      y_pred  ...     y2_lower     y2_upper

0 6.686744e+06 85177600.0 … -576900544.0 747255744.0
1 4.888917e+06 90021016.0 … -571470784.0 751512768.0
2 1.507870e+07 92519352.0 … -567632768.0 752671488.0
3 1.824654e+07 94020880.0 … -564585664.0 752627392.0
4 7.327324e+06 94660928.0 … -562476416.0 751798272.0
5 2.968790e+07 94579640.0 … -561239040.0 750398336.0
6 1.234428e+08 93913760.0 … -560794624.0 748622208.0
7 3.505584e+08 92788040.0 … -561048960.0 746625024.0
8 7.173911e+08 91309272.0 … -561905984.0 744524480.0
9 1.000136e+09 89564064.0 … -563277312.0 742405504.0
10 9.772118e+08 87619696.0 … -565085632.0 740325056.0
11 6.443588e+08 85526472.0 … -567264640.0 738317568.0
12 2.442418e+08 83321008.0 … -569757248.0 736399296.0
13 5.765772e+08 81029616.0 … -572512256.0 734571520.0
14 6.966076e+08 78671512.0 … -575480512.0 732823488.0
15 8.396323e+08 76261480.0 … -578611904.0 731134912.0
16 9.348614e+08 73811656.0 … -581854080.0 729477376.0
17 3.790162e+08 71332320.0 … -585151872.0 727816576.0
18 7.515996e+08 68831560.0 … -588451200.0 726114304.0
19 4.156775e+08 66314284.0 … -591702528.0 724331136.0
20 2.908992e+08 63780448.0 … -594867968.0 722428800.0
21 1.801725e+08 61223460.0 … -597925888.0 720372864.0
22 5.370657e+07 58629428.0 … -600874496.0 718133376.0
23 1.840954e+07 55977044.0 … -603732288.0 715686336.0

[24 rows x 6 columns]