Thanks @ptrblck for your reply. So far I have created an inference module based on their evaluate.py, which evaluates a model using the saved trained weights.
#Workflow
[preprocess_iference_set.py]----->process and save inferwindowset–>[infer_dataset #folder]
^
|
|
|
load .npyfile
|
|
[inferencemodule]<- call infer method----------------- [inferencetest.py]
I have just followed their code and modified a few things.
Preprocessing:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def prep_data(data, covariates, data_start, train = True):
    """Cut ``data`` into fixed-length windows and save them as ``.npy`` files.

    The function reads these module-level names, which the script entry point
    must define before calling it: ``window_size``, ``stride_size``,
    ``num_series``, ``num_covariates``, ``total_time``, ``save_path`` and
    ``save_name``.

    Args:
        data: 2-D array indexed as ``data[time, series]``.
        covariates: 2-D array indexed as ``covariates[time, covariate]``.
            Column 0 is overwritten in place with the z-scored "age" of the
            series currently being processed.
        data_start: per-series index; in train mode windows start from it,
            and in both modes it sets the length of the age covariate.
        train: when True, windows are offset by ``data_start`` and labels are
            scaled; when False, windows start at row 0 and labels stay raw.

    Side effects:
        Writes ``<prefix>data_*``, ``<prefix>v_*`` and ``<prefix>label_*``
        under ``save_path`` (prefix ``train_`` or ``test_``).
    """
    time_len = data.shape[0]
    input_size = window_size-stride_size
    print("input_size",input_size)
    print('numseries',num_series)
    print('timelen:',time_len)
    print('diff',time_len-input_size)
    print('stride_size',stride_size)
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    print("Window_per_series",windows_per_series)
    if train:
        # Drop the windows that would start before each series' first sample.
        windows_per_series -= (data_start+stride_size-1) // stride_size
    total_windows = np.sum(windows_per_series)
    print("Total windows", total_windows)
    # Last axis: [previous target value, covariates..., series id].
    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
    label = np.zeros((total_windows, window_size), dtype='float32')
    v_input = np.zeros((total_windows, 2), dtype='float32')
    count = 0
    if not train:
        print("This is in test part")
        covariates = covariates[-time_len:]
        print('covariate_start_index', covariates[-time_len])
    for series in trange(num_series):
        cov_age = stats.zscore(np.arange(total_time-data_start[series]))
        if train:
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
        else:
            covariates[:, 0] = cov_age[-time_len:]
        for i in range(windows_per_series[series]):
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start+window_size
            # Inputs are the targets shifted by one step; position 0 stays 0.
            x_input[count, 1:, 0] = data[window_start:window_end-1, series]
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
            x_input[count, :, -1] = series
            label[count, :] = data[window_start:window_end, series]
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
            if nonzero_sum == 0:
                # All-zero conditioning range: leave the window unscaled so we
                # never divide by a zero scale factor.
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1
    # np.save does not create missing directories.
    os.makedirs(save_path, exist_ok=True)
    prefix = os.path.join(save_path, 'train_' if train else 'test_')
    np.save(prefix+'data_'+save_name, x_input)
    np.save(prefix+'v_'+save_name, v_input)
    np.save(prefix+'label_'+save_name, label)
def gen_covariates(times, num_covariates):
    """Build z-scored calendar covariates for each timestamp in ``times``.

    Column 0 is left as zeros (the caller fills it in later). Columns 1, 2
    and 3 hold weekday, hour and month, as far as ``num_covariates`` allows;
    any further columns stay zero.

    Args:
        times: sequence of timestamps exposing ``.weekday()``, ``.hour`` and
            ``.month`` (e.g. a pandas ``DatetimeIndex``); must have ``.shape``.
        num_covariates: number of columns in the returned array.

    Returns:
        Float array of shape ``(len(times), num_covariates)``.
    """
    covariates = np.zeros((times.shape[0], num_covariates))
    extractors = (
        lambda stamp: stamp.weekday(),
        lambda stamp: stamp.hour,
        lambda stamp: stamp.month,
    )
    # Only fill the calendar columns that actually exist, so a smaller
    # num_covariates does not index past the end of the array.
    active = extractors[:max(num_covariates - 1, 0)]
    for row, input_time in enumerate(times):
        for col, extract in enumerate(active, start=1):
            covariates[row, col] = extract(input_time)
    for col in range(1, num_covariates):
        column = covariates[:, col]
        if column.std() > 0:
            covariates[:, col] = stats.zscore(column)
        else:
            # A constant column (e.g. a range inside one month) would z-score
            # to NaN (0/0); keep it at zero instead.
            covariates[:, col] = 0.0
    return covariates
def visualize(data, week_start):
    """Plot one window of ``data`` starting at ``week_start`` to ``visual.png``.

    Reads the module-level ``window_size`` for the window length.

    Args:
        data: sequence sliced as ``data[week_start:week_start + window_size]``.
        week_start: index of the first sample to plot.
    """
    x = np.arange(window_size)
    f = plt.figure()
    plt.plot(x, data[week_start:week_start+window_size], color='b')
    f.savefig("visual.png")
    # Close exactly the figure created here so it does not stay in memory.
    plt.close(f)
#global save_path
if __name__ == '__main__':
    # Inference-only preprocessing: turn one time range of a CSV into the
    # test_*.npy files. The names assigned here are module-level globals that
    # prep_data() and visualize() read.
    save_name = 'infer_dataset'
    save_path = os.path.join('/Users/kalyan.admin/KalyanWork/DeepAR_demo/data', save_name)
    os.makedirs(save_path, exist_ok=True)
    window_size = 192
    stride_size = 24
    num_covariates = 4

    # The range must cover the conditioning history as well as the hours to
    # predict (the reference setup kept 7 extra days "as given info").
    infer_start = '2020-04-25 00:00:00'
    infer_end = '2020-05-12 23:00:00'

    data_frame_testbase = pd.read_csv('/Users/kalyan.admin/TimeseriesDataset/test_amount_12.csv')
    data_frame_testbase['date_TRXHOUR'] = pd.to_datetime(data_frame_testbase['date_TRXHOUR'], format='%Y-%m-%d %H:%M:%S')
    data_frame_testbase = data_frame_testbase.set_index('date_TRXHOUR')
    data_frame_testbase = data_frame_testbase.resample('1H',label = 'left',closed = 'right').sum()[infer_start:infer_end]

    # The frame is already restricted to [infer_start, infer_end], so it is
    # used directly instead of being sliced again.
    covariates = gen_covariates(data_frame_testbase.index, num_covariates)
    test_data = data_frame_testbase.values
    total_time = data_frame_testbase.shape[0]
    num_series = data_frame_testbase.shape[1]

    # With train=False, prep_data needs total_time - data_start >= time_len.
    # Here total_time == time_len, so any non-zero start (a series whose first
    # values are 0) would fail with a shape mismatch. Zero is the only start
    # that works, and it is what the previous argmax gave whenever it ran.
    # NOTE(review): the age covariate is then normalised over the inference
    # window only, which presumably differs from training - confirm.
    data_start = np.zeros(num_series, dtype=int)

    prep_data(test_data, covariates, data_start, train=False)
This is essentially what they did for preprocessing the data. I have just loaded a completely new CSV file and converted the chosen time range into .npy files. There is no training data here; it is purely for inference.
Then,
here is my inference class:
Inferencemodule.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 16:53:31 2020
@author: kalyan.admin
"""
import os
import numpy as np
import torch
from torch.utils.data.sampler import RandomSampler
from tqdm import tqdm
import utils
import model.net as net
from dataloader import *
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
logger = logging.getLogger('DeepAR.Eval')
class Inference:
    """Inference helpers adapted from the DeepAR evaluation loop.

    Both methods are static; call them as ``Inference.infer(...)`` and
    ``Inference.predict_plot_windows(...)``.
    """

    @staticmethod
    def predict_plot_windows(plot_dir,
                             predict_values,
                             predict_sigma,
                             labels,
                             window_size,
                             predict_start,
                             plot_num,
                             plot_metrics,
                             sampling=False):
        """Plot 20 windows, write one CSV per window and return the last frame.

        Args:
            plot_dir: directory that receives ``<plot_num>.png``.
            predict_values: array indexed ``[window, time]`` with the means.
            predict_sigma: array indexed ``[window, time]`` with the sigmas.
            labels: array indexed ``[window, time]`` with the ground truth.
            window_size: number of time steps on the x axis.
            predict_start: first time step of the forecast range.
            plot_num: used as the PNG file name.
            plot_metrics: mapping with ``"ND"`` and ``"RMSE"`` entries (plus
                ``"rou90"``/``"rou50"`` when ``sampling``), indexed by window.
            sampling: whether to include the rou metrics in the titles.

        Returns:
            DataFrame for the last plotted window with the columns ``y_true``,
            ``y_pred``, ``y_lower``, ``y_upper``, ``y2_lower``, ``y2_upper``,
            covering time steps from ``predict_start`` onwards.
        """
        x = np.arange(window_size)
        print("window size", x)
        f = plt.figure(figsize=(8, 42), constrained_layout=True)
        nrows = 21
        ncols = 1
        ax = f.subplots(nrows, ncols)
        summarydf = None
        try:
            for k in range(nrows):
                if k == 10:
                    # Row 10 is only a visual separator between the groups.
                    ax[k].plot(x, x, color='g')
                    ax[k].plot(x, x[::-1], color='g')
                    ax[k].set_title('This separates top 10 and bottom 90', fontsize=10)
                    continue
                m = k if k < 10 else k - 1
                print('iteration',k)
                forecast_mu = predict_values[m, predict_start:]
                forecast_sigma = predict_sigma[m, predict_start:]
                summarydf = pd.DataFrame({
                    'y_true': labels[m, predict_start:],
                    'y_pred': forecast_mu,
                    'y_lower': forecast_mu - forecast_sigma,
                    'y_upper': forecast_mu + forecast_sigma,
                    'y2_lower': forecast_mu - 2 * forecast_sigma,
                    'y2_upper': forecast_mu + 2 * forecast_sigma,
                })
                print(summarydf)
                summarydf.to_csv('/Users/kalyan.admin/KalyanWork/DeepAR_demo/experiments/csvfiles_inferenceset/pytorch_report_"%s".csv'%(k))
                # Full window: prediction in blue, +/- 2 sigma band over the
                # forecast range, ground truth in red.
                ax[k].plot(x, predict_values[m], color='b')
                ax[k].fill_between(x[predict_start:], forecast_mu - 2 * forecast_sigma,
                                   forecast_mu + 2 * forecast_sigma, color='blue',
                                   alpha=0.2)
                ax[k].plot(x, labels[m, :], color='r')
                ax[k].axvline(predict_start, color='g', linestyle='dashed')
                plot_metrics_str = f'ND: {plot_metrics["ND"][m]: .3f} ' \
                    f'RMSE: {plot_metrics["RMSE"][m]: .3f}'
                if sampling:
                    plot_metrics_str += f' rou90: {plot_metrics["rou90"][m]: .3f} ' \
                        f'rou50: {plot_metrics["rou50"][m]: .3f}'
                ax[k].set_title(plot_metrics_str, fontsize=10)
            f.savefig(os.path.join(plot_dir, str(plot_num) + '.png'))
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(f)
        return summarydf

    @staticmethod
    def infer(model, loss_fn, test_loader, params, plot_num, sample=False):
        """Run the model over ``test_loader`` and return one batch's frame.

        Metrics are accumulated over all batches and logged. One randomly
        chosen batch is plotted through ``predict_plot_windows``.

        Args:
            model: network exposing ``eval``, ``init_hidden``, ``init_cell``,
                ``test`` and a forward call.
            loss_fn: unused; kept so the signature matches the caller.
            test_loader: yields ``(test_batch, id_batch, v, labels)``.
            params: needs ``device``, ``test_predict_start``, ``test_window``
                and ``plot_dir``.
            plot_num: forwarded to ``predict_plot_windows``.
            sample: whether ``model.test`` should also draw samples.

        Returns:
            The DataFrame returned by ``predict_plot_windows`` for the plotted
            batch, or ``None`` if no batch was plotted.
        """
        model.eval()
        final_df = None
        with torch.no_grad():
            print("In model .eval")
            plot_batch = np.random.randint(len(test_loader))
            print("plot_batch value in evaluation", plot_batch)
            raw_metrics = utils.init_metrics(sample=sample)
            # Test_loader:
            # test_batch ([batch_size, train_window, 1+cov_dim]): z_{0:T-1} + x_{1:T}, note that z_0 = 0;
            # id_batch ([batch_size]): one integer denoting the time series id;
            # v ([batch_size, 2]): scaling factor for each window;
            # labels ([batch_size, train_window]): z_{1:T}.
            for i, (test_batch, id_batch, v, labels) in enumerate(tqdm(test_loader)):
                test_batch = test_batch.permute(1, 0, 2).to(torch.float32).to(params.device)
                id_batch = id_batch.unsqueeze(0).to(params.device)
                v_batch = v.to(torch.float32).to(params.device)
                labels = labels.to(torch.float32).to(params.device)
                batch_size = test_batch.shape[1]
                input_mu = torch.zeros(batch_size, params.test_predict_start, device=params.device) # scaled
                input_sigma = torch.zeros(batch_size, params.test_predict_start, device=params.device) # scaled
                hidden = model.init_hidden(batch_size)
                cell = model.init_cell(batch_size)
                for t in range(params.test_predict_start):
                    # if z_t is missing, replace it by output mu from the last time step
                    zero_index = (test_batch[t,:,0] == 0)
                    if t > 0 and torch.sum(zero_index) > 0:
                        test_batch[t,zero_index,0] = mu[zero_index]
                    mu, sigma, hidden, cell = model(test_batch[t].unsqueeze(0), id_batch, hidden, cell)
                    input_mu[:,t] = v_batch[:, 0] * mu + v_batch[:, 1]
                    input_sigma[:,t] = v_batch[:, 0] * sigma
                if sample:
                    samples, sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell, sampling=True)
                    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, samples, relative = False)
                else:
                    sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell)
                    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, relative = False)
                if i == plot_batch:
                    if sample:
                        sample_metrics = utils.get_metrics(sample_mu, labels, params.test_predict_start, samples, relative = False)
                    else:
                        sample_metrics = utils.get_metrics(sample_mu, labels, params.test_predict_start, relative = False)
                    # select 10 from samples with highest error and 10 from the rest
                    top_10_nd_sample = (-sample_metrics['ND']).argsort()[:batch_size // 10] # hard coded to be 10
                    chosen = set(top_10_nd_sample.tolist())
                    all_samples = set(range(batch_size))
                    not_chosen = np.asarray(list(all_samples - chosen))
                    if batch_size < 100: # make sure there are enough unique samples to choose top 10 from
                        random_sample_10 = np.random.choice(top_10_nd_sample, size=10, replace=True)
                    else:
                        random_sample_10 = np.random.choice(top_10_nd_sample, size=10, replace=False)
                    if batch_size < 12: # make sure there are enough unique samples to choose bottom 90 from
                        random_sample_90 = np.random.choice(not_chosen, size=10, replace=True)
                    else:
                        random_sample_90 = np.random.choice(not_chosen, size=10, replace=False)
                    combined_sample = np.concatenate((random_sample_10, random_sample_90))
                    label_plot = labels[combined_sample].data.cpu().numpy()
                    predict_mu = sample_mu[combined_sample].data.cpu().numpy()
                    predict_sigma = sample_sigma[combined_sample].data.cpu().numpy()
                    plot_mu = np.concatenate((input_mu[combined_sample].data.cpu().numpy(), predict_mu), axis=1)
                    plot_sigma = np.concatenate((input_sigma[combined_sample].data.cpu().numpy(), predict_sigma), axis=1)
                    plot_metrics = {_k: _v[combined_sample] for _k, _v in sample_metrics.items()}
                    final_df = Inference.predict_plot_windows(params.plot_dir, plot_mu, plot_sigma, label_plot, params.test_window, params.test_predict_start, plot_num, plot_metrics, sample)
            summary_metric = utils.final_metrics(raw_metrics, sampling=sample)
            metrics_string = '; '.join('{}: {:05.3f}'.format(k, v) for k, v in summary_metric.items())
            logger.info('- Full test metrics: ' + metrics_string)
        return final_df
This infer method calls the static predict_plot_windows() function, which returns y_true, y_pred, y_lower, …, y2_upper as one consolidated data frame.
inferencetest.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 17:03:31 2020
@author: kalyan.admin
"""
import os
import numpy as np
import torch
from torch.utils.data.sampler import RandomSampler
from tqdm import tqdm
import utils
import model.net as net
from dataloader import *
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
logger = logging.getLogger('DeepAR.Eval')
from inferencemodule import Inference
#Inference.Test('hello zntu')
def main():
    """Load the saved model, run inference on ``infer_dataset``, save the result."""
    model_dir = os.path.join('experiments', 'base_model')
    json_path = os.path.join(model_dir, 'params.json')
    data_dir = os.path.join('data', 'infer_dataset')
    if not os.path.isfile(json_path):
        raise FileNotFoundError('No json configuration file found at {}'.format(json_path))
    params = utils.Params(json_path)
    print(params)
    utils.set_logger(os.path.join(model_dir, 'eval.log'))
    params.sampling = False
    params.model_dir = model_dir
    params.plot_dir = os.path.join(model_dir, 'figures')
    # The figure is saved into plot_dir, which may not exist yet.
    os.makedirs(params.plot_dir, exist_ok=True)
    cuda_exist = torch.cuda.is_available() # use GPU is available
    # Set random seeds for reproducible experiments if necessary
    if cuda_exist:
        params.device = torch.device('cuda')
        # torch.cuda.manual_seed(240)
        logger.info('Using Cuda...')
        model = net.Net(params).cuda()
    else:
        params.device = torch.device('cpu')
        # torch.manual_seed(230)
        logger.info('Not using cuda...')
        model = net.Net(params)
    logger.info('Loading the datasets...')
    test_set = TestDataset(data_dir,'infer_dataset', params.num_class)
    # NOTE(review): RandomSampler shuffles the windows, so outputs do not
    # follow time order - confirm this is wanted for inference.
    test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
    print("testloader",type(test_loader))
    logger.info('- done.')
    print('model: ', model)
    loss_fn = net.loss_fn
    logger.info('Starting evaluation')
    utils.load_checkpoint(os.path.join(model_dir,'best' + '.pkl'), model)
    result = Inference.infer(model, loss_fn, test_loader, params, -1, params.sampling)
    if isinstance(result, pd.DataFrame):
        # infer() hands back a DataFrame, so store it as CSV.
        # NOTE(review): utils.save_dict_to_json presumably expects scalar
        # metric values - confirm before passing a frame to it.
        csv_path = os.path.join(model_dir, 'inference_{}.csv'.format('best'))
        result.to_csv(csv_path, index=False)
    elif result is not None:
        save_path = os.path.join(model_dir, 'metrics_test_{}.json'.format('best'))
        utils.save_dict_to_json(result, save_path)


# DataLoader workers (num_workers=4) re-import this module when processes are
# spawned, so the script body must only run in the parent process.
if __name__ == '__main__':
    main()
This inferencetest script calls the infer method, which takes the test loader (built from the .npy files) and the model, and returns the result.
What I want — or, better said, the idea — is
a customised inference function that takes a range of timestamps:
def custinfernce(fromtimestamp, twotimestamp):
return inference output from this range as a data frame
I am stuck at this point.
From what I have observed, the preprocessing is done in fixed windows. As a complete PyTorch novice, I am very confused about how to modify it. The inference test reads some specific parameters from the params.json file:
{
"learning_rate": 1e-3,
"batch_size": 64,
"lstm_layers": 3,
"num_epochs": 20,
"train_window": 192,
"test_window": 192,
"predict_start": 168,
"test_predict_start": 168,
"predict_steps": 24,
"num_class": 370,
"cov_dim": 4,
"lstm_hidden_dim": 40,
"embedding_dim": 20,
"sample_times": 200,
"lstm_dropout": 0.1,
"predict_batch": 256
}
I am sorry, I know it is a lot to ask of you, but a proper guideline would help — I am confused here. I have done the POC, but I need to make a customised version. It would be very helpful if I could get some good guidelines on how to proceed. Thanks