I have time series data and I am using a recurrent model to predict output sequences from input sequences, where the sequence lengths differ from batch to batch.
import torch

class subDataset(torch.utils.data.Dataset):
    # Data size of (batch_size, seq_len, input_size)
    def __init__(self, Data, Target):
        super().__init__()
        data, target, seq_len = self._preprocess(Data, Target)
        self.data = data
        self.target = target
        self.seq_len = seq_len

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, index):
        data = self.data[index, :]
        target = self.target[index, :]
        return data, target, self.seq_len[index]

    @staticmethod
    def _preprocess(data, target):
        # Stack each variable-length sequence into a (seq_len_i, input_size) tensor.
        x = [torch.stack(list(seq), dim=0) for seq in data]
        y = [torch.stack(list(seq), dim=0) for seq in target]
        # Zero-pad every sequence to the length of the longest one.
        x_padded = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0)
        y_padded = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)
        seq_len = torch.LongTensor(list(map(len, x)))
        print(f"maximum sequence length {torch.max(seq_len)} {seq_len} {x_padded.shape}")
        # Padded size: (batch_size, max_seq_len, input_size)
        return x_padded, y_padded, seq_len
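For context, this is roughly how I build the loader (the nested lists of per-timestep tensors here are toy placeholders, not my real data):

# Toy example: three sequences of lengths 4, 2 and 3; two input features, one target.
Data = [[torch.randn(2) for _ in range(n)] for n in (4, 2, 3)]
Target = [[torch.randn(1) for _ in range(n)] for n in (4, 2, 3)]

dataset = subDataset(Data, Target)
loader_train = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

for u, y, s in loader_train:
    # u: (batch, max_seq_len, 2), y: (batch, max_seq_len, 1), s: true lengths
    print(u.shape, y.shape, s)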
When I train the model, the mean of the predicted outputs is slightly off compared to the mean of the true outputs.
I also use the following class to normalize the inputs and outputs.
import torch.nn as nn

class Normalizer1D(nn.Module):
    _epsilon = 1e-16
    # Data size of (batch_size, seq_len, input_size); scale/offset are per-feature.
    def __init__(self, scale, offset):
        super().__init__()
        self.register_buffer('scale', torch.tensor(scale, dtype=torch.float32) + self._epsilon)
        self.register_buffer('offset', torch.tensor(offset, dtype=torch.float32))

    def normalize(self, x):
        # (B, T, D)
        return (x - self.offset) / self.scale

    def unnormalize(self, x):
        # (B, T, D)
        return x * self.scale + self.offset

    def unnormalize_mean(self, x_mu):
        # (B, T, D)
        return x_mu * self.scale + self.offset

    def unnormalize_sigma(self, x_sigma):
        # (B, T, D): a standard deviation is only rescaled, never shifted.
        return x_sigma * self.scale
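To make the intended usage concrete, here is a minimal round-trip check (the scale and offset values are made up):

norm = Normalizer1D(scale=[2.0], offset=[5.0])   # one feature dimension
x = torch.randn(8, 16, 1) * 2.0 + 5.0            # (batch, seq_len, 1)
x_back = norm.unnormalize(norm.normalize(x))     # recovers x up to _epsilon
print(torch.allclose(x, x_back, atol=1e-4))      # True

The actual scale and offset come from compute_normalizer below: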
import numpy as np

def compute_normalizer(loader_train):
    # Loader yields (u, y, s) with u, y of size (batch_size, seq_len, dim).
    variance_scaler = 1
    # initialization
    total_batches = 0
    u_mean, y_mean = 0, 0
    u_var, y_var = 0, 0
    for u, y, s in loader_train:
        # Count batches, since each term accumulated below is a per-batch average
        # (the original counted samples, which over-divides when batch_size > 1).
        total_batches += 1
        # Mean over batch and time; note the padded zeros are included here.
        u_mean += torch.mean(u, dim=(0, 1))
        y_mean += torch.mean(y, dim=(0, 1))
        # Per-sequence variance over time, averaged over the batch.
        u_var += torch.mean(torch.var(u, dim=1, unbiased=False), dim=0)
        y_var += torch.mean(torch.var(y, dim=1, unbiased=False), dim=0)
    u_mean = u_mean.numpy() / total_batches
    y_mean = y_mean.numpy() / total_batches
    u_var = u_var.numpy() / total_batches
    y_var = y_var.numpy() / total_batches
    u_normalizer = Normalizer1D(np.sqrt(u_var) * variance_scaler, u_mean)
    y_normalizer = Normalizer1D(np.sqrt(y_var) * variance_scaler, y_mean)
    return u_normalizer, y_normalizer
However, I think compute_normalizer has a problem: it does not take into account that a big chunk of the data is padding and contains zeros, which biases the mean and variance toward zero. Does anyone have a suggestion for how to properly normalize data with variable sequence lengths in PyTorch?
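To illustrate what I am after, here is a sketch of how I imagine the statistics could be computed over the valid timesteps only, using a mask built from seq_len (untested, and I am not sure it is the idiomatic way):

def compute_masked_normalizer(loader_train):
    # Accumulate per-feature sums over valid (unpadded) timesteps only.
    total_steps = 0
    u_sum, u_sq_sum = 0.0, 0.0
    y_sum, y_sq_sum = 0.0, 0.0
    for u, y, s in loader_train:
        # mask: (batch, max_seq_len, 1), 1.0 where t < seq_len, else 0.0
        mask = (torch.arange(u.size(1))[None, :] < s[:, None]).unsqueeze(-1).to(u.dtype)
        total_steps += mask.sum()
        u_sum += (u * mask).sum(dim=(0, 1))
        u_sq_sum += (u.pow(2) * mask).sum(dim=(0, 1))
        y_sum += (y * mask).sum(dim=(0, 1))
        y_sq_sum += (y.pow(2) * mask).sum(dim=(0, 1))
    u_mean = u_sum / total_steps
    y_mean = y_sum / total_steps
    # Var(x) = E[x^2] - E[x]^2, clamped against tiny negative rounding errors.
    u_std = (u_sq_sum / total_steps - u_mean.pow(2)).clamp_min(0.0).sqrt()
    y_std = (y_sq_sum / total_steps - y_mean.pow(2)).clamp_min(0.0).sqrt()
    return (Normalizer1D(u_std.numpy(), u_mean.numpy()),
            Normalizer1D(y_std.numpy(), y_mean.numpy()))

Would this be the right approach, or is there a built-in way to do it?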