I’m building an MLP to interpolate data from a time-series .csv file. The problem is that my data looks something like this:
t x y
0.1 3 -
0.2 - 5
0.3 3.1 -
0.4 - -
In other words, I don’t have measurements for both x and y in most rows.
Is there a way for me to use this data without having to use something like pandas.interpolate()?
Here is some relevant code:
How I’m currently using the data:
class TimeSeriesDataset(torch.utils.data.Dataset):
    """Map-style dataset of ``(t, [x, y])`` samples read from a CSV file.

    The CSV must provide the columns ``t`` (seconds), ``x`` and ``y``. A
    ``-`` cell marks a missing measurement.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        interpolate: When true (the default), gaps in ``x``/``y`` are filled
            with pandas' ``method='time'`` interpolation and every row that
            still contains a missing value is dropped. When false, no values
            are invented: missing measurements stay ``NaN`` in the labels and
            only rows without a timestamp or without any measurement are
            dropped. In that mode the training loss has to ignore ``NaN``
            label entries, otherwise the loss itself becomes ``NaN``.

    Attributes:
        data: The cleaned ``pandas.DataFrame``. The sample tensors are built
            from it once in ``__init__``; later edits to ``data`` are not
            reflected in the samples.
    """

    def __init__(self, csv_file, interpolate=True):
        super().__init__()

        # Let the CSV parser turn the '-' placeholder into NaN directly
        # instead of patching object columns after the fact.
        df = pd.read_csv(csv_file, na_values='-')
        df['x'] = pd.to_numeric(df['x'])
        df['y'] = pd.to_numeric(df['y'])

        if interpolate:
            # method='time' needs a time-like index, hence the round trip
            # seconds -> timedelta index -> seconds.
            df['t'] = pd.to_timedelta(df['t'], unit='s')
            df = df.set_index('t').interpolate(method='time').reset_index()
            df['t'] = df['t'].dt.total_seconds()
            df = df.dropna()
        else:
            df['t'] = pd.to_numeric(df['t'])
            df = df.dropna(subset=['t'])
            # Keep a row as long as at least one of x / y was measured.
            df = df.dropna(subset=['x', 'y'], how='all')

        self.data = df

        # Convert once up front: a per-sample DataFrame.iloc lookup plus
        # tensor construction in __getitem__ is far slower than indexing
        # into ready-made tensors.
        self._t = torch.tensor(
            df['t'].to_numpy(), dtype=torch.float32).unsqueeze(1)
        self._labels = torch.tensor(
            df[['x', 'y']].to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples kept after cleaning."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(t, labels)`` for sample ``idx``.

        ``t`` has shape ``(1,)`` and ``labels`` has shape ``(2,)`` holding
        ``[x, y]``; both are ``float32``. The tensors are views into the
        dataset's storage, so clone them before modifying them in place.
        """
        return self._t[idx], self._labels[idx]
training loop:
# Number of samples per mini-batch.
BATCH_SIZE = 50

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Shuffled mini-batches, loaded by two worker processes.
dataset = TimeSeriesDataset(r'/content/data.csv')
trainloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
def loss_fn(outputs, labels):
    """Mean squared error over the label entries that were actually observed.

    A ``NaN`` entry in ``labels`` marks a missing measurement. Such entries
    contribute neither to the loss nor to the gradient, so a model with one
    input can be trained on targets where only some outputs are known for a
    given sample. When ``labels`` contains no ``NaN`` the result equals
    ``torch.mean((labels - outputs) ** 2)``.

    Args:
        outputs: Model predictions.
        labels: Targets, broadcastable against ``outputs``; ``NaN`` = missing.

    Returns:
        A scalar tensor. It is ``0`` (still attached to the graph) when every
        label entry is missing. A ``NaN`` in ``outputs`` still yields a
        ``NaN`` loss, so a diverged model is not hidden.
    """
    observed = ~torch.isnan(labels)

    # Swap the NaN targets for a finite placeholder *before* subtracting:
    # multiplying a NaN error by a zero weight would still give NaN.
    targets = torch.where(observed, labels, torch.zeros_like(labels))
    squared_error = (targets - outputs) ** 2

    weights = observed.to(squared_error.dtype).expand_as(squared_error)

    # clamp keeps an all-missing batch from dividing by zero.
    return (squared_error * weights).sum() / weights.sum().clamp(min=1)
# The model is moved to the same device the batches are sent to below.
net = Net().to(device)
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(300):
    running_loss = 0.0
    for i, data in enumerate(trainloader):
        inputs = data[0].to(device)
        labels = data[1].to(device)

        # Drop the gradients of the previous step, then run
        # forward pass -> loss -> backward pass -> parameter update.
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the average loss of every window of 20 mini-batches.
        running_loss += loss.item()
        if (i + 1) % 20 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0

print('Finished Training')
I know that I could separate the two variables and train them separately, but I’m curious whether there is any way to keep a single model that takes one input, ‘time’, and predicts the values of both variables ‘x’ and ‘y’.
Cheers