Can't get my training function right

Hi,

I’m trying to create a model that infers two parameters from a series of data points. To do this I had to write a custom data loader for my data files; each file is a .csv containing 10538 data points. Because of the custom data loader I can't quite get the definition of the training function right, and I keep getting errors. I have included my code:

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch import nn
import os
import pandas as pd
import numpy as np
from numpy import genfromtxt
from scipy import special as sp
import random
import shutil

# Define Custom Dataset to contain all the training data
class CustomDataset(Dataset):
    def __init__(self, annotations_file, data_dir):
        self.labels = pd.read_csv(annotations_file)
        self.data_dir = data_dir

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data_path = os.path.join(self.data_dir, self.labels.iloc[idx, 0])
        data = torch.from_numpy(genfromtxt(data_path, delimiter=','))
        mu = self.labels.iloc[idx,1]
        sigma = self.labels.iloc[idx,2]
        return data, mu, sigma

# Parameters for taking a random sample of the data for validation
dataset = CustomDataset("data/labels.csv", "data")
batch_size = 64
validation_split = 0.25
shuffle = True
random_seed = 42

indices = list(range(len(dataset)))
split = int(np.floor(validation_split * len(dataset)))
if shuffle:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
validation_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

# Ensure the GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Design Neural Network for training
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10538, 1054),
            nn.ReLU(),
            nn.Linear(1054, 106),
            nn.ReLU(),
            nn.Linear(106, 11),
            nn.ReLU(),
            nn.Linear(11, 2),
        )
    
    def forward(self, x):
        x = self.flatten(x)
        x = x.float()
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device).float()

# Define Model Loss Function and Optimizer
loss_fn = nn.MSELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Define the training method for the Neural Network
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, mu, sigma) in enumerate(dataloader):
        X, mu, sigma = X.to(device), mu.to(device), sigma.to(device)

        # Compute Prediction Error
        pred = model(X)
        target = torch.column_stack((mu,sigma))
        loss = loss_fn(pred, target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

# Define the testing method for the Neural Network
def test(dataloader, model, loss_fn):
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, mu, sigma in dataloader:
            X, mu, sigma = X.to(device), mu.to(device), sigma.to(device)
            pred = model(X)
            # Build the regression target from the two labels, as in train()
            target = torch.column_stack((mu, sigma))
            test_loss += loss_fn(pred, target).item()
    test_loss /= num_batches
    print(f"Test Error: \n Avg Loss: {test_loss:>8f} \n")

# Train the Model
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(validation_loader, model, loss_fn)
print("Done!")

torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Morch State to model.pth")

The initial error I received was that pred = model(X) expected float values but was receiving doubles. This was solved by casting the model and the input values to float: model = NeuralNetwork().to(device).float(). However, the error I now receive is:

Traceback (most recent call last):
  File "DataGen.py", line 123, in <module>
    print(f"Epoch {t+1}\n------------------------------")
  File "DataGen.py", line 96, in train
    optimizer.zero_grad()
  File "D:\Documents\Python\.venvs\virtual1\lib\site-packages\torch\_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "D:\Documents\Python\.venvs\virtual1\lib\site-packages\torch\autograd\__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Found dtype Double but expected Float

And I am unsure what to make of it: why is PyTorch outputting double-type data if it is expecting float? Ideally I would not have to keep casting to the correct type, so is there an underlying solution? I am very new to PyTorch, so apologies if this seems trivial.
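My best guess so far is that the doubles come from numpy and pandas defaults rather than from PyTorch itself: np.genfromtxt returns float64 arrays, torch.from_numpy keeps that dtype, and the mu/sigma labels read through pandas are float64 as well, so the DataLoader collates them into double tensors. A minimal check that illustrates this (example.csv is just a placeholder for any small numeric CSV):

import numpy as np
import torch

arr = np.genfromtxt("example.csv", delimiter=",")
print(arr.dtype)          # float64 -- numpy's default float type
t = torch.from_numpy(arr)
print(t.dtype)            # torch.float64 (double) -- from_numpy keeps the numpy dtype
print(t.float().dtype)    # torch.float32 -- only after an explicit cast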

This has been solved by casting the target tensor to float32: target = torch.column_stack((mu, sigma)).to(torch.float32)
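On the "underlying solution" part of the question: I believe the cleaner fix would be to do the casting once inside the Dataset, so that inputs and labels already leave __getitem__ as float32 and no extra .float() calls should be needed in the model or the training loop. A minimal sketch of CustomDataset with that change (same imports as above):

class CustomDataset(Dataset):
    def __init__(self, annotations_file, data_dir):
        self.labels = pd.read_csv(annotations_file)
        self.data_dir = data_dir

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data_path = os.path.join(self.data_dir, self.labels.iloc[idx, 0])
        # Cast to float32 once here, so batches match the model's float32 parameters
        data = torch.from_numpy(genfromtxt(data_path, delimiter=',')).float()
        mu = torch.tensor(self.labels.iloc[idx, 1], dtype=torch.float32)
        sigma = torch.tensor(self.labels.iloc[idx, 2], dtype=torch.float32)
        return data, mu, sigma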