Issues with training a model

milan_kalkenings · April 2, 2021, 11:36am

Hi all,
I am new to pytorch and I wonder why my neural network doesn’t train. The loss is always the same in every epoch and the execution time per epoch implicates that no training really happens.

Sry for the long post…

class NeuralNetwork(nn.Module):
def init(self):
super(NeuralNetwork, self).init()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(11, 6),
nn.ReLU(),
nn.Linear(6, 4),
nn.ReLU(),
nn.Linear(4, 1),
nn.Softmax()
)

def forward(self, x):
    x = self.flatten(x)
    logits = self.linear_relu_stack(x)
    return logits

model = NeuralNetwork().to(‘cuda’)

for n, p in model.named_parameters():
print(p.device, “”, n)

#hyperparameters and optimization
learning_rate = 0.001
batch_size = 128
epochs = 5
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

training_data = CustomDataset(data_dir=“train.csv”, targets=“quality”)
test_data = CustomDataset(data_dir=“test.csv”, targets=“quality”)

train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

def train_loop(dataloader, model, loss_fn, optimizer):
for batch, (X, y) in enumerate(dataloader):
# move the data to the gpu
X = X.to(‘cuda’)
y = y.to(‘cuda’)

    y = y.float() 
    
    # Compute prediction and loss
    pred = model(X.float())
    
    y = torch.reshape(input=y, shape=pred.shape)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad() 
    loss.backward()
    optimizer.step()

def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
print(size)
test_loss = 0

with torch.no_grad():
    for X, y in dataloader:
        X = X.float().to('cuda')
        y = y.to('cuda')
        y = y.float()
        pred = model(X)
        y = torch.reshape(input=y, shape=pred.shape)
        test_loss += loss_fn(pred, y).item()
        
test_loss /= size
print(f"Avg loss on test: {test_loss} ")

for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
start = timeit.default_timer()
train_loop(train_dataloader, model, loss_fn, optimizer)
test_loop(test_dataloader, model, loss_fn)
stop = timeit.default_timer()
print("time: ", stop - start, “\n”)
print(“Done!”)

output:
Epoch 1

320
Avg loss on test: 0.4052734375
time: 0.7036787000000002

Epoch 2

320
Avg loss on test: 0.4052734375
time: 0.023357099999999242

Epoch 3

320
Avg loss on test: 0.4052734375
time: 0.02345839999999999

Epoch 4

320
Avg loss on test: 0.4052734375
time: 0.0254683

Epoch 5

320
Avg loss on test: 0.4052734375
time: 0.02313960000000037

Done!

ptrblck · April 2, 2021, 7:38pm

I assume you are working on a binary classification.
If that’s the case, note that nn.Softmax would output a tensor of ones for an input tensor (activation coming from the previous linear layer) in the shape [batch_size, 1], so you might want the use nn.Sigmoid instead.
For better numerical stability I would recommend to remove the activation (no softmax or sigmoid) and use nn.BCEWithLogitsLoss as the criterion.