Loss fluctuating

In my current code the loss is fluctuating. I’ve already tried regularization, gradient clipping, and changing the optimizer. Can you have a look at my code?

import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from utils import (get_elements_and_compositions_single_2, 
    method_six, 
    undo_method_six, 
    undo_method_six_2,
    combine_elements_and_compositions)
from attention_bi_lstm_predict import predict_d_max

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

class Seq2Seq(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Seq2Seq, self).__init__()

        self.hidden_size = hidden_size
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        _, (encoder_hidden, _) = self.encoder(input_seq)
        decoder_output, _ = self.decoder(encoder_hidden)
        output = self.fc(decoder_output)
        output = F.softmax(output, dim=2)
        # number of elements per sample = number of non-zero rows in the 118 x 200 input
        output_lens = np.count_nonzero(np.count_nonzero(input_seq.numpy(), axis=2), axis=1)
        output_masks = []
        for output_len in output_lens: 
            output_mask = []
            max_len = 10
            for i in range(max_len):
                if i < output_len:
                    output_mask.append(1)
                else:
                    output_mask.append(0)
            output_masks.append(output_mask)
        output_masks = torch.FloatTensor(output_masks)
        # zero out the padded composition slots, then renormalize so the kept entries sum to 1
        output = output.squeeze(0) * output_masks
        output_sum = output.sum(dim=1, keepdim=True)
        output = output / output_sum
        return output

def custom_loss(output, target, inputs):
    # TODO: define the strategy of the loss here
    # Euclidean distance between the predicted and target composition vectors
    distance = (output - target).pow(2).sum(1).sqrt()

    output_np = output.detach().numpy()
    target_np = target.detach().numpy()
    elements_batch = undo_method_six_2(inputs.numpy())

    reinforcement_reward = 0
    better_performance_batch_count = 0
    for i in range(len(elements_batch)):
        generated_alloy = combine_elements_and_compositions(elements=elements_batch[i], compositions=output_np[i])
        iteration_alloy = combine_elements_and_compositions(elements=elements_batch[i], compositions=target_np[i])
        generated_d_max = predict_d_max(generated_alloy)
        iteration_d_max = predict_d_max(iteration_alloy)
        # print(f"generated: {generated_alloy} generated_d_max: {generated_d_max}")
        # print(f"iteration: {iteration_alloy} iteration_d_max: {iteration_d_max}")
        if generated_d_max > iteration_d_max:
            better_performance_batch_count += 1
            reinforcement_reward -= 0.1
            with open("model_generated.txt", "a") as file:
                # alloy, d_max, difference_in_d_max, n_elements
                file.write("\n" + generated_alloy + "," + str(generated_d_max) + "," + str(generated_d_max - iteration_d_max) + "," + str(len(elements_batch[i])))
    print(f"performed better: {better_performance_batch_count}")
    return torch.mean(distance) + reinforcement_reward

if __name__ == "__main__":
    df = pd.read_csv("best_compositions.csv")
    
    X = []
    y = []
    for i in range(len(df)):
        elements, compositions = get_elements_and_compositions_single_2(df.iloc[i]["best_bmg_alloy"])
        vector = method_six(elements=elements)
        compositions = np.array(compositions + [0] * (10 - len(compositions))) / 100.0
        print(compositions)
        X.append(vector)
        y.append(compositions)
    
    X = np.array(X)
    y = np.array(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    print("X_train shape: ", X_train.shape, "y_train shape: ", y_train.shape)

    X_train = torch.from_numpy(X_train).float()
    y_train = torch.from_numpy(y_train).float()
    training_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    training_loader = torch.utils.data.DataLoader(training_dataset, batch_size=32, shuffle=True)

    X_test = torch.from_numpy(X_test).float()
    y_test = torch.from_numpy(y_test).float()
    testing_dataset = torch.utils.data.TensorDataset(X_test, y_test)
    testing_loader = torch.utils.data.DataLoader(testing_dataset, batch_size=1, shuffle=False)

    model = Seq2Seq(input_size=200, output_size=10, hidden_size=128)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    num_epochs = 200
    print_every = 2
    plot_every = 1
    all_losses = []

    for epoch in range(num_epochs):
        for i, data in enumerate(training_loader):
            # https://github.com/pytorch/pytorch/issues/309
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = custom_loss(outputs, labels, inputs)
            loss.backward()

            clipping_value = 1
            torch.nn.utils.clip_grad_norm_(model.parameters(), clipping_value)

            optimizer.step()
            if (epoch + 1) % print_every == 0 and i == len(training_loader) - 1:
                print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

            if (epoch + 1) % plot_every == 0 and i == len(training_loader) - 1:
                all_losses.append(loss.item())
    
    plt.plot(all_losses, label='Training Loss')
    plt.title('Training Loss Over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig("model_loss.png")

    torch.save(model.state_dict(), "bmg_generator.pt")
    model.eval()

    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for i, data in enumerate(testing_loader):
            inputs, labels = data
            outputs = model(inputs)
            elements = undo_method_six(inputs.numpy())
            if len(elements) == 3:
                output_np = outputs.detach().numpy()
                target_np = labels.detach().numpy()
                print(output_np)
                generated_alloy = combine_elements_and_compositions(elements=elements, compositions=output_np[0][:len(elements)])
                iteration_alloy = combine_elements_and_compositions(elements=elements, compositions=target_np[0][:len(elements)])
                generated_d_max = predict_d_max(generated_alloy)
                iteration_d_max = predict_d_max(iteration_alloy)

                if generated_d_max > iteration_d_max:
                    print(generated_alloy + "\tFound a winner!")
                else:
                    print("iteration wins:(")
            loss = custom_loss(outputs, labels, inputs)
            total_loss += loss.item()
            num_batches += 1

    average_loss = total_loss / num_batches
    print(f"Test Loss: {average_loss}")

[image: training loss curve]

Could you describe why your loss can be negative and what the expected loss for a perfect output would be?

So the code is trying to generate alloys with the highest d_max - given the elements and their compositions, we can predict the d_max (that is what predict_d_max does).

I’ve created a dataset by iterating through all possible compositions, so for three elements A, B, C the search space looks like [[5, 5, 90], [5, 10, 85] …], using a step size of 5. From this and the given data I have a dataset of the alloys with the best d_max values. We are creating a neural network because this iteration approach fails for alloys with more elements, so the current loss depends on two factors:

  1. The distance between the composition vector found by iteration and the one generated by the neural network.
  2. We want the model to generate alloys that are even better than the ones in the dataset (which it should be able to do), so we reward this through the loss function so that minimizing the loss incentivizes it (I wasn’t sure how best to handle this, so I subtract a small constant from the loss for every generated alloy that beats the dataset; see the rough numbers sketched below).
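To make that concrete, here is roughly what the posted custom_loss can evaluate to (the batch size of 32 comes from the training loader; this is just arithmetic on the loss as written, not new model code):

# The distance term is a Euclidean distance, so it is >= 0 and exactly 0 when the
# generated compositions match the targets. A perfect output reproduces the dataset
# alloy, so it earns no reward and the total loss is exactly 0.
# The reward term subtracts 0.1 for every sample whose generated alloy beats the
# dataset alloy; it is a plain Python number, so it shifts the reported loss but
# contributes no gradient.
batch_size = 32
min_distance = 0.0                 # generated compositions equal the targets
min_reward = -0.1 * batch_size     # every sample in the batch beats its dataset alloy
print(min_distance + min_reward)   # -3.2, the lowest value a batch loss can reach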

What are you trying to do, i.e., what is the task you’re trying to learn?

Your implementation of the network model is not a standard Seq2Seq model, as you’re feeding the hidden state of the encoder as the input sequence to the decoder. Typically the hidden state of the decoder is initialized with the hidden state of the encoder. The decoder then gets a special start token/symbol to get going and uses its output as the input for the next step.
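For reference, a minimal sketch of that pattern (the sizes, the zero start token, and the fixed target length are illustrative assumptions, not taken from your code):

import torch
import torch.nn as nn

encoder = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
decoder = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
fc = nn.Linear(16, 8)

src = torch.randn(4, 5, 8)                        # (batch, src_len, features)
_, (h, c) = encoder(src)                          # final encoder state

dec_input = torch.zeros(4, 1, 8)                  # special start token/symbol
outputs = []
for _ in range(5):                                # target sequence length
    dec_out, (h, c) = decoder(dec_input, (h, c))  # decoder state initialized from / carried over the encoder state
    step = fc(dec_out)                            # (batch, 1, features)
    outputs.append(step)
    dec_input = step                              # feed the prediction back in as the next input
outputs = torch.cat(outputs, dim=1)               # (batch, tgt_len, features)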

I’ll try to explain the problem:

The dataset looks like this:

Ti5Cu29Zr66 - for the input elements Ti, Cu, and Zr, the best composition we found by iterating over all possible combinations is [5, 29, 66], because it has the highest d_max (look into d_max values for bulk metallic glasses). So currently, I’m trying to train a neural network to generate the output [5, 29, 66] given the elements Ti, Cu, and Zr. I simply vectorize the inputs and feed them to the network; the generated composition output is a bit like a softmax, except that for three elements only the first three values add up to 1.
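For example, the padded target vector the network is trained against for Ti5Cu29Zr66 is built exactly as in the training script above:

import numpy as np

compositions = [5, 29, 66]                    # Ti, Cu, Zr
target = np.array(compositions + [0] * (10 - len(compositions))) / 100.0
print(target)        # [0.05 0.29 0.66 0.   0.   0.   0.   0.   0.   0.  ]
print(target.sum())  # 1.0 -- only the first three entries carry any mass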

Yeah, this is not a standard Seq2Seq - it’s currently named that because, for the input sequence, we produce the needed output sequence.

I’ve tried changing the network and the loss function to just be the distance, without accounting for increasing the d_max, so the code looks like:

class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Model, self).__init__()

        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(200*118, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_seq):
        input_seq_flatten = input_seq.view(*input_seq.size()[:-2], -1)  # Flatten the input
        # print(input_seq_flatten.shape)
        x = F.relu(self.fc1(input_seq_flatten))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        output = self.fc3(x)
        output = F.softmax(output, dim=1)
        # print("output shape: ", output.shape)
        output_lens = np.count_nonzero(np.count_nonzero(input_seq.numpy(), axis=2), axis=1)
        # print("output_lens: ", output_lens[-1], undo_method_six_2(input_seq.numpy())[-1])
        output_masks = []
        for output_len in output_lens: 
            output_mask = []
            max_len = 10
            for i in range(max_len):
                if i < output_len:
                    output_mask.append(1)
                else:
                    output_mask.append(0)
            output_masks.append(output_mask)
        # print("output mask:", output_masks[-1])
        output = output.squeeze(0) *  torch.FloatTensor(output_masks)
        output_sum = output.sum(dim=1, keepdim=True)
        output = output / output_sum
        # print("output:", output[-1])
        return output

def custom_loss(output, target, inputs):
    # TODO: define the strategy of the loss here
    distance = (output - target).pow(2).sum(1).sqrt()
    return torch.mean(distance)

the loss looks like:
[image: training loss curve]

It’s still fluctuating a lot but is reducing.

Well, it’s not uncommon for the loss to fluctuate if you work with batches.
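If you want to check whether there is still a downward trend underneath the noise, one option is to log the mean loss per epoch instead of the last batch’s loss - just a sketch reusing the names from your training script, not something your code currently does:

epoch_losses = []
for epoch in range(num_epochs):
    batch_losses = []
    for inputs, labels in training_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = custom_loss(outputs, labels, inputs)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # averaging over all batches in the epoch smooths out the per-batch noise
    epoch_losses.append(sum(batch_losses) / len(batch_losses))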

I’m just wondering – as I have no background in this matter – does the order of the elements in “Ti5Cu29Zr66” actually matter? Because I’m wondering if this is a sequence task to begin with.

Yeah, I thought about that as well - you’re right, the task is independent of the order of the elements. Here’s the vectorization code for creating the input:

def method_six(elements):
    vector = np.zeros((118, 200), dtype=float)
    for element in elements:
        vector[element_to_index(element=element), :] = element_weights[element]
    return vector

where the row at the index of the element’s atomic number is populated with the pre-trained representation of that element. I’ve also changed the model from an LSTM to a simple fully connected network, but the loss still seems to fluctuate across batches.
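Just to illustrate that the order doesn’t matter: the input is a fixed 118 x 200 matrix in which only the rows of the elements present are non-zero, so permuting the elements produces exactly the same matrix (element_to_index and element_weights below are hypothetical stand-ins for the real lookups in utils):

import numpy as np

element_to_index = {"Ti": 21, "Cu": 28, "Zr": 39}                     # hypothetical indices
element_weights = {e: np.random.rand(200) for e in element_to_index}  # hypothetical embeddings

def method_six(elements):
    vector = np.zeros((118, 200), dtype=float)
    for element in elements:
        vector[element_to_index[element], :] = element_weights[element]
    return vector

# the same alloy in two different element orders maps to the same input matrix
assert np.array_equal(method_six(["Ti", "Cu", "Zr"]), method_six(["Zr", "Ti", "Cu"]))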

Currently, I’ve tried adding a batch normalization layer:

class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(Model, self).__init__()

        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(200*118, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.5)
        self.batch_norm = nn.BatchNorm1d(118)

    def forward(self, input_seq):
        input_seq_normalized = self.batch_norm(input_seq)
        input_seq_flatten = input_seq_normalized.view(*input_seq_normalized.size()[:-2], -1)  # Flatten the input
        # print(input_seq_flatten.shape)
        x = F.relu(self.fc1(input_seq_flatten))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        output = self.fc3(x)
        output = F.softmax(output, dim=1)
        # print("output shape: ", output.shape)
        output_lens = np.count_nonzero(np.count_nonzero(input_seq.numpy(), axis=2), axis=1)
        # print("output_lens: ", output_lens[-1], undo_method_six_2(input_seq.numpy())[-1])
        output_masks = []
        for output_len in output_lens: 
            output_mask = []
            max_len = 10
            for i in range(max_len):
                if i < output_len:
                    output_mask.append(1)
                else:
                    output_mask.append(0)
            output_masks.append(output_mask)
        # print("output mask:", output_masks[-1])
        output = output.squeeze(0) *  torch.FloatTensor(output_masks)
        output_sum = output.sum(dim=1, keepdim=True)
        output = output / output_sum
        # print("output:", output[-1])
        return output

But the loss still looks like:

[image: training loss curve]