Hello everyone, I'm new to PyTorch and am currently stuck training an LSTM model. I'm trying to reproduce the results from this repo: Trading Momentum Transformer (the model is defined in mom_trans/deep_momentum_network.py).
Briefly, this work aims to use an LSTM for a momentum trading strategy. The input is the close price of various tickers, from which target_returns can be calculated. The output is a position for each ticker at each time step, and the goal is to maximize the Sharpe ratio between target_returns and positions.
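To make the objective concrete, here is a minimal sketch of the quantity being maximized, assuming daily returns annualized by sqrt(252); the tensors returns and positions below are hypothetical stand-ins for target_returns and the model output:

import torch

# Hypothetical stand-ins: daily target returns and model positions
returns = torch.randn(64, 63, 1) * 0.01          # (batch, time_steps, 1)
positions = torch.tanh(torch.randn(64, 63, 1))   # positions squashed to (-1, 1)

captured_returns = returns * positions           # returns the strategy captures
sharpe = captured_returns.mean() / captured_returns.std() * torch.sqrt(torch.tensor(252.0))
print(sharpe)  # training minimizes -sharpe, i.e. maximizes this value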
The output of the LSTM is followed by a time-distributed, fully-connected layer with a tanh() activation, a squashing function that directly outputs the positions.
The original repo uses TensorFlow, but I need to migrate to PyTorch for compatibility with my system. Below is my implementation in PyTorch:
import copy
import numpy as np
import torch
import torch.nn as nn

# Because we want to maximize the Sharpe ratio, the Sharpe loss returns -sharpe_ratio
class SharpeLoss(nn.Module):
    def __init__(self, output_size: int = 1):
        super(SharpeLoss, self).__init__()
        self.output_size = output_size

    def forward(self, y_true, weights):
        print(f'{y_true.shape} | {weights.shape}')  # debug: check input shapes
        captured_returns = y_true * weights
        mean_returns = torch.mean(captured_returns)
        variance_returns = torch.mean(torch.square(captured_returns)) - torch.square(mean_returns)
        std_returns = torch.sqrt(variance_returns + 1e-9)
        # Annualize assuming 252 trading days
        sharpe_loss = -mean_returns * torch.sqrt(torch.tensor(252.0)) / std_returns
        return sharpe_loss
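As a quick sanity check of the sign and scale (a minimal sketch with random tensors; the shapes are assumptions, not my real batch sizes):

criterion = SharpeLoss()
y_true = torch.randn(64, 63, 1) * 0.01           # fake daily returns
weights = torch.tanh(torch.randn(64, 63, 1))     # fake positions in (-1, 1)
loss = criterion(y_true, weights)
print(loss)  # a single scalar; more negative means a higher Sharpe ratio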
# Since PyTorch does not provide an equivalent of tf.keras.layers.TimeDistributed,
# we need to implement one
class TimeDistributed(nn.Module):
    def __init__(self, module, output_size, batch_first=True, activation=None):
        super(TimeDistributed, self).__init__()
        self.module = module
        self.output_size = output_size
        self.activation = activation
        self.batch_first = batch_first

    def forward(self, x):
        # Input x shape: (batch_size, time_steps, input_size)
        if self.batch_first:
            batch_size, time_steps, _ = x.size()
        else:
            time_steps, batch_size, _ = x.size()
        # Reshape to (batch_size * time_steps, input_size)
        x_reshaped = x.contiguous().view(-1, x.size(-1))
        output = self.module(x_reshaped)
        if self.activation is not None:  # guard: activation is optional
            output = self.activation(output)
        if self.batch_first:
            # Reshape back to (batch_size, time_steps, output_size)
            output = output.view(batch_size, time_steps, self.output_size)
        else:
            # Reshape back to (time_steps, batch_size, output_size)
            output = output.view(time_steps, batch_size, self.output_size)
        return output
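A quick shape check of the wrapper (the sizes here are made up):

td = TimeDistributed(nn.Linear(8, 1), output_size=1, activation=nn.Tanh())
x = torch.randn(4, 63, 8)   # (batch_size, time_steps, input_size)
print(td(x).shape)          # torch.Size([4, 63, 1]), values in (-1, 1)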
class LstmDeepMomentumNetworkModel(nn.Module):
    def __init__(self, hidden_layer_size, dropout_rate, **params):
        super(LstmDeepMomentumNetworkModel, self).__init__()
        params = params.copy()
        self.time_steps = int(params["total_time_steps"])
        self.input_size = int(params["input_size"])
        self.output_size = int(params["output_size"])
        self.hidden_layer_size = hidden_layer_size
        self.dropout_rate = dropout_rate
        self.evaluate_diversified_val_sharpe = params["evaluate_diversified_val_sharpe"]
        self.force_output_sharpe_length = params["force_output_sharpe_length"]

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            bias=True,
            batch_first=True,
            hidden_size=self.hidden_layer_size
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_layer_size, self.output_size)
        self.time_distributed = TimeDistributed(
            module=self.fc,
            output_size=self.output_size,
            activation=nn.Tanh(),
            batch_first=True
        )
        self._reinitialize()

    def _reinitialize(self):
        """
        TensorFlow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if 'lstm' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1 (PyTorch gate order: i, f, g, o)
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
            elif 'fc' in name:
                if 'weight' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'bias' in name:
                    p.data.fill_(0)

    def forward(self, x):
        # x: (batch_size, time_steps, input_size)
        lstm_out, _ = self.lstm(x)  # (batch_size, time_steps, hidden_size)
        lstm_out = self.dropout(lstm_out)
        output = self.time_distributed(lstm_out)  # (batch_size, time_steps, output_size)
        return output
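For reference, a dummy instantiation and forward pass look like this (the hyperparameter values are placeholders, not my actual settings):

model = LstmDeepMomentumNetworkModel(
    hidden_layer_size=64,
    dropout_rate=0.3,
    total_time_steps=63,
    input_size=8,
    output_size=1,
    evaluate_diversified_val_sharpe=True,
    force_output_sharpe_length=None,
)
out = model(torch.randn(4, 63, 8))
print(out.shape)  # torch.Size([4, 63, 1])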
# Training function
def train_n_epochs(device, n_epochs, model, train_loader, valid_loader, criterion,
                   optimizer, early_stop_epoch, max_grad_norm):
    valid_loss_min = np.inf  # track change in validation loss
    train_loss_set = []
    valid_loss_set = []
    invariant_epochs = 0
    model.to(device)
    print(f'Using {device} for training')
    best_model = None
    for epoch_i in range(n_epochs):
        # keep track of training and validation loss
        train_loss, valid_loss = 0.0, 0.0

        # Training pass
        model.train()
        for i, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(target, output)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            train_loss += loss.item()

        # Validation pass
        model.eval()
        with torch.no_grad():
            for i, (data, target) in enumerate(valid_loader):
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(target, output)
                valid_loss += loss.item()

        # Compute average Sharpe loss
        train_loss /= len(train_loader.dataset)
        valid_loss /= len(valid_loader.dataset)
        train_loss_set.append(train_loss)
        valid_loss_set.append(valid_loss)
        print(f'Epoch: {epoch_i + 1} Training Loss: {train_loss:.6f} Validation Loss: {valid_loss:.6f}')

        # If validation loss improves, save the model
        if valid_loss <= valid_loss_min:
            print(f'Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
            valid_loss_min = valid_loss
            invariant_epochs = 0
            best_model = copy.deepcopy(model)
        else:
            invariant_epochs += 1
            if invariant_epochs >= early_stop_epoch:
                print(f"Early Stop at Epoch [{epoch_i + 1}]: Performance hasn't improved for {early_stop_epoch} epochs")
                break
    return train_loss_set, valid_loss_set, valid_loss_min, best_model
Here is how I train it:
criterion = SharpeLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

_, _, min_val_loss, best_model = train_n_epochs(
    'cpu', n_epochs=300, model=model,
    train_loader=train_loader, valid_loader=valid_loader,
    criterion=criterion, optimizer=optimizer,
    early_stop_epoch=30, max_grad_norm=1.0)
The data preparation is the same as in the original repo.
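For shape context, the loaders yield (data, target) batches along these lines (a sketch with made-up sizes, not my actual data pipeline):

from torch.utils.data import TensorDataset, DataLoader

features = torch.randn(1000, 63, 8)        # (num_samples, time_steps, input_size)
targets = torch.randn(1000, 63, 1) * 0.01  # (num_samples, time_steps, output_size)
train_loader = DataLoader(TensorDataset(features, targets), batch_size=64, shuffle=True)
valid_loader = DataLoader(TensorDataset(features, targets), batch_size=64)  # reusing tensors just for illustration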
The problem is that the training loss stays high. I expect it to decrease to around -1.x → -2.x (as reported by the original TensorFlow code), but my training loss only gets down to about -0.5 and then stops decreasing. The validation loss is even higher. I have tried different hyperparameter sets but still cannot find a good one, so I suspect something is wrong with my model or loss function…