How to weight LSTM model to focus on the first 3 characters in a string

I have the following model architecture, which is essentially a 5-layer bidirectional LSTM that takes in 62-character strings and outputs classification predictions. Because of how the data works, the first 3-5 characters are more important for the classification than the rest of the string. How do I get the model to place more weight on the first three characters?

#basic model, need to modify to situation
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

class NLP_model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(NLP_model, self).__init__()
        self.char_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=5, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        #self.att = nn.MultiheadAttention(embed_dim, num_heads, ...)
        
    def forward(self, x):
        x = self.char_embedding(x)
        output, (h_n, c_n) = self.lstm(x)
        # concatenate the final forward and backward hidden states of the top layer
        hidden = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        # hidden has shape (batch, hidden_dim * 2); hidden[0] picks the single sample,
        # so fc returns the class logits for it
        x = self.fc(hidden[0])

        return x

#modify for os
model_os = NLP_model(len(alphabet), 12, 24, lenos) #Find the number of fismaids for last variable
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_os.parameters(), lr = 0.001)

for epoch in range(10): #Look into DataLoader for batch processing
    model_os.train()  # switch back to training mode (eval() is called below for the test loop)
    y = list()
    z = list()
    for sentence, label in zip(ls_X_train_os, ls_y_train_os): #training_data should be an array of hostnames and labels
        model_os.zero_grad()
        output = model_os(sentence_to_id(sentence)) #sentence is the hostname, label is the fismaid
        #print(label)
        #print(output.shape)
        temp_label = label
        label = torch.zeros(lenos)  # one-hot encode the target to match the model output shape
        label[temp_label] = 1.0
        #label = torch.tensor(label).unsqueeze(0)
        #label = torch.tensor([label]).unsqueeze(1)
        #label = torch.tensor(label).unsqueeze(1)
        #print(label.shape)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        y.append(loss.item())
    y_true = []
    y_pred = []
    model_os.eval()
    for sentence, label in zip(ls_X_test_os, ls_y_test_os):
        temp_label = label
        label = torch.zeros(lenos)
        label[temp_label] = 1.0
        output = model_os(sentence_to_id(sentence))
        loss = criterion(output, label)
        z.append(loss.item())
        pred = output.detach().numpy()
        pred = np.argmax(pred)
        y_pred.append(pred)
        y_true.append(temp_label)
    print(f'epoch {epoch} training loss: {np.array(y).mean()}')
    print(f'testing loss : {np.array(z).mean()}')
    print(f'recall: {recall_score(y_true, y_pred, average="weighted")}')
    print(f'precision: {precision_score(y_true, y_pred, average="weighted")}')
    print(f'f1: {f1_score(y_true, y_pred, average="weighted")}')
    print(f'accuracy: {accuracy_score(y_true, y_pred)}')

I assume that the model output and target tensors contain a temporal dimension, i.e. they hold the logits and targets for each time step?
If so, you could create the unreduced loss by specifying reduction='none' while creating the criterion, and weight the loss using a custom weight tensor indicating the “importance” of each time step.
Afterwards you could reduce the loss (e.g. by taking the mean) and calculate the gradients via the backward operation.

The output is an array of probabilities for each category, from which I select the highest one to find the predicted outcome. I’m new to data science, so I’m not sure if that is what you mean by logits and targets for each time step. It is certainly a group of probabilities, or logits, but I’m not certain about the time-step part. Is there an example of using a custom weight tensor I can look at?

The workflow to get the predictions sounds correct for the inference use case, but I assume it won’t be used to train the model.
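
As a side note, since nn.BCEWithLogitsLoss works on the raw logits, you would usually only map them to probabilities at prediction time, e.g. roughly like this (reusing model_os and sentence_to_id from your code):

# rough inference sketch (not used for training): logits -> probabilities -> class index
with torch.no_grad():
    logits = model_os(sentence_to_id(sentence))  # raw logits from the model
    probs = torch.sigmoid(logits)                # per-class probabilities
    pred = probs.argmax().item()                 # index of the most likely class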

Here is a small example of what I had in mind:

# setup
batch_size, nb_classes, seq_len = 2, 3, 4
output = torch.randn(batch_size, nb_classes, seq_len, requires_grad=True)
target = torch.randint(0, 2, (batch_size, nb_classes, seq_len)).float()
criterion = nn.BCEWithLogitsLoss(reduction='none')

# create decreasing weight
weights = torch.arange(seq_len, 0, -1).view(1, 1, seq_len).float()

# calculate loss
loss = criterion(output, target)

# apply weights
loss = loss * weights

# reduce and calculate gradients
loss.mean().backward()
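
To put more weight on the first three characters specifically, the weight tensor could keep a value of 1 for all positions and upweight only the first three time steps (the factor 3.0 below is arbitrary and would need tuning). Note that this assumes the model returns logits with a time dimension, e.g. by applying self.fc to every step of the LSTM output instead of only to the final hidden state:

# sketch: emphasize the first 3 time steps, keep the rest at weight 1
batch_size, nb_classes, seq_len = 2, 3, 62
output = torch.randn(batch_size, nb_classes, seq_len, requires_grad=True)
target = torch.randint(0, 2, (batch_size, nb_classes, seq_len)).float()
criterion = nn.BCEWithLogitsLoss(reduction='none')

weights = torch.ones(1, 1, seq_len)
weights[..., :3] = 3.0  # first three characters count three times as much

loss = criterion(output, target)   # unreduced loss, shape (batch_size, nb_classes, seq_len)
loss = (loss * weights).mean()     # weight, then reduce
loss.backward()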