Passing the packed padded sequence output of an RNN (LSTM/GRU) to a Linear layer

class MyModel(someBaseModel):
    """GRU encoder followed by a linear head applied to every time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        num_outputs: Number of values predicted per time step.
    """

    def __init__(self, input_size=20, hidden_size=50, num_layers=1, num_outputs=4):
        super().__init__()

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_outputs)

    def forward(self, xb):
        """Run the GRU on a packed batch and project each real time step.

        Args:
            xb: Expected to be a ``PackedSequence``. Its ``data`` tensor has
                shape ``(total_steps, input_size)``, where ``total_steps`` is
                the sum of the individual sequence lengths -- padding is not
                stored, so this is generally smaller than
                ``batch_size * max_length``.

        Returns:
            A plain tensor (not a ``PackedSequence``) of shape
            ``(total_steps, num_outputs)``, laid out in the same packed order
            as ``xb.data``. To recover per-sequence outputs, re-wrap it with
            the ``batch_sizes``, ``sorted_indices`` and ``unsorted_indices``
            of ``xb``.
        """
        # The GRU returns a PackedSequence when given one; the final hidden
        # state is not needed for per-step prediction.
        packed_out, _ = self.gru(xb)
        # Applying the linear layer to ``.data`` skips padded positions.
        return self.classifier(packed_out.data)

And my loss calculation looks like:

def training_step(batch, device="cuda:2"):
    """Compute one MSE loss per sequence for a packed batch.

    Relies on a module-level ``model`` that maps a ``PackedSequence`` to a
    flat tensor holding one prediction row per real (non-padded) time step.

    Args:
        batch: ``(inputs, labels)``. ``inputs`` is a ``PackedSequence``;
            ``labels`` holds one target tensor per sequence, in the original
            (pre-packing) batch order. ``labels[i]`` must have the same shape
            as the predictions for sequence ``i``.
        device: Device the inputs and labels are moved to. It should be the
            device ``model`` lives on.

    Returns:
        ``(loss, mean_loss)``. ``loss`` is a 1-D tensor on ``device`` with one
        MSE value per sequence, still attached to the autograd graph.
        ``mean_loss`` is its mean; note that it is a loss, not an accuracy.

    Raises:
        ValueError: If the number of label tensors differs from the batch size.
    """
    inputs, labels = batch
    inputs = inputs.to(device)

    out = model(inputs)  # flat (total_steps, n_outputs) tensor in packed order

    # Re-wrap the flat predictions with ALL of the input's packing metadata.
    # Passing only batch_sizes would drop sorted/unsorted indices, so a batch
    # packed with enforce_sorted=False would come back in length-sorted order
    # and be compared against the wrong labels.
    packed_out = torch.nn.utils.rnn.PackedSequence(
        out, inputs.batch_sizes, inputs.sorted_indices, inputs.unsorted_indices
    )
    out_padded, out_lengths = pad_packed_sequence(packed_out, batch_first=True)
    # out_padded: (batch_size, max_length, n_outputs), original batch order.

    if len(labels) != out_padded.size(0):
        raise ValueError(
            f"expected {out_padded.size(0)} label tensors, got {len(labels)}"
        )

    # Labels have different lengths, so the loss is computed per sequence on
    # the un-padded slice. Stacking keeps everything on one device and in one
    # autograd graph, instead of writing GPU scalars into a CPU buffer.
    per_sequence = [
        F.mse_loss(pred[:length], target.to(device))
        for pred, length, target in zip(out_padded, out_lengths.tolist(), labels)
    ]
    loss = torch.stack(per_sequence)

    return loss, loss.mean()

for batch in train_loader:
    # NOTE(review): training_step is shown above as a module-level function
    # that uses a global `model`; confirm that `model.training_step` actually
    # resolves (e.g. is provided by the base class) or call it directly.
    loss, acc = model.training_step(batch)

    # Detach before logging. Storing graph-attached tensors would keep every
    # iteration's autograd graph (and its activations) alive in these lists.
    train_losses.append(torch.mean(loss).detach())
    train_accs.append(acc.detach())

    # A single backward pass over the summed per-sequence losses. Because the
    # gradient of a sum is the sum of the gradients, this accumulates each
    # sequence's contribution into both the GRU and the Linear weights at once.
    # Calling backward() on each element separately fails on the second call
    # because the first call frees the shared graph.
    torch.sum(loss).backward()
    optimizer.step()
    optimizer.zero_grad()

Does this look right? I’m really unsure what the torch.sum(loss).backward() step does in terms of accumulating gradients in both the GRU and the Linear weights… Is there a better way?

I really couldn’t find much info on how to best use the pack_sequence function.