How can I change the model to remove seq_length?

When I made the changes shown below to remove seq_length, I got the following error:

RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)

The error is raised at x = fc(x) in the code below. How can I change the model so that it no longer needs seq_length?

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNLSTM(nn.Module):
    def __init__(
            self,
            num_classes,
            latent_dim=512,
            num_channels=4,
            lstm_layers=1,
            hidden_dim=1024,
            # seq_length=40, # I want to remove this parameter
            bidirectional=True,
            attention=True,
    ):
        super(CNNLSTM, self).__init__()
        self.encoder = Encoder(latent_dim, num_channels)
        self.lstm = LSTM(latent_dim, lstm_layers, hidden_dim, bidirectional)
        # Comment to remove seq_length
        # self.fc = nn.Linear(2 * hidden_dim if bidirectional else hidden_dim,
        #                     seq_length * num_classes)
        self.output_layers = nn.Sequential(
            nn.Softmax(dim=-1),
        )
        self.bidirectional = bidirectional
        self.attention = attention
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.attention_layer = nn.Linear(
            2 * hidden_dim if bidirectional else hidden_dim, 1
        )

    def forward(self, x):
        batch_size, seq_length, c, h, w = x.shape
        x = x.view(batch_size * seq_length, c, h, w)
        x = self.encoder(x)
        x = x.view(batch_size, seq_length, -1)
        x = self.lstm(x)
        if self.attention:
            attention_w = F.softmax(self.attention_layer(x).squeeze(-1), dim=-1)
            x = torch.sum(attention_w.unsqueeze(-1) * x, dim=1)
        else:
            x = x[:, -1]
        # x = self.fc(x) # Comment to remove seq_length
        # Define the following to get seq_length dynamically
        fc = nn.Linear(2 * self.hidden_dim if self.bidirectional else self.hidden_dim, seq_length * self.num_classes)
        x = fc(x)
        x = x.reshape(batch_size * seq_length, -1)
        x = x.view(batch_size, seq_length, -1)
        output = self.output_layers(x)
        return output
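
For reference, the RuntimeError seems to come from the fact that the new nn.Linear is created inside forward(), so its weights live on the CPU while x is already on the GPU (a layer created this way is also re-initialized on every call, so it would never actually be trained). A minimal sketch of a workaround that keeps the rest of forward() unchanged and only moves the freshly created layer onto the input's device:

# Sketch only: same computation as x = fc(x) above, but the layer is placed
# on the same device as x before it is applied.
fc = nn.Linear(
    2 * self.hidden_dim if self.bidirectional else self.hidden_dim,
    seq_length * self.num_classes,
).to(x.device)
x = fc(x)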

The model before removing seq_length ran without errors, but it was not the right model for my purpose.

It turns out that seq_length can be removed by not using the attention branch, as shown below.

class CNNLSTM(nn.Module):
    def __init__(
            self,
            num_classes,
            latent_dim=512,
            num_channels=4,
            lstm_layers=1,
            hidden_dim=1024,
            bidirectional=True,
            attention=True,
    ):
        super(CNNLSTM, self).__init__()
        self.encoder = Encoder(latent_dim, num_channels)
        self.lstm = LSTM(latent_dim, lstm_layers, hidden_dim, bidirectional)
        self.fc = nn.Linear(2 * hidden_dim if bidirectional else hidden_dim, num_classes)
        self.output_layers = nn.Sequential(
            nn.Softmax(dim=-1),
        )
        self.bidirectional = bidirectional
        self.attention = attention
        self.attention_layer = nn.Linear(
            2 * hidden_dim if bidirectional else hidden_dim, 1
        )

    def forward(self, x):
        batch_size, seq_length, c, h, w = x.shape
        x = x.view(batch_size * seq_length, c, h, w)
        x = self.encoder(x)
        x = x.view(batch_size, seq_length, -1)
        x = self.lstm(x)
        x = x.reshape(batch_size * seq_length, -1)
        # NOTE: Commented out because seq_length can be removed without the attention branch
        # if self.attention:
        #     attention_w = F.softmax(self.attention_layer(x).squeeze(-1), dim=-1)
        #     x = torch.sum(attention_w.unsqueeze(-1) * x, dim=1)
        # else:
        #     x = x[:, -1]
        x = self.fc(x)
        x = x.view(batch_size, seq_length, -1)
        output = self.output_layers(x)
        return output
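
To check that the revised model really no longer depends on seq_length, here is a small shape test. The Encoder and LSTM classes from my code are not shown in this post, so the stand-ins below are minimal placeholders (a single conv layer with global pooling, and a thin wrapper around nn.LSTM) used only so the snippet runs; CNNLSTM is the class defined above.

import torch
import torch.nn as nn

class Encoder(nn.Module):
    # Placeholder stand-in: maps (N, C, H, W) frames to (N, latent_dim) vectors.
    def __init__(self, latent_dim, num_channels):
        super().__init__()
        self.conv = nn.Conv2d(num_channels, latent_dim, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        return self.pool(self.conv(x)).flatten(1)

class LSTM(nn.Module):
    # Placeholder stand-in: wraps nn.LSTM and returns the per-step hidden states.
    def __init__(self, latent_dim, lstm_layers, hidden_dim, bidirectional):
        super().__init__()
        self.lstm = nn.LSTM(latent_dim, hidden_dim, lstm_layers,
                            batch_first=True, bidirectional=bidirectional)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out

model = CNNLSTM(num_classes=10)
with torch.no_grad():
    for seq_length in (16, 40):                         # different sequence lengths
        frames = torch.randn(2, seq_length, 4, 32, 32)  # (batch, seq, C, H, W)
        out = model(frames)
        print(out.shape)  # torch.Size([2, 16, 10]) then torch.Size([2, 40, 10])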