LSTM variable prediction performance

I’m running an LSTM model on an AWS t2.medium instance (2.4 GHz, 2 cores).
I use a sequence length of 120 and have 16 features.

The prediction time and CPU utilization seem very variable: a prediction usually takes around 50 ms, but every so often a single prediction takes 5 seconds while CPU usage goes above 100%.
See the model below (and the timing sketch after it); any ideas?

import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMmodel(nn.Module):

    def __init__(self, output_size, hidden_dim, n_layers, features, drop_prob=0.3):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.features = features

        # batch_first=False (the default), so the LSTM expects input
        # shaped (seq_len, batch, features)
        self.lstm = nn.LSTM(self.features, hidden_dim, n_layers,
                            dropout=drop_prob)

        # dropout layer (reuse drop_prob instead of a second hard-coded rate)
        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):

        batch_size = x.size(1)  # batch is dim 1 because batch_first=False
        lstm_out, hidden = self.lstm(x, hidden)

        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # reshape to be batch_size first
        out = out.view(batch_size, -1, self.output_size)
        out = out[:, -1]  # keep only the output at the last time step

        return out, hidden

    def init_hidden(self, batch_size, train_on_gpu=False):
        """
        Initialize the hidden and cell states to zeros.
        """

        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden


    def predict(self, inp):

        self.eval()
        h = self.init_hidden(1)
        inp = torch.FloatTensor(inp)
        with torch.no_grad():
            outputs, h = self(inp, h)
            probs = F.softmax(outputs, dim=1)
            _, prediction = torch.max(outputs, 1)
        return probs.numpy()[0], prediction.numpy()[0]
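
For reference, this is roughly how I call it and measure the latency; a minimal sketch where the hyperparameter values and the random input are placeholders, not my real data:

import time

import numpy as np

# Assumes the LSTMmodel class above is in scope.
# Illustrative hyperparameters only; my real values differ.
model = LSTMmodel(output_size=2, hidden_dim=64, n_layers=2, features=16)

# One sample shaped (seq_len=120, batch=1, features=16),
# matching the LSTM's default batch_first=False layout.
sample = np.random.randn(120, 1, 16).astype(np.float32)

for _ in range(10):
    start = time.perf_counter()
    probs, label = model.predict(sample)
    elapsed_ms = (time.perf_counter() - start) * 1000
    print(f"prediction took {elapsed_ms:.1f} ms")

Most iterations print around 50 ms, but the occasional one spikes to several seconds.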