My LSTM model doesn't seem to utilize the GPU well

My Siamese network doesn't seem to run efficiently on the GPU. I have checked all inputs and outputs, and they are all on the GPU. I am using a single GPU, but its utilization stays under 30%. Here are the model and the training code:
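
For context, this is roughly how I verified that the model and the batches are on the GPU (a minimal sketch; input1, input2 and label are the batch tensors produced by generate_batch in train.py below):

# quick device sanity check, run once inside the training loop
print(next(model.parameters()).device)             # expect cuda:0
print(input1.device, input2.device, label.device)  # expect cuda:0 for all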

-------------------------------------------------------------------------------------------------------------------
      Layer (type)                                                    Output Shape         Param #     Tr. Param #
===================================================================================================================
       Embedding-1                                                  [128, 32, 768]      16,226,304               0
       Embedding-2                                                  [128, 32, 768]      16,226,304               0
          Linear-3                                                  [128, 32, 768]         590,592         590,592
          Linear-4                                                  [128, 32, 768]         590,592         590,592
         Dropout-5                                                  [128, 32, 768]               0               0
         Dropout-6                                                  [128, 32, 768]               0               0
            LSTM-7     [653, 600], [9], [128], [128], [2, 128, 300], [2, 128, 300]       2,568,000       2,568,000
            LSTM-8     [625, 600], [9], [128], [128], [2, 128, 300], [2, 128, 300]       2,568,000       2,568,000
===================================================================================================================
Total params: 38,769,792
Trainable params: 6,317,184
Non-trainable params: 32,452,608
-------------------------------------------------------------------------------------------------------------------

model.py

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils


class SiameseLSTM(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.build()

    def build(self):
        weight = torch.from_numpy(self.config.bert_embedding["embeddings"])
        self.embedding = nn.Embedding.from_pretrained(weight)
        # keep the pretrained embedding frozen; the frozen parameters must be
        # excluded when building the optimizer, or an error occurs
        self.embedding.weight.requires_grad = False
        self.bilstm = nn.LSTM(
            input_size=self.config.embedding_size,
            hidden_size=self.config.lstm_hidden,
            num_layers=self.config.lstm_layers,
            dropout=self.config.lstm_dropout,
            bidirectional=True,
        )
        self.embedding_dropout = nn.Dropout(self.config.embedding_dropout)
        # linear transform to adapt the pretrained vectors to the fine-tuning
        # data; dropout is applied on it to avoid overfitting
        self.embedding_transformer = nn.Linear(self.config.embedding_size, self.config.embedding_size)

    def forward(self, input1, input2, length1, length2):
        emb1 = self.embedding(input1)
        emb2 = self.embedding(input2)

        batch_size = emb1.shape[0]

        # linearly transform the pretrained vectors
        emb1 = self.embedding_transformer(emb1)
        emb2 = self.embedding_transformer(emb2)

        # dropout on the transformed embeddings
        emb1 = self.embedding_dropout(emb1)
        emb2 = self.embedding_dropout(emb2)

        # pack the padded sequences (pack_padded_sequence wants the lengths on the CPU)
        emb1 = rnn_utils.pack_padded_sequence(emb1, lengths=length1, batch_first=True, enforce_sorted=False)
        emb2 = rnn_utils.pack_padded_sequence(emb2, lengths=length2, batch_first=True, enforce_sorted=False)

        #print("pack emb:", emb1)
        # lstm sequence encoding
        output1, (h1, c1) = self.bilstm(emb1)
        output2, (h2, c2) = self.bilstm(emb2)

        # reshape h_n to (num_layers, num_directions, batch, hidden) and
        # concatenate the last layer's forward and backward final states
        h1 = h1.view(self.config.lstm_layers, 2, batch_size, self.config.lstm_hidden)
        h2 = h2.view(self.config.lstm_layers, 2, batch_size, self.config.lstm_hidden)

        h1 = torch.cat([h1[-1, 0, :, :], h1[-1, 1, :, :]], dim=1)
        h2 = torch.cat([h2[-1, 0, :, :], h2[-1, 1, :, :]], dim=1)

        # Manhattan-distance similarity: exp(-||h1 - h2||_1), which lies in
        # (0, 1] and can be fed to BCELoss directly
        distance = torch.exp(-torch.sum(torch.abs(h1 - h2), dim=1, keepdim=True))
        distance = torch.squeeze(distance, dim=1)

        return distance
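
For reference, a minimal sketch of how I smoke-test the forward pass on random data (the SimpleNamespace config here is a stand-in; the real values come from config.json further down, and the vocab size 21128 matches the embedding in the summary above):

import numpy as np
import torch
from types import SimpleNamespace

config = SimpleNamespace(
    bert_embedding={"embeddings": np.random.rand(21128, 768).astype(np.float32)},
    embedding_size=768, lstm_hidden=300, lstm_layers=1,
    lstm_dropout=0.0, embedding_dropout=0.5)

model = SiameseLSTM(config).cuda()
input1 = torch.randint(0, 21128, (128, 32)).cuda()
input2 = torch.randint(0, 21128, (128, 32)).cuda()
length1 = torch.randint(1, 33, (128,))  # lengths stay on the CPU for packing
length2 = torch.randint(1, 33, (128,))
print(model(input1, input2, length1, length2).shape)  # torch.Size([128])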

train.py

import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader

# TextDataSet, bert_tokenizer, writer and save_checkpoint are defined elsewhere

# set model
model = SiameseLSTM(config)
model.cuda()
criterion = nn.BCELoss().cuda()
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=config.lr, eps=config.eps, weight_decay=config.weight_decay)

def generate_batch(batch):
    # TextDataSet yields one tab-separated line per example:
    # sentence1 \t sentence2 \t label
    batch_label, batch_input1, batch_input2, length1, length2 = [], [], [], [], []
    for data in batch:
        sent1, sent2, label = data.split('\t')
        batch_label.append(float(label))
        # encode() tokenizes internally, so a separate tokenize() call is not needed
        sent1 = torch.squeeze(bert_tokenizer.encode(sent1, add_special_tokens=False, return_tensors='pt'), dim=0)
        sent2 = torch.squeeze(bert_tokenizer.encode(sent2, add_special_tokens=False, return_tensors='pt'), dim=0)
        batch_input1.append(sent1)
        batch_input2.append(sent2)
        length1.append(sent1.shape[0])
        length2.append(sent2.shape[0])

    input1 = rnn_utils.pad_sequence(batch_input1, batch_first=True)
    input2 = rnn_utils.pad_sequence(batch_input2, batch_first=True)
    #input1 = rnn_utils.pack_padded_sequence(input1, length1, batch_first=True, enforce_sorted=False)
    #input2 = rnn_utils.pack_padded_sequence(input2, length2, batch_first=True, enforce_sorted=False)

    # pack_padded_sequence expects the lengths on the CPU, so only the inputs
    # and the labels are moved to the GPU here
    return (input1.cuda(), input2.cuda(),
            torch.tensor(length1, dtype=torch.long),
            torch.tensor(length2, dtype=torch.long),
            torch.tensor(batch_label, dtype=torch.float).cuda())
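
Just to make the data flow concrete, a minimal sketch of what generate_batch consumes and produces (the example sentences are made up):

batch = ["how are you\tare you ok\t1",
         "good morning\tsee you later\t0"]
input1, input2, length1, length2, label = generate_batch(batch)
print(input1.shape)  # (2, max_len) padded token ids, on cuda:0
print(length1)       # per-sentence lengths, kept on the CPU for packing
print(label)         # tensor([1., 0.], device='cuda:0')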

def test(test_data):
    valid_loss = 0
    valid_acc = 0
    test_dataset = TextDataSet(test_data)
    data = DataLoader(test_dataset, batch_size=config.batch_size, collate_fn=generate_batch)
    model.eval()  # switch off dropout for evaluation
    for i, (input1, input2, length1, length2, label) in enumerate(data):
        with torch.no_grad():
            output = model(input1, input2, length1, length2)
            loss = criterion(output, label)
            # criterion returns the batch mean, so weight it by the batch size
            # before averaging over the whole dataset below
            valid_loss += loss.item() * input1.shape[0]
            valid_acc += (torch.abs(output - label) < float(config.threshold)).sum().item()

    return valid_loss / len(test_dataset), valid_acc / len(test_dataset)
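
The accuracy here counts a prediction as correct when it is within config.threshold of the label; for 0/1 labels and threshold 0.5 this is just round-to-nearest accuracy:

# label 1.0, output 0.73 -> |0.73 - 1.0| = 0.27 < 0.5  -> counted correct
# label 0.0, output 0.73 -> |0.73 - 0.0| = 0.73 >= 0.5 -> counted wrong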

def train():
    best_valid_acc = 0.0
    steps = 0
    saved_step = 0
    stop_train = False
    for epoch in range(config.epoch):
        start_time = time.time()
        train_loss = 0
        train_count = 0
        train_acc = 0
        train_dataset = TextDataSet(config.train)
        data = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=generate_batch)
        for i, (input1, input2, length1, length2, label) in enumerate(data):
            model.train()  # re-enable dropout after test() puts the model in eval mode
            optimizer.zero_grad()
            output = model(input1, input2, length1, length2)
            loss = criterion(output, label)
            train_loss += loss.item() * input1.shape[0]  # criterion returns the batch mean
            loss.backward()
            torch.nn.utils.clip_grad_norm_([param for param in model.parameters() if param.requires_grad], 1.0)
            optimizer.step()
            train_acc += (torch.abs(output - label) < float(config.threshold)).sum().item()
            train_count += input1.shape[0]
            step_accuracy = train_acc / train_count

            if steps % config.save_step == 0 and steps > 0:
                #print("Epoch:%s Steps:%s train_count:%s" % (epoch, steps, train_count))
                valid_loss, valid_acc = test(config.dev)
                if valid_acc > best_valid_acc and epoch > 0:
                    save_checkpoint({
                        "epoch": epoch + 1,
                        "steps": steps,
                        "state_dict": model.state_dict(),
                        "valid_acc": valid_acc
                        })
                    best_valid_acc = valid_acc
                    saved_step = steps

                secs = int(time.time() - start_time)
                mins, secs = divmod(secs, 60)
                writer.add_scalars("StepLoss", {
                    'train': train_loss / train_count,
                    "valid": valid_loss
                    }, steps)
                writer.add_scalars("StepAcc", {
                    'train': train_acc / train_count,
                    "valid": valid_acc
                    }, steps)

                print("Epoch: %d" % (epoch + 1), "Steps: %d" % steps, " | time in %d minutes, %d seconds" % (mins, secs))
                print(f"\tLoss: {train_loss / train_count:.4f}(train)\t|\tAcc: {train_acc / train_count * 100:.1f}%(train)")
                print(f"\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)")

            if steps - saved_step > config.early_stop and epoch > config.early_stop_epoch:
                stop_train = True
                break
            steps += 1

        if stop_train:
            print("early stop!!!")
            break

        # tensorboard for epoch accuracy and loss
        valid_loss, valid_acc = test(config.dev)
        writer.add_scalars("EpochLoss", {
            'train': train_loss / len(train_dataset),
            "valid": valid_loss
            }, epoch + 1)
        writer.add_scalars("EpochAcc", {
            'train': train_acc / len(train_dataset),
            "valid": valid_acc
            }, epoch + 1)
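
save_checkpoint is a small helper defined elsewhere in my project; for completeness, a minimal sketch of what it amounts to (the real implementation may differ):

def save_checkpoint(state):
    # persist the best state so far to the path configured in config.json
    torch.save(state, config.save_model)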

config.json

{
    "bert_pretrain": "model/bert_vocab_and_embeddings.npz",
    "vocab_save": "model/vocab.txt",
    "tensorboard": "tensorboard",
    "seed": 1992,
    "tensorboard_flush_sec": 30,
    "lr": 2e-5,
    "eps": 1e-8,
    "weight_decay": 1e-6,
    "batch_size": 128,
    "save_model": "model/siamese_lstm.bin",
    "epoch": 100,
    "train": "pretrain/train_index.txt",
    "dev": "pretrain/dev.txt",
    "test": "",
    "save_step": 2000,
    "embedding_size": 768,
    "lstm_hidden": 300,
    "lstm_layers": 1,
    "lstm_dropout": 0.0,
    "embedding_dropout": 0.5,
    "threshold": 0.5,
    "early_stop": 50000,
    "early_stop_epoch": 100
}
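
The config object used above is essentially this JSON loaded into attribute form, plus the pretrained embeddings. Roughly (the actual loader lives elsewhere in my project):

import json
import numpy as np
from types import SimpleNamespace

with open("config.json") as f:
    config = SimpleNamespace(**json.load(f))
# the .npz file holds the vocab and the BERT token embeddings
config.bert_embedding = dict(np.load(config.bert_pretrain))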

Let me know if you need more information, thanks.