My Siamese network doesn't seem to work well on GPU. I have checked all inputs and outputs, which are all on the GPU. I use only one GPU, but the GPU utilization is always under 30%. Here are the model and training code:
-------------------------------------------------------------------------------------------------------------------
Layer (type) Output Shape Param # Tr. Param #
===================================================================================================================
Embedding-1 [128, 32, 768] 16,226,304 0
Embedding-2 [128, 32, 768] 16,226,304 0
Linear-3 [128, 32, 768] 590,592 590,592
Linear-4 [128, 32, 768] 590,592 590,592
Dropout-5 [128, 32, 768] 0 0
Dropout-6 [128, 32, 768] 0 0
LSTM-7 [653, 600], [9], [128], [128], [2, 128, 300], [2, 128, 300] 2,568,000 2,568,000
LSTM-8 [625, 600], [9], [128], [128], [2, 128, 300], [2, 128, 300] 2,568,000 2,568,000
===================================================================================================================
Total params: 38,769,792
Trainable params: 6,317,184
Non-trainable params: 32,452,608
-------------------------------------------------------------------------------------------------------------------
model.py
class SiameseLSTM(nn.Module):
    """Siamese BiLSTM (MaLSTM-style) that scores sentence-pair similarity.

    Both sentences are encoded by the SAME embedding/linear/LSTM stack, and the
    similarity is exp(-||h1 - h2||_1) in (0, 1], suitable for BCELoss.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.build()

    def build(self):
        """Create sub-modules from `self.config`."""
        # Frozen pretrained embedding table. from_pretrained already freezes by
        # default, but keep requires_grad=False explicit so the optimizer's
        # trainable-parameter filter is obviously correct.
        weight = torch.from_numpy(self.config.bert_embedding["embeddings"])
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        self.bilstm = nn.LSTM(
            input_size=self.config.embedding_size,
            hidden_size=self.config.lstm_hidden,
            num_layers=self.config.lstm_layers,
            dropout=self.config.lstm_dropout,
            bidirectional=True,
        )
        self.embedding_dropout = nn.Dropout(self.config.embedding_dropout)
        # Trainable linear transform so the frozen embeddings can adapt to the
        # fine-tuning data; dropout on top guards against overfitting.
        self.embedding_transformer = nn.Linear(self.config.embedding_size, self.config.embedding_size)

    def forward(self, input1, input2, length1, length2):
        """Return a per-pair similarity score in (0, 1].

        input1/input2: LongTensor (batch, seq) of padded token ids.
        length1/length2: 1-D tensors of true (unpadded) sequence lengths.
        """
        emb1 = self.embedding(input1)
        emb2 = self.embedding(input2)
        batch_size = emb1.shape[0]
        # Linear transform, then dropout on the (frozen) pretrained embeddings.
        emb1 = self.embedding_dropout(self.embedding_transformer(emb1))
        emb2 = self.embedding_dropout(self.embedding_transformer(emb2))
        # pack_padded_sequence requires the lengths tensor to live on the CPU
        # even when the data is on the GPU; .cpu() is a no-op for CPU tensors.
        emb1 = rnn_utils.pack_padded_sequence(emb1, lengths=length1.cpu(), batch_first=True, enforce_sorted=False)
        emb2 = rnn_utils.pack_padded_sequence(emb2, lengths=length2.cpu(), batch_first=True, enforce_sorted=False)
        # LSTM sequence encoding on the packed batches.
        output1, (h1, c1) = self.bilstm(emb1)
        output2, (h2, c2) = self.bilstm(emb2)
        # (layers * 2, batch, hidden) -> (layers, directions, batch, hidden)
        h1 = h1.view(self.config.lstm_layers, 2, batch_size, self.config.lstm_hidden)
        h2 = h2.view(self.config.lstm_layers, 2, batch_size, self.config.lstm_hidden)
        # Concatenate the last layer's forward and backward final states.
        h1 = torch.cat([h1[-1, 0, :, :], h1[-1, 1, :, :]], dim=1)
        h2 = torch.cat([h2[-1, 0, :, :], h2[-1, 1, :, :]], dim=1)
        # Manhattan similarity. BUG FIX: the original applied abs() on top of
        # square(), which contradicts the "manhattan distance" comment and
        # shrinks gradients for small differences; MaLSTM uses the plain L1
        # distance |h1 - h2|.
        distance = torch.exp(-torch.sum(torch.abs(h1 - h2), dim=1))
        return distance
train.py
# set model: build the Siamese network and move it (and the loss) to the GPU
model = SiameseLSTM(config)
model.cuda()
criterion = nn.BCELoss().cuda()
# Only the unfrozen parameters go to the optimizer — the pretrained embedding
# table is frozen, and Adam would otherwise error on grad-less parameters.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=config.lr, eps=config.eps, weight_decay=config.weight_decay)
def generate_batch(batch):
    """Collate raw "sent1\\tsent2\\tlabel" lines into padded batches.

    Returns (input1, input2, length1, length2, label). Token-id and label
    tensors are moved to the GPU; the length tensors deliberately stay on the
    CPU because pack_padded_sequence requires CPU lengths.

    NOTE(review): tokenizing here re-tokenizes the whole corpus every epoch on
    the main process, which is a likely cause of the low (<30%) GPU
    utilization — consider pre-tokenizing in TextDataSet and running the
    DataLoader with num_workers > 0 and pin_memory=True.
    """
    batch_label, batch_input1, batch_input2, length1, length2 = [], [], [], [], []
    for data in batch:
        # TextDataSet yields one line containing both sentences and the label.
        sent1, sent2, label = data.split('\t')
        batch_label.append(float(label))
        # Drop the leading batch dim added by return_tensors='pt'.
        sent1 = torch.squeeze(bert_tokenizer.encode(bert_tokenizer.tokenize(sent1), add_special_tokens=False, return_tensors='pt'), dim=0)
        sent2 = torch.squeeze(bert_tokenizer.encode(bert_tokenizer.tokenize(sent2), add_special_tokens=False, return_tensors='pt'), dim=0)
        batch_input1.append(sent1)
        batch_input2.append(sent2)
        length1.append(sent1.shape[0])
        length2.append(sent2.shape[0])
    input1 = rnn_utils.pad_sequence(batch_input1, batch_first=True)
    input2 = rnn_utils.pad_sequence(batch_input2, batch_first=True)
    # BUG FIX: the lengths were previously sent to the GPU, which makes
    # pack_padded_sequence fail on current PyTorch; keep them on the CPU.
    return (input1.cuda(),
            input2.cuda(),
            torch.tensor(length1, dtype=torch.long),
            torch.tensor(length2, dtype=torch.long),
            torch.tensor(batch_label, dtype=torch.float).cuda())
def test(test_data):
    """Evaluate the global model on `test_data`.

    Returns (mean loss per example, accuracy), where a prediction counts as
    correct when |output - label| < config.threshold.
    """
    valid_loss = 0
    valid_acc = 0
    test_dataset = TextDataSet(test_data)
    data = DataLoader(test_dataset, batch_size=config.batch_size, collate_fn=generate_batch)
    # BUG FIX: model.eval() was commented out, so the 0.5 embedding dropout was
    # active during validation. Switch to eval mode here and restore train mode
    # before returning so the call is transparent to the training loop.
    model.eval()
    for input1, input2, length1, length2, label in data:
        with torch.no_grad():
            output = model(input1, input2, length1, length2)
            loss = criterion(output, label)
            valid_loss += loss.item()
            valid_acc += (torch.abs(output - label) < float(config.threshold)).sum().item()
    model.train()
    return valid_loss / len(test_dataset), valid_acc / len(test_dataset)
def train():
    """Run the training loop.

    Validates and checkpoints every `config.save_step` steps, logs step- and
    epoch-level loss/accuracy to TensorBoard, and stops early when no
    checkpoint has been saved for more than `config.early_stop` steps (after
    `config.early_stop_epoch` epochs).
    """
    global model
    best_valid_acc = 0.0
    steps = 0
    saved_step = 0
    stop_train = False
    for epoch in range(config.epoch):
        start_time = time.time()
        train_loss = 0
        train_count = 0
        train_acc = 0
        train_dataset = TextDataSet(config.train)
        data = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=generate_batch)
        # Make train mode explicit so dropout is active while training.
        model.train()
        for input1, input2, length1, length2, label in data:
            optimizer.zero_grad()
            output = model(input1, input2, length1, length2)
            loss = criterion(output, label)
            train_loss += loss.item()
            loss.backward()
            # Clip only the trainable parameters; the embedding table is frozen.
            torch.nn.utils.clip_grad_norm_([param for param in model.parameters() if param.requires_grad], 1.0)
            optimizer.step()
            train_acc += (torch.abs(output - label) < float(config.threshold)).sum().item()
            train_count += input1.shape[0]
            if steps % config.save_step == 0 and steps > 0:
                valid_loss, valid_acc = test(config.dev)
                model.train()  # restore train mode in case test() switched the model to eval
                # Only checkpoint improvements, and never during the first epoch.
                if valid_acc > best_valid_acc and epoch > 0:
                    save_checkpoint({
                        "epoch": epoch + 1,
                        "steps": steps,
                        "state_dict": model.state_dict(),
                        "valid_acc": valid_acc
                    })
                    best_valid_acc = valid_acc
                    saved_step = steps
                # BUG FIX: the original computed mins = secs / 60 (a float) and
                # then secs = mins % 60, which printed a wrong elapsed time.
                mins, secs = divmod(int(time.time() - start_time), 60)
                writer.add_scalars("StepLoss", {
                    'train': train_loss / train_count,
                    "valid": valid_loss
                }, steps)
                writer.add_scalars("StepAcc", {
                    'train': train_acc / train_count,
                    "valid": valid_acc
                }, steps)
                print("Epoch: %d" % (epoch + 1), "Steps: %d" % steps, " | time in %d minutes, %d seconds" % (mins, secs))
                print(f"\tLoss: {train_loss / train_count:.4f}(train)\t|\tAcc: {train_acc / train_count * 100:.1f}%(train)")
                print(f"\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)")
                # Early stopping: too many steps since the last saved checkpoint.
                if steps - saved_step > config.early_stop and epoch > config.early_stop_epoch:
                    stop_train = True
                    break
            steps += 1
        if stop_train:
            print("early stop!!!")
            break
        # tensorboard for epoch accuracy and loss
        valid_loss, valid_acc = test(config.dev)
        writer.add_scalars("EpochLoss", {
            'train': train_loss / len(train_dataset),
            "valid": valid_loss
        }, epoch + 1)
        writer.add_scalars("EpochAcc", {
            'train': train_acc / len(train_dataset),
            "valid": valid_acc
        }, epoch + 1)
config.json
{
"bert_pretrain": "model/bert_vocab_and_embeddings.npz",
"vocab_save": "model/vocab.txt",
"tensorboard": "tensorboard",
"seed": 1992,
"tensorboard_flush_sec": 30,
"lr": 2e-5,
"eps": 1e-8,
"weight_decay": 1e-6,
"batch_size": 128,
"save_model": "model/siamese_lstm.bin",
"epoch": 100,
"train": "pretrain/train_index.txt",
"dev": "pretrain/dev.txt",
"test": "",
"save_step": 2000,
"embedding_size": 768,
"lstm_hidden": 300,
"lstm_layers": 1,
"lstm_dropout": 0.0,
"embedding_dropout": 0.5,
"threshold": 0.5,
"early_stop": 50000,
"early_stop_epoch": 100
}
Let me know if you need more information, thanks.