I am trying to build a neural network based on BertModel, using the implementation from huggingface/transformers.
I basically take the bert-base-uncased model for the contextual representation and a separate pretrained embedding layer for the token-level representation, then perform some operations on the two in the network, e.g. a matrix multiplication between the two representations. But after training, when I compare the embedding vectors of the same words, I can't see any updates to the embedding layer (i.e. query_encoder in the network); the check I use is sketched after the code below. Could you please help me with this? I think there is something wrong with the code.
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        # embedding layer for the question encoder, taken from BERT's word embeddings
        self.query_encoder = BertModel.from_pretrained("bert-base-uncased").embeddings.word_embeddings
        self.query_encoder.weight.requires_grad = True
        # bert encoder for answer/context embedding
        self.context_encoder = BertModel.from_pretrained("bert-base-uncased")
        # learnable bias, added before the ReLU
        self.bias = nn.Parameter(torch.FloatTensor([0.]), requires_grad=True)
        self.relu = nn.ReLU()

    def forward(self, query_tokens, context_tokens, batch_size, neg_pairs):
        # query embedding, shape (batch_size, query_len, 768)
        question_emb = self.query_encoder(query_tokens)
        # context embedding, shape (batch_size, context_len, 768)
        context_emb = self.context_encoder(**context_tokens).last_hidden_state
        # batch matrix multiplication, shape (batch_size, query_len, context_len)
        out = torch.bmm(question_emb, torch.transpose(context_emb, 1, 2))
        op_dim = 2
        if out.shape[0] == 2 * batch_size * neg_pairs:
            out = out.view(batch_size, 2 * neg_pairs, out.shape[1], out.shape[2])
            op_dim += 1
        # max-pooling over the context dimension
        out, _ = torch.max(out, dim=op_dim)
        # add bias
        out = out + self.bias
        # relu
        out = self.relu(out)
        # log
        out = torch.log(out + 1)
        # summation over the query dimension
        out = torch.sum(out, dim=op_dim - 1)
        return out
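For reference, this is roughly how I check whether the embedding layer moves (a minimal sketch only: the optimizer, dummy inputs, and placeholder loss below stand in for my real training setup):

import torch

model = MyNet()

# snapshot the query embedding weights before training
before = model.query_encoder.weight.detach().clone()

# placeholder training step with dummy inputs (shapes are only for illustration;
# 30522 is the bert-base-uncased vocabulary size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
query_tokens = torch.randint(0, 30522, (4, 16))             # (batch_size, query_len)
context_tokens = {
    "input_ids": torch.randint(0, 30522, (4, 32)),          # (batch_size, context_len)
    "attention_mask": torch.ones(4, 32, dtype=torch.long),
}
out = model(query_tokens, context_tokens, batch_size=4, neg_pairs=1)
loss = out.sum()                                            # placeholder loss
loss.backward()

# if this prints None or an all-zero tensor, no gradient reached the embeddings
print(model.query_encoder.weight.grad)

optimizer.step()

# compare the same words' embedding vectors before and after the step
after = model.query_encoder.weight.detach().clone()
print(torch.allclose(before, after))  # True means the embeddings did not move

With my actual training loop, the final allclose check returns True, i.e. the query embeddings never change, even though requires_grad is set on them.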