I am trying to add a POS embedding to the BERT transformer embedding, so the dimension of the POS embedding should be 768. Please suggest how to do this.
Here is what I use in my projects (I removed the segment embedding).
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

N_MAX_POSITIONS = 512  # maximum input sequence length


def Embedding(num_embeddings, embedding_dim, padding_idx=None):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def create_sinusoidal_embeddings(n_pos, dim, out):
    """Fill `out` (n_pos x dim) with the sinusoidal position encoding in place."""
    position_enc = np.array([
        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        for pos in range(n_pos)
    ])
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    out.requires_grad = False


class Embeddings(nn.Module):
    """token + position embedding"""

    def __init__(self, n_words, embed_dim, padding_idx=None, sinusoidal_embeddings=True, eps=1e-12, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.token_embeddings = Embedding(n_words, embed_dim, padding_idx=padding_idx)
        self.position_embeddings = Embedding(N_MAX_POSITIONS, embed_dim)
        if sinusoidal_embeddings:
            # no_grad avoids "RuntimeError: a view of a leaf Variable that requires grad
            # is being used in an in-place operation."
            with torch.no_grad():
                create_sinusoidal_embeddings(N_MAX_POSITIONS, embed_dim, out=self.position_embeddings.weight)
        self.layer_norm_emb = nn.LayerNorm(embed_dim, eps=eps)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, positions=None):
        """
        Inputs:
            `x` LongTensor(bs, slen), containing word indices
            `positions` LongTensor(bs, slen), containing word positions
        """
        bs, slen = x.size()
        x = self.token_embeddings(x)  # bs x slen x embed_dim
        # "Attention Is All You Need", section 3.4 (Embeddings and Softmax):
        # "In the embedding layers, we multiply those weights by sqrt(d_model)"
        # x = x * torch.sqrt(torch.tensor(self.embed_dim, dtype=torch.float32))

        # positions
        if positions is None:
            positions = x.new(slen).long()
            positions = torch.arange(slen, out=positions).unsqueeze(0)  # 1 x slen, broadcast over the batch below
        else:
            assert positions.size() == (bs, slen)

        x = x + self.position_embeddings(positions).expand_as(x)  # bs x slen x embed_dim
        x = self.layer_norm_emb(x)  # bs x slen x embed_dim
        x = self.dropout(x)  # bs x slen x embed_dim
        return x
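For reference, the table built by create_sinusoidal_embeddings is the encoding from "Attention Is All You Need": out[pos, 2i] = sin(pos / 10000^(2i/dim)) and out[pos, 2i+1] = cos(pos / 10000^(2i/dim)). A quick sanity check on a small dimension, just to illustrate:

import math

# compare one entry of the sinusoidal table with the closed-form formula
dim = 4
pe = torch.zeros(N_MAX_POSITIONS, dim)
create_sinusoidal_embeddings(N_MAX_POSITIONS, dim, out=pe)
pos, i = 3, 1
assert abs(pe[pos, 2 * i].item() - math.sin(pos / 10000 ** (2 * i / dim))) < 1e-6
assert abs(pe[pos, 2 * i + 1].item() - math.cos(pos / 10000 ** (2 * i / dim))) < 1e-6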
You can use the Embeddings module as follows:
vocab_size = 10
embed_dim = 768

embedding = Embeddings(n_words=vocab_size, embed_dim=embed_dim, padding_idx=0, sinusoidal_embeddings=True)

bs, slen = 2, 5
torch.manual_seed(0)
x = torch.empty(bs, slen, dtype=torch.long).random_(vocab_size - 1)  # random word indices
embed = embedding(x)  # positions are generated automatically

# An example with explicit positional indices
positions = torch.arange(start=0, end=slen, step=1).expand_as(x)  # bs x slen, each row is [0, 1, ..., slen - 1]
embed = embedding(x, positions)
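If what you actually have are contextual embeddings coming out of a pretrained BERT (hidden size 768), you can also add the sinusoidal table directly on top of them. A minimal sketch, assuming bert_hidden stands in for a FloatTensor of shape bs x slen x 768 that you obtained from your BERT model (the name is just a placeholder):

hidden_size = 768
pe = torch.zeros(N_MAX_POSITIONS, hidden_size)
create_sinusoidal_embeddings(N_MAX_POSITIONS, hidden_size, out=pe)

bert_hidden = torch.randn(2, 5, hidden_size)  # stand-in for real BERT output (bs x slen x 768)
bert_hidden = bert_hidden + pe[: bert_hidden.size(1)].unsqueeze(0)  # broadcast over the batch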