How to combine word embeddings and POS embeddings in an NER model

I am trying to add a POS embedding to the BERT transformer embeddings, so the dimension of the POS embedding should be 768.
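
Roughly, what I have in mind is the sketch below (the tag-set size, the pos_tag_ids tensor and the random stand-in for BERT's output are placeholders, not working code from my project):

import torch
import torch.nn as nn

n_pos_tags = 18                                      # placeholder: size of my POS tag set
pos_embedding = nn.Embedding(n_pos_tags, 768)        # same dimension as BERT's hidden size

bert_output = torch.randn(2, 5, 768)                 # stand-in for BERT's last hidden state (bs, slen, 768)
pos_tag_ids = torch.randint(0, n_pos_tags, (2, 5))   # one POS tag id per token

combined = bert_output + pos_embedding(pos_tag_ids)  # (bs, slen, 768), to feed the NER layer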

Please suggest an approach.

Here is what I use in my projects (I removed the segment embedding).

import torch    
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

N_MAX_POSITIONS = 512  # maximum input sequence length

def Embedding(num_embeddings, embedding_dim, padding_idx=None):
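    # nn.Embedding with weights initialised from N(0, std=embedding_dim^-0.5); the padding row (if any) is zeroed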
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m

def create_sinusoidal_embeddings(n_pos, dim, out):
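    # Sinusoidal position encoding from "Attention Is All You Need":
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / dim))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / dim))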
    position_enc = np.array([
        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        for pos in range(n_pos)
    ])
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    out.requires_grad = False

class Embeddings(nn.Module):
    """token + position embedding""" 
    def __init__(self, n_words, embed_dim, padding_idx = None, sinusoidal_embeddings = True, eps=1e-12, dropout = 0.1):
        super().__init__()
        self.embed_dim = embed_dim

        self.token_embeddings = Embedding(n_words, embed_dim, padding_idx = padding_idx)

        self.position_embeddings = Embedding(N_MAX_POSITIONS, embed_dim)
        if sinusoidal_embeddings:
            with torch.no_grad():  # avoids "RuntimeError: a view of a leaf Variable that requires grad is being used in an in-place operation"
                create_sinusoidal_embeddings(N_MAX_POSITIONS, embed_dim, out=self.position_embeddings.weight)

        self.layer_norm_emb = nn.LayerNorm(embed_dim, eps=eps)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, positions = None):
        """
        Inputs:
            `x` LongTensor(bs, slen), containing word indices
            `positions` LongTensor(bs, slen), containing word positions
        """
        bs, slen = x.size()
        
        x = self.token_embeddings(x) # bs x slen x embed_dim
        # Attention Is All You Need, section 3.4 (Embeddings and Softmax): in the embedding layers, the weights are multiplied by sqrt(d_model)
        # x = x * torch.sqrt(torch.tensor(self.embed_dim, dtype=torch.float32))

        # positions
        if positions is None:
            positions = torch.arange(slen, dtype=torch.long, device=x.device).unsqueeze(0)  # 1 x slen, broadcast to bs x slen below
        else:
            assert positions.size() == (bs, slen)
        x = x + self.position_embeddings(positions).expand_as(x) # bs x slen x embed_dim
        
        x = self.layer_norm_emb(x) # bs x slen x embed_dim
        x = self.dropout(x) # bs x slen x embed_dim

        return x

You can use it as follows:

vocab_size = 10
embed_dim = 768
embedding = Embeddings(n_words = vocab_size, embed_dim = embed_dim, padding_idx = 0, sinusoidal_embeddings = True) 

bs, slen = 2, 5
torch.manual_seed(0)
x = torch.randint(low=0, high=vocab_size, size=(bs, slen))  # random token ids in [0, vocab_size - 1]

embed = embedding(x)

# An example where the positions are passed explicitly
positions = torch.arange(slen).unsqueeze(0).expand_as(x)  # bs x slen, each row is [0, 1, ..., slen - 1]
embed = embedding(x, positions)
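
In both cases embed has shape (bs, slen, embed_dim), i.e. torch.Size([2, 5, 768]) here. As a quick sanity check, and to sketch how this would feed an NER model, you can put a token-classification layer on top (the n_labels value and the ner_head name below are placeholders added for illustration, not part of the module above):

print(embed.shape)  # torch.Size([2, 5, 768]) : one 768-dimensional vector per token

n_labels = 9                               # placeholder: e.g. the number of BIO tags
ner_head = nn.Linear(embed_dim, n_labels)
logits = ner_head(embed)                   # bs x slen x n_labels, one score per tag per token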