Hi, I’m trying to train a small model to Downsize the Dimension(let’s call it DD model for convenience) of existing GPT3 embeddings. This is because GPT3 embedding has 1536 and our application uses embedding with a dimension of 100. The motivation is that such a model can pick the most essential 100 dims that are sensitive to our own problem, therefore we can use such a DD model to fix the OOV issues w2v has, i.e. when there is a new word, we can first get a 1536 dim embedding from GPT3 and then we can convert it to 100 use such DD model.
However I’m running some inference issues, would be great if anyone can help.
The model is DynamicDimsionEmbedding
in below
class DDTrain(nn.Embedding):
def __init__(self, vocab, *args, **kwargs):
super(DDTrain, self).__init__(*args, **kwargs)
with torch.no_grad():
for skill, idx in vocab.get_stoi().items():
# read GPT3 embedding into initial embedding
# I already constructed a pytorch vocab at this point
emd = embedding_from_string(skill)
self.weight[idx] = torch.tensor(emd, requires_grad=False)
# CNN layers
self.features = nn.Sequential(
nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1),
nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3, stride=3, padding=1),
nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, stride=2, padding=1),
nn.Conv1d(in_channels=4, out_channels=1, kernel_size=3, stride=2, padding=1)
)
# FC layers
self.downsize_fc = nn.Linear(in_features=128, out_features=100)
self.weight = nn.Parameter(self.weight.unsqueeze(dim=1))
self.weight = nn.Parameter(self.features(self.weight))
self.weight = nn.Parameter(self.downsize_fc(self.weight))
self.weight = nn.Parameter(self.weight.squeeze(dim=1))
class CBOW_Model(nn.Module):
def __init__(self, vocab):
super(CBOW_Model, self).__init__()
vocab_size = len(vocab)
self.embeddings = DDTrain(
vocab=vocab,
num_embeddings=vocab_size,
embedding_dim=EMBED_DIMENSION,
max_norm=EMBED_MAX_NORM,
)
self.linear = nn.Linear(
in_features=100,
out_features=vocab_size,
)
def forward(self, inputs_):
x = self.embeddings(inputs_)
x = x.mean(axis=1)
x = self.linear(x)
return x
During inference, I take out the Sequential and the FC layers in DDTrain
and construct it as
class DDInference(nn.Module):
def __init__(self):
super(DDInference, self).__init__()
self.features = nn.Sequential(
nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1),
nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3, stride=3, padding=1),
nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, stride=2, padding=1),
nn.Conv1d(in_channels=4, out_channels=1, kernel_size=3, stride=2, padding=1)
)
self.downsize_fc = nn.Linear(in_features=128, out_features=100)
def forward(self, x):
x = self.features(x)
x = self.downsize_fc(x)
return x
and then read the trained weights from the state_dict
of DDTrain.
with torch.no_grad():
downsize_model.features[0].weight.copy_(state_dict['embeddings.features.0.weight'])
downsize_model.features[0].bias.copy_(state_dict['embeddings.features.0.bias'])
downsize_model.features[1].weight.copy_(state_dict['embeddings.features.1.weight'])
downsize_model.features[1].bias.copy_(state_dict['embeddings.features.1.bias'])
downsize_model.features[2].weight.copy_(state_dict['embeddings.features.2.weight'])
downsize_model.features[2].bias.copy_(state_dict['embeddings.features.2.bias'])
downsize_model.features[3].weight.copy_(state_dict['embeddings.features.3.weight'])
downsize_model.features[3].bias.copy_(state_dict['embeddings.features.3.bias'])
downsize_model.downsize_fc.weight.copy_(state_dict['embeddings.downsize_fc.weight'])
downsize_model.downsize_fc.bias.copy_(state_dict['embeddings.downsize_fc.bias'])
But, if I send in an input of 1536-dim embedding to the DDInference
, I won’t be able to get the same 100-dim embedding output as I got from the state_dict in DDTrain
after training.
Any help is appreciated!