Here is the model I use:
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class MixedDropout(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.dense_dropout = nn.Dropout(p)

    def forward(self, input):
        return self.dense_dropout(input)


class MixedLinear(nn.Module):
    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.Tensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, mode='fan_out', a=math.sqrt(5))
        if self.bias is not None:
            _, fan_out = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_out)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        if self.bias is not None:
            res = torch.addmm(self.bias, input, self.weight)
        else:
            res = input.matmul(self.weight)
        return res
class Dynamic_Neural_Network(nn.Module):
    def __init__(self, hyper_parameters_dict):
        super().__init__()
        global device
        self.input_size = hyper_parameters_dict['input_size']
        self.output_size = hyper_parameters_dict['output_size']
        self.list_hidden_layers_sizes = hyper_parameters_dict['list_hidden_layers_sizes']
        self.dropout_proba = hyper_parameters_dict['dropout_proba']
        self.last_layer_activation_func = hyper_parameters_dict['last_layer_activation_func']
        self.batch_norm_flg = hyper_parameters_dict['batch_norm_flg']
        if self.batch_norm_flg:
            bn_ls = []
            bn_ls.append(torch.nn.BatchNorm1d(self.input_size))
            for hidden_lyr_sz in self.list_hidden_layers_sizes:
                bn_ls.append(torch.nn.BatchNorm1d(hidden_lyr_sz))
            bn_ls.append(torch.nn.BatchNorm1d(self.output_size))
            self.batch_norm_layers = bn_ls
        fcs = [MixedLinear(self.input_size, self.list_hidden_layers_sizes[0])]
        for hidden_layer_idx in range(len(self.list_hidden_layers_sizes) - 1):
            fcs.append(MixedLinear(self.list_hidden_layers_sizes[hidden_layer_idx],
                                   self.list_hidden_layers_sizes[hidden_layer_idx + 1]))
        fcs.append(MixedLinear(self.list_hidden_layers_sizes[-1], self.output_size))
        self.fcs = nn.ModuleList(fcs)
        self.drop = MixedDropout(self.dropout_proba)

    def forward(self, X):
        if not self.batch_norm_flg:
            embedding = self.fcs[0](X)
            for layer_id, fc in enumerate(self.fcs[1:]):
                embedding = fc(self.drop(F.leaky_relu(embedding, negative_slope=0.1, inplace=False)))
        else:
            bn = self.batch_norm_layers[0].to(device)
            embedding = self.fcs[0](bn(X))
            for layer_id, fc in enumerate(self.fcs[1:]):
                bn = self.batch_norm_layers[layer_id + 1].to(device)
                embedding = fc(self.drop(F.leaky_relu(bn(embedding), negative_slope=0.1, inplace=False)))
        if self.last_layer_activation_func == 'softmax':
            embedding = F.softmax(embedding, dim=1)
        elif self.last_layer_activation_func == 'log_softmax':
            embedding = F.log_softmax(embedding, dim=1)
        elif self.last_layer_activation_func == 'relu':
            embedding = F.relu(embedding)
        elif self.last_layer_activation_func == 'leaky_relu':
            embedding = F.leaky_relu(embedding, negative_slope=0.1, inplace=False)
        elif self.last_layer_activation_func == 'tanh':
            embedding = torch.tanh(embedding)
        elif self.last_layer_activation_func == 'sigmoid':
            embedding = torch.sigmoid(embedding)
        else:
            print('The activation function type you inserted is not correct, '
                  'please insert either softmax, log_softmax, relu, leaky_relu, tanh, or sigmoid')
        return embedding
class DIET_Intent_Classifier(nn.Module):
    def __init__(self, nn_hyperparameters_dict):
        super().__init__()
        self.feature_extractor_net = Dynamic_Neural_Network(nn_hyperparameters_dict).to(device)

    def forward(self, batch_queries, intents_desc_embed):
        # pass the queries of the current batch to the feature extractor neural net to get h(query)
        h_queries = self.feature_extractor_net(batch_queries)
        # pass all the intent descriptions to the feature extractor neural net to get h(intent)
        h_intents = self.feature_extractor_net(intents_desc_embed)
        # calculate the dot product between each h(query) and each h(intent)
        queries_intents_similarities = torch.matmul(h_queries, h_intents.T)
        return queries_intents_similarities
class NLU_Intent_Classifier_Training():
    def __init__(self, config_obj, queries_train, intents_train, queries_test, intents_test, intents_desc_embed):
        global device
        device = config_obj.device
        # NLU intent classifier model training hyper-parameters
        self.features_extracted_size = config_obj.features_extracted_size
        self.lr = config_obj.lr
        self.weight_decay = config_obj.weight_decay
        self.max_epochs = config_obj.max_epochs
        self.batch_size = config_obj.batch_size
        self.eval_step = config_obj.eval_step
        self.early_stop = config_obj.early_stop
        self.patience = config_obj.patience
        self.batch_mult_val = config_obj.batch_mult_val
        self.list_hidden_layers_sizes = config_obj.list_hidden_layers_sizes
        self.dropout_proba = config_obj.dropout_proba
        self.last_layer_activation_func = config_obj.last_layer_activation_func
        self.loss_func = config_obj.loss_func
        self.batch_norm_flg = config_obj.batch_norm_flg
        # define our tensors for training and validation
        self.queries_train = queries_train
        self.intents_train = intents_train
        self.queries_test = queries_test
        self.intents_test = intents_test
        self.intents_desc_embed = intents_desc_embed

    def train_intent_classifier_model(self):
        global device

        def calculate_similarity_loss(batch_queries_intents_similarities, yb):
            # get the similarity of the queries with the positive intents (ground-truth labels)
            sim_pos = batch_queries_intents_similarities[torch.arange(batch_queries_intents_similarities.shape[0]).squeeze(), yb.squeeze()]
            # apply masking (-inf) before the exponent so the similarities of positive intents contribute 0
            sim_neg = batch_queries_intents_similarities
            sim_neg[torch.arange(sim_neg.shape[0]).squeeze(), yb.squeeze()] = -float("Inf")
            # loss = - avg_batch( S_pos - log( exp[S_pos] + sum_over_negative_intents( exp[S_neg] ) ) )
            inner_term = torch.exp(sim_pos) + torch.sum(torch.exp(sim_neg), dim=1)
            batch_loss_tensor = sim_pos - torch.log(inner_term)
            loss = - torch.mean(batch_loss_tensor)
            return loss

        def run_batch(model, xbs, yb, train, optimizer=None, loss='cross_entropy'):
            global device
            if train:
                model.train()
            else:
                model.eval()
            with torch.set_grad_enabled(train):
                batch_queries_intents_similarities = model(*xbs)  # shape = (batch_sz, num_of_intents)
                batch_predicted_intents = torch.argmax(batch_queries_intents_similarities, dim=1)  # shape = (batch_sz,)
                ncorrect = torch.sum(batch_predicted_intents == yb.squeeze())
                if loss == 'similarity_loss':
                    batch_loss = calculate_similarity_loss(batch_queries_intents_similarities, yb)
                else:
                    batch_proba = torch.softmax(batch_queries_intents_similarities, dim=1)
                    loss_func = nn.CrossEntropyLoss().to(device)
                    if batch_proba.shape[0] > 1:
                        batch_loss = loss_func(batch_proba, yb.squeeze())
                    else:
                        batch_loss = loss_func(batch_proba.squeeze(), yb.squeeze())
                if train:
                    optimizer.zero_grad()
                    batch_loss.backward()
                    optimizer.step()
            return batch_loss, ncorrect, batch_predicted_intents

        # train function
        def train(model, train_set_obj, val_set_obj, queries_test, lr, weight_decay, max_epochs,
                  batch_size, eval_step, early_stop, patience, batch_mult_val, loss_func):
            # instantiate a dataloader for the training set
            train_loader = torch.utils.data.DataLoader(dataset=train_set_obj,
                                                       sampler=torch.utils.data.BatchSampler(
                                                           torch.utils.data.SequentialSampler(train_set_obj),
                                                           batch_size=batch_size, drop_last=False
                                                       ),
                                                       batch_size=None, shuffle=False)
            # initialize the variables used to compare the current loss with the best loss at each evaluation step
            best_loss = np.inf
            best_test_acc = 0
            loss_hist = {'train': [], 'val': []}
            acc_hist = {'train': [], 'val': []}
            # define our optimizer
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
            for epoch in range(max_epochs):
                step = 0
                for xbs, yb in train_loader:
                    loss_batch, ncorr_batch, top1 = run_batch(model, xbs, yb, True, optimizer, loss_func)
                    step += 1
                    # evaluation is only performed every eval_step batches
                    if step % eval_step == 0:
                        val_indices = torch.arange(queries_test.shape[0])
                        val_xbs, val_yb = val_set_obj.__getitem__(val_indices)
                        val_loss, val_ncorr, val_top1 = run_batch(model, val_xbs, val_yb, False, optimizer)
                        # record the current training batch loss and batch accuracy
                        train_loss = loss_batch
                        train_acc = ncorr_batch / yb.shape[0]
                        val_acc = val_ncorr / len(val_indices)
                        loss_hist['train'].append(train_loss)
                        acc_hist['train'].append(train_acc)
                        loss_hist['val'].append(val_loss)
                        acc_hist['val'].append(val_acc)
                        if val_acc > best_test_acc:
                            best_test_acc = val_acc
                            best_epoch = epoch
                            best_state = {
                                key: value.cpu() for key, value
                                in model.state_dict().items()
                            }
                        if early_stop and epoch >= (best_epoch + patience):
                            model.load_state_dict(best_state)
                            return model, epoch + 1, loss_hist, acc_hist
                        print(f"Epoch {epoch}, step {step}: train_loss: {train_loss:.7f}, train_acc: {train_acc:.7f}, val_loss: {val_loss:.7f}, val_acc: {val_acc:.7f}")
            model.load_state_dict(best_state)
            return model, epoch + 1, loss_hist, acc_hist

        # set your training hyper-parameters
        input_dimension_size = self.queries_train.shape[1]
        nn_hyperparameters_dict = {'input_size': input_dimension_size,
                                   'output_size': self.features_extracted_size,
                                   'list_hidden_layers_sizes': self.list_hidden_layers_sizes,
                                   'dropout_proba': self.dropout_proba,
                                   'last_layer_activation_func': self.last_layer_activation_func,
                                   'batch_norm_flg': self.batch_norm_flg}
        train_set_obj = Intent_Capture_Dataset(self.queries_train, self.intents_train, self.intents_desc_embed)
        val_set_obj = Intent_Capture_Dataset(self.queries_test, self.intents_test, self.intents_desc_embed)
        intent_classifier_obj = DIET_Intent_Classifier(nn_hyperparameters_dict).to(device)
        model, epochs, loss_hist, acc_hist = train(intent_classifier_obj, train_set_obj, val_set_obj, self.queries_test, self.lr, self.weight_decay,
                                                   self.max_epochs, self.batch_size, self.eval_step, self.early_stop, self.patience,
                                                   self.batch_mult_val, self.loss_func)
        return model, epochs, loss_hist, acc_hist
I have trained the model using this configuration:
import boto3
import torch


class Config():
    def __init__(self):
        # set the endpoint names here
        self.embedding_model_endpoint_name = 'hf-textembedding-all-minilm-l6-v2-2024-05-17-05-07-36-657'
        # define some environment variables here (not dependent on each run)
        self.aws_region = boto3.Session().region_name
        self.sagemaker = boto3.client('sagemaker')
        self.runtime = boto3.client('sagemaker-runtime', region_name=self.aws_region)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NLU intent classifier model hyper-parameters
        self.features_extracted_size = 32
        self.lr = 0.001
        self.weight_decay = 1e-07
        self.max_epochs = 350
        self.batch_size = 61
        self.eval_step = 1
        self.early_stop = True  # True or False
        self.patience = 100
        self.batch_mult_val = 1
        self.list_hidden_layers_sizes = [512, 256, 128, 64]
        self.dropout_proba = 0.25
        self.last_layer_activation_func = 'tanh'  # tanh, sigmoid, softmax, relu, leaky_relu
        self.loss_func = 'similarity_loss'  # similarity_loss or cross_entropy
        self.batch_norm_flg = True
        # define our data paths
        self.annotations_path = './examples.txt'
        self.intents_description_path = './intents_description.txt'
from intent_classifier_training import NLU_Intent_Classifier_Training

config_obj = Config()
classifier_obj = NLU_Intent_Classifier_Training(config_obj, queries_train, intents_train, queries_test, intents_test, intents_desc_embed)
model, epochs, loss_hist, acc_hist = classifier_obj.train_intent_classifier_model()
and saved it in evaluation mode:
model.eval()
torch.save(model, './intent_classifier.pth')
However, when I run inference on a single example as follows:

model = torch.load('./intent_classifier.pth', map_location=config_obj.device)
model.eval()
out = model(queries_test[0].unsqueeze(0), intents_desc_embed)
It produces this error:
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 384])
Note that the same line of code works properly when the input contains more than one example, as in the sketch below.
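For reference, a batched call of this form runs without error (a minimal sketch; the slice size, the pred_intents name, and the argmax step are illustrative additions, assuming queries_test and intents_desc_embed are the tensors prepared above):

# the same forward pass, but with a batch of several query embeddings
out = model(queries_test[:8], intents_desc_embed)   # similarity scores, shape (8, num_intents)
# illustrative: pick the highest-similarity intent for each query
pred_intents = torch.argmax(out, dim=1)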
I don't understand why this issue occurs. I set the model to evaluation mode before saving it, and again after loading it, yet the error message mentions training. Is there something wrong in my implementation?
I'm using torch 2.0.0 with Python 3.10 on a CPU-optimized AWS SageMaker instance.