I have a simple LSTM model that I want to run through Hyperopt to find optimal hyperparameters. I can already run my model and optimize my learning rate, batch size, and even the hidden dimension and number of layers, but I don't know how I can change my model structure inside my objective function. What I want to do now is add dense layers based on the number of LSTM layers my model has. For example, my model could have 2 LSTM layers and 2 dense layers, or 1 LSTM layer and 3 dense layers, or only 4 LSTM layers.
The only solution I could think of would be to implement several different classes and let Hyperopt choose one, but that would require me to implement a lot of different classes, and I am sure there is a nicer way to do this.
class Model_GRU(nn.Module):
    """GRU sequence classifier.

    Consumes batch-first sequences of shape (batch, seq_len, n_features)
    and returns unnormalized class scores of shape (batch, n_classes).

    Backward-compatible generalization: ``n_dense`` (default 0) adds a
    configurable stack of hidden Linear+ReLU layers before the final
    projection, so a hyperparameter search can vary the head depth without
    needing a separate model class per architecture. With ``n_dense=0`` the
    head is the original single ``nn.Linear`` (state-dict keys unchanged).
    """

    def __init__(self, n_features, n_classes, n_hidden, n_layers, n_dense=0):
        super().__init__()
        # NOTE: nn.GRU applies inter-layer dropout only when n_layers > 1
        # (PyTorch emits a warning for a single layer).
        self.gru = nn.GRU(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.75,
        )
        # NOTE(review): this parameter is never used in forward(); it is kept
        # only so existing checkpoints (state dicts) still load. Consider
        # removing it once old checkpoints are no longer needed.
        weight = torch.zeros(n_layers, n_hidden)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        if n_dense == 0:
            # Original behavior: direct projection to class scores.
            self.classifier = nn.Linear(n_hidden, n_classes)
        else:
            # Variable-depth head: n_dense hidden Linear+ReLU blocks,
            # then the final projection to n_classes.
            head = []
            for _ in range(n_dense):
                head.append(nn.Linear(n_hidden, n_hidden))
                head.append(nn.ReLU())
            head.append(nn.Linear(n_hidden, n_classes))
            self.classifier = nn.Sequential(*head)

    def init_hidden(self, batch_size=1):
        """Return a zeroed (hidden_state, cell_state) pair.

        Bug fix: the original read a global ``batch_size`` that is not
        defined in this scope, raising NameError at call time; it is now an
        explicit argument. The cell state is returned only for LSTM-style
        API compatibility — nn.GRU has no cell state and never uses it.
        """
        hidden_state = torch.zeros(self.gru.num_layers, batch_size, self.gru.hidden_size)
        cell_state = torch.zeros(self.gru.num_layers, batch_size, self.gru.hidden_size)
        return (hidden_state, cell_state)

    def forward(self, x):
        """Classify each sequence in the batch.

        The original assigned ``self.hidden = self.init_hidden()`` here,
        which crashed on the undefined global ``batch_size`` and was never
        read afterwards; the dead (and broken) call is removed. nn.GRU
        zero-initializes its hidden state by default.
        """
        # nn.GRU returns (output, h_n); h_n[-1] is the final hidden state of
        # the last layer, shape (batch, n_hidden).
        _, hidden = self.gru(x)
        out = hidden[-1]
        return self.classifier(out)
# Hyperopt search space.
# Scale-free quantities (learning rate, weight decay) get log-uniform priors;
# sizes are discrete choices; scheduler periods use quantized uniforms.
space = dict(
    learning_rate=hp.loguniform('learning_rate', np.log(0.001), np.log(1)),
    n_hidden=hp.choice('n_hidden', [4, 8, 16, 32, 64, 128]),
    batch_size=hp.choice('batch_size', [2, 4, 8, 16]),
    weight_decay=hp.loguniform('weight_decay', np.log(0.00001), np.log(0.1)),
    eta_min=hp.quniform('eta_min', 0.00001, 0.01, 0.00001),
    T_mult=hp.choice('T_mult', [1, 2, 3]),
    T_0=hp.quniform('T_0', 10, 500, 20),
)
# Hyperopt Opt
# NOTE(review): `objective`, `space`, and `trials_obj` must already be defined
# when this line executes — in this snippet `objective` is defined *below*
# this call, which would raise NameError if run top-to-bottom as written.
num_eval = 3  # number of TPE evaluations (small; raise for a real search)
best_param = fmin(objective, space, algo=tpe.suggest, max_evals=num_eval, trials=trials_obj)
# Objective function
def objective(params):
global best_val_loss
print("New Params")
input_dim = feature_dim+features_added
output_dim = 3
n_hidden=params['n_hidden']
n_layers=3
lr = params['learning_rate']
n_epochs = 4
iterations_per_epoch = 190
best_val_loss = 100
patience, trials = 30, 0
batch_size = params['batch_size']
weight_decay= params['weight_decay']
T_0 = int(params['T_0'])
T_mult= params['T_mult']
eta_min= params['eta_min']
model = Model_GRU(input_dim, output_dim, n_hidden, n_layers)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opt, T_0=T_0, T_mult=T_mult, eta_min=eta_min, last_epoch=-1)
for fold, (train_idx, test_idx) in enumerate(kfold.split(sequences, y_kfold)):
.
.
.