Hi ,

I have almost 300,000 records with mixed of categorical and numerical features. For most of categorical variable where cardinality is greater than 2 are embedded into 50% of those unique values , i defined layers and neurons arbitrarily as follows for classification problem 1 or 0, based on following layers and neurons i am getting loss (Cross Entropy) 0.52656052014033 at 100th epochs

My question are

- Is there anything wrong with my code?
- Is there any technique like Grid Search which i can use to chose optimal number of hidden layers, and neurons in each hidden layer?

```
testNet(
(embeddings): ModuleList(
(0): Embedding(115, 8)
(1): Embedding(119, 10)
(2): Embedding(113, 7)
(3): Embedding(120, 10)
(4): Embedding(184, 42)
(5): Embedding(116, 8)
(6): Embedding(151, 26)
(7): Embedding(161, 31)
(8): Embedding(119, 10)
(9): Embedding(399, 50)
)
(lin1): Linear(in_features=213, out_features=90, bias=True)
(lin2): Linear(in_features=90, out_features=85, bias=True)
(lin3): Linear(in_features=85, out_features=80, bias=True)
(lin4): Linear(in_features=80, out_features=75, bias=True)
(lin5): Linear(in_features=75, out_features=70, bias=True)
(lin6): Linear(in_features=70, out_features=60, bias=True)
(lin7): Linear(in_features=60, out_features=50, bias=True)
(lin8): Linear(in_features=50, out_features=40, bias=True)
(lin9): Linear(in_features=40, out_features=30, bias=True)
(lin10): Linear(in_features=30, out_features=20, bias=True)
(lin11): Linear(in_features=20, out_features=10, bias=True)
(lin12): Linear(in_features=10, out_features=5, bias=True)
(lin13): Linear(in_features=5, out_features=2, bias=True)
(bn1): BatchNorm1d(11, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm1d(90, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn3): BatchNorm1d(85, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn4): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn5): BatchNorm1d(75, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn6): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn7): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn8): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn9): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn10): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn11): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn12): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn13): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(emb_drop): Dropout(p=0.6, inplace=False)
(drops): Dropout(p=0.3, inplace=False)
)
epoch : 10,training loss : 0.6801770955721538
epoch : 20,training loss : 0.5797973778088887
epoch : 30,training loss : 0.548956808312734
epoch : 40,training loss : 0.5404320967992147
epoch : 50,training loss : 0.5338565409978231
epoch : 60,training loss : 0.5300635928471883
epoch : 70,training loss : 0.529638019879659
epoch : 80,training loss : 0.5281008475780488
epoch : 90,training loss : 0.525910607846578
epoch : 100,training loss : 0.52656052014033
```

Following are my code

```
class testNet(nn.Module):
def __init__(self, emb_dims, n_cont):
super().__init__()
self.embeddings = nn.ModuleList([nn.Embedding(categories+100, size) for categories,size in emb_dims])
no_of_embs = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
self.n_emb, self.n_cont = no_of_embs, n_cont
self.lin1 = nn.Linear(self.n_emb + self.n_cont,90)
self.lin2 = nn.Linear(90, 85)
self.lin3 = nn.Linear(85, 80)
self.lin4 = nn.Linear(80, 75)
self.lin5 = nn.Linear(75, 70)
self.lin6 = nn.Linear(70, 60)
self.lin7 = nn.Linear(60, 50)
self.lin8 = nn.Linear(50, 40)
self.lin9 = nn.Linear(40, 30)
self.lin10 = nn.Linear(30, 20)
self.lin11 = nn.Linear(20, 10)
self.lin12 = nn.Linear(10, 5)
self.lin13 = nn.Linear(5, 2)
self.bn1 = nn.BatchNorm1d(self.n_cont)
self.bn2 = nn.BatchNorm1d(90)
self.bn3 = nn.BatchNorm1d(85)
self.bn4 = nn.BatchNorm1d(80)
self.bn5 = nn.BatchNorm1d(75)
self.bn6 = nn.BatchNorm1d(70)
self.bn7 = nn.BatchNorm1d(60)
self.bn8 = nn.BatchNorm1d(50)
self.bn9 = nn.BatchNorm1d(40)
self.bn10 = nn.BatchNorm1d(30)
self.bn11 = nn.BatchNorm1d(20)
self.bn12 = nn.BatchNorm1d(10)
self.bn13 = nn.BatchNorm1d(5)
self.emb_drop = nn.Dropout(0.6)
self.drops = nn.Dropout(0.3)
def forward(self, x_cat, x_cont):
x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
x = torch.cat(x, 1)
x = self.emb_drop(x)
# batch normalization over continous features
x2 = self.bn1(x_cont)
# concatenate both embedding and continous feature , here 1 means dim
# the dimension over which the tensors are concatenated we are concatenating columns
x = torch.cat([x, x2], 1)
#x = F.relu(self.lin1(x))
m = nn.LeakyReLU(0.01)
x = m(self.lin1(x))
x = self.drops(x)
x = self.bn2(x)
#x = F.relu(self.lin2(x))
x = m(self.lin2(x))
x = self.drops(x)
x = self.bn3(x)
x = self.lin3(x)
x = self.drops(x)
x = self.bn4(x)
x = m(self.lin4(x))
x = self.drops(x)
x = self.bn5(x)
x = m(self.lin5(x))
x = self.drops(x)
x = self.bn6(x)
x = m(self.lin6(x))
x = self.drops(x)
x = self.bn7(x)
x = m(self.lin7(x))
x = self.drops(x)
x = self.bn8(x)
x = m(self.lin8(x))
x = self.drops(x)
x = self.bn9(x)
x = m(self.lin9(x))
x = self.drops(x)
x = self.bn10(x)
x = m(self.lin10(x))
x = self.drops(x)
x = self.bn11(x)
x = m(self.lin11(x))
x = self.drops(x)
x = self.bn12(x)
x = m(self.lin12(x))
x = self.drops(x)
x = self.bn13(x)
return x
```

Training function

```
def get_optimizer(model, lr = 0.001, wd = 0.0):
parameters = filter(lambda p: p.requires_grad, model.parameters())
optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
return optim
def init_weights(m):
if type(m) == nn.Linear:
nn.init.xavier_uniform_(m.weight)
if m.bias is not None:
torch.nn.init.zeros_(m.bias)
criterion = nn.CrossEntropyLoss()
to_device(criterion, device)
def train_model(model, optim, train_dl):
model.train()
total = 0
sum_loss = 0
output = 0
for cat, cont, y in train_dl:
batch = y.shape[0]
output = model(cat, cont)
_,pred = torch.max(output,1)
loss = criterion(output, y)
optim.zero_grad()
loss.backward()
optim.step()
total += batch
sum_loss += batch*(loss.item())
return sum_loss/total,pred
def train_loop(model, epochs, lr, wd=0.0):
optim = get_optimizer(model, lr = lr, wd = wd)
for epoch in range(epochs):
loss,pred = train_model(model, optim, train_dl)
if (epoch+1) % 10 ==0:
print(f'epoch : {epoch+1},training loss : {loss}')
def class_imbalance_sampler(labels):
class_count = np.array([len(np.where(labels.cpu().detach().numpy()==t)[0]) for t in np.unique(labels.cpu().detach().numpy())])
print(class_count)
weight = 1. / class_count
samples_weight = np.array([weight[t] for t in labels.cpu().detach().numpy()])
samples_weight = torch.from_numpy(samples_weight)
sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))
return sampler
y = torch.from_numpy(y_tr.to_numpy(np.int)).to(device)
sampler = class_imbalance_sampler(y)
batch_size = 512*2
train_dl = DataLoader(train_ds, batch_size=batch_size,sampler=sampler)
train_dl = DeviceDataLoader(train_dl, device)
model = testNet(embedding_sizes,11)
model.apply(init_weights)
to_device(model, device)
print(model)
from collections import defaultdict
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
train_loop(model, epochs=100, lr=0.001)
```