Hi guys and girls,
Newbie to pytorch, more experienced with Keras, GBM, … but curious about performance and power of pytorch, so decided to dive into PyTorch.
I picked a shared code for a DAE + MLP from Kaggle competition (TabApr), and reapplied it (somehow successfully) to April’s competition.
I went in depth into the code to understand it and to see what I could tweak and learn from PyTorch.
I’ve seen a curious occurrence of “nan” during the training process (after 50 epochs) which puzzles me, and I’m hoping you can help me understand why this is occurring.
What I’ve tried (some of the items below might delay the “explosion” of nan, but they don’t solve it):
- Normalisation of the dataset (it was already normalised with L1, but just in case I also tried L2)
- `drop_last=True` on the DataLoader
- Noiser with fixed mask probabilities
- A low eps in the model
While debugging, I can see that the nan values appear during training, inside the train function:
- The first time it appears, it is around the mid-to-end part of the dataset (batch ±50 of 79) [all batches after that also return nan]
- Once an epoch gets any batch with nan, all batches in the following epochs are nan as well
- `outputs = model(dae.feature(inputs))` [outputs get the nan values]
- `dae.feature(inputs)` does not produce any nan
- `inputs` doesn’t contain any nan
Running out of ideas what could be causing this. Like I said I’ve tried a couple of things that delay the “explosion” or accelerate it, but in my mind, I can’t understand and would like to understand what is causing this.
I will share a snapshot of the code which I think is most relevant; feel free to ask for more detailed info, or for me to share it by other means.
First post here, so eager to learn.
Thanks in advance
Simplified Main (for readability)
# Simplified training driver: build the DAE feature extractor, then train,
# validate and run test inference with a fresh MLP head for every CV fold.
#
# NOTE(review): this excerpt does not define `optimizer`, `scheduler`,
# `testloader`, `predictions`, `batch_size`, `device`, `model_name`,
# `num_features`, `num_targets` or `hidden_size`; they must exist before the
# loop runs. Since `model` is re-created per fold, `optimizer`/`scheduler`
# presumably need to be re-created per fold as well -- confirm.
dae = TransformerAutoEncoder(
    num_inputs=X.shape[1],
    n_cats=n_cats,
    n_nums=n_nums,
    hidden_size=SETUP['hidden_size'],
    num_subspaces=SETUP['num_subspaces'],
    embed_dim=SETUP['embed_dim'],
    num_heads=SETUP['num_heads'],
    dropout=SETUP['dropout'],
    feedforward_dim=SETUP['feedforward_dim'],
    emphasis=SETUP['emphasis'],
    task_weights=SETUP['task_weights'],
    mask_loss_weight=SETUP['mask_loss_weight']
)
skf = StratifiedKFold(n_splits=SETUP['nfolds'], random_state=SETUP['random_seed'], shuffle=True)
# NOTE(review): the split uses `X[:n_training]` / `Y` while the datasets index
# `X[:len_train]` / `y[:len_train]` -- verify these refer to the same rows.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X[:n_training], Y)):
    train_dataset = FeatureDataset(X[:len_train][train_idx], y[:len_train][train_idx])
    valid_dataset = FeatureDataset(X[:len_train][valid_idx], y[:len_train][valid_idx])
    # NOTE(review): with drop_last disabled, a final training batch of size 1
    # would make BatchNorm1d raise in train mode -- confirm the batch sizes.
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)#, drop_last=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    # MLP Model
    model = Model(num_features=num_features, num_targets=num_targets, hidden_size=hidden_size)
    model.to(device)
    loss_fn = nn.BCEWithLogitsLoss()
    loss_tr = nn.BCEWithLogitsLoss()
    for epoch in range(SETUP['epochs']):
        train_loss = train_fn(dae, model, optimizer, scheduler, loss_tr, trainloader,epoch, device)
        valid_loss, valid_preds = valid_fn(dae, model, loss_fn, validloader, device)
        # The scheduler is stepped on the validation loss once per epoch.
        scheduler.step(valid_loss)
    # reload model with best result and predict Xtest
    # NOTE(review): the code that saves this checkpoint is not part of the excerpt.
    model.load_state_dict(torch.load(f"MODEL_{model_name}_FOLD_{fold}_.pth"))
    model.to(device)
    del trainloader, validloader, train_dataset, valid_dataset
    gc.collect()
    # Test predictions are accumulated (summed) across folds.
    predictions += inference_fn(dae, model, testloader, device)
Train (where nan’s are seen) / Eval / Predict custom functions
def train_fn(dae, model, optimizer, scheduler, loss_fn, dataloader, epoch, device=SETUP['device']):
    """Train ``model`` for one epoch on swap-noised inputs encoded by ``dae``.

    Args:
        dae: Feature extractor exposing ``eval()`` and ``feature(inputs)``.
        model: Network being trained; fed the output of ``dae.feature``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        scheduler: Unused here; kept so existing call sites stay valid.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        epoch: Zero-based epoch index, used to decay the swap-noise level.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch loss values over the epoch.
    """
    dae.eval()
    model.train()
    final_loss = 0
    # The noise level decays geometrically with the epoch number.
    noise_maker = SwapNoiseMasker(SETUP['mlp_start_noise'] * (SETUP['mlp_noise_decay'] ** epoch))
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
        # Only the corrupted inputs are needed; the swap mask is discarded.
        inputs, _ = noise_maker.apply(inputs)
        # NOTE(review): `dae` is in eval mode but its forward pass still runs
        # with autograd enabled. If the DAE is meant to be frozen, wrapping
        # `dae.feature` in torch.no_grad() would save memory -- confirm.
        outputs = model(dae.feature(inputs))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        final_loss += loss.item()
    final_loss /= len(dataloader)
    return final_loss
def valid_fn(dae, model, loss_fn, dataloader, device=SETUP['device']):
    """Evaluate ``model`` on a validation loader.

    Args:
        dae: Feature extractor exposing ``eval()`` and ``feature(inputs)``.
        model: Network to evaluate; fed the output of ``dae.feature``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(mean_loss, preds)`` where ``mean_loss`` is the mean of the
        per-batch losses and ``preds`` is a NumPy array of the sigmoid of the
        model outputs, concatenated over all batches.
    """
    dae.eval()
    model.eval()
    final_loss = 0
    valid_preds = []
    # No gradients are needed for evaluation; this matches inference_fn and
    # avoids building an autograd graph for every validation batch.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(dae.feature(inputs))
            loss = loss_fn(outputs, targets)
            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().cpu().numpy())
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    return final_loss, valid_preds
def inference_fn(dae, model, dataloader, device=SETUP['device']):
    """Predict probabilities for every sample yielded by ``dataloader``.

    Args:
        dae: Feature extractor exposing ``eval()`` and ``feature(inputs)``.
        model: Trained network; fed the output of ``dae.feature``.
        dataloader: Iterable of dicts with an ``'x'`` tensor.
        device: Device the batch tensors are moved to.

    Returns:
        A flat (1-D) NumPy array holding the sigmoid of the model outputs,
        concatenated over all batches.
    """
    dae.eval()
    model.eval()
    preds = []
    with torch.no_grad():
        for data in dataloader:
            inputs = data['x'].to(device)
            outputs = model(dae.feature(inputs))
            preds.append(outputs.sigmoid().cpu().numpy())
    # Flatten with -1: a literal reshape(1,) only works for a single prediction.
    preds = np.concatenate(preds).reshape(-1)
    return preds
MLP Model
class Model(nn.Module):
    """Three-stage MLP head applied to the DAE features.

    The first two stages are BatchNorm -> Dropout -> weight-normalised Linear
    -> ReLU. The last stage is BatchNorm -> Dropout -> weight-normalised
    Linear with no activation, so ``forward`` returns raw scores; the training
    script pairs them with ``BCEWithLogitsLoss`` and applies ``sigmoid`` when
    predicting.

    Args:
        num_features: Width of the input feature vector.
        num_targets: Number of output units.
        hidden_size: Width of both hidden layers.
        dropout: Dropout probability for all three dropout layers. ``None``
            (default) reads ``SETUP['mlp_dropout']``, as before.
        bn_eps: ``eps`` for all three BatchNorm layers. The pasted source read
            ``1e15`` (most likely ``1e-15`` with the minus sign lost). Taken
            literally, ``1e15`` swamps any batch variance and squashes the
            normalised activations to ~0; ``1e-15`` lets ``1/sqrt(var + eps)``
            reach ~3e7 for a constant feature. The default is therefore
            PyTorch's own ``1e-5``; pass another value to override.
    """

    def __init__(self, num_features=3000, num_targets=1, hidden_size=1000, dropout=None, bn_eps=1e-5):
        super().__init__()
        if dropout is None:
            dropout = SETUP['mlp_dropout']

        self.batch_norm1 = nn.BatchNorm1d(num_features, eps=bn_eps)
        self.dropout1 = nn.Dropout(dropout)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size, eps=bn_eps)
        self.dropout2 = nn.Dropout(dropout)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size, eps=bn_eps)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Return the un-activated outputs for a ``(batch, num_features)`` input."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # No activation on the last layer: the outputs are used as logits.
        x = self.dense3(x)
        return x
DAE Model
class TransformerEncoder(torch.nn.Module):
    """Single post-norm transformer encoder layer.

    Self-attention followed by a two-layer ReLU feed-forward network, each
    wrapped in a residual connection and a LayerNorm.

    Args:
        embed_dim: Size of the last input dimension.
        num_heads: Number of attention heads.
        dropout: Dropout probability passed to ``MultiheadAttention``.
        feedforward_dim: Hidden width of the feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, dropout, feedforward_dim):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.linear_1 = torch.nn.Linear(embed_dim, feedforward_dim)
        self.linear_2 = torch.nn.Linear(feedforward_dim, embed_dim)
        self.layernorm_1 = torch.nn.LayerNorm(embed_dim)
        self.layernorm_2 = torch.nn.LayerNorm(embed_dim)

    def forward(self, x_in):
        """Encode ``x_in`` and return a tensor of the same shape."""
        # Self-attention: the input serves as query, key and value.
        attn_out, _ = self.attn(x_in, x_in, x_in)
        x = self.layernorm_1(x_in + attn_out)
        ff_out = self.linear_2(torch.nn.functional.relu(self.linear_1(x)))
        x = self.layernorm_2(x + ff_out)
        return x
class TransformerAutoEncoder(torch.nn.Module):
    """Denoising autoencoder built from three stacked transformer encoders.

    The input row is projected to ``hidden_size``, split into
    ``num_subspaces`` chunks of ``embed_dim`` and passed through three
    ``TransformerEncoder`` layers. Two linear heads then predict the
    corruption mask and reconstruct the input.

    Args:
        num_inputs: Number of input columns.
        n_cats: Number of leading columns treated as categorical in ``loss``.
        n_nums: Number of trailing columns treated as numerical in ``loss``.
        hidden_size: Projection width; must equal ``embed_dim * num_subspaces``.
        num_subspaces: Number of chunks the hidden vector is split into.
        embed_dim: Width of each chunk.
        num_heads: Attention heads per encoder layer.
        dropout: Attention dropout probability.
        feedforward_dim: Feed-forward width inside each encoder layer.
        emphasis: Loss weight for corrupted cells; uncorrupted cells get
            ``1 - emphasis``.
        task_weights: Two weights (categorical, numerical), normalised to sum
            to 1. ``None`` (default) reads ``SETUP['task_weights']`` when the
            model is constructed.
        mask_loss_weight: Multiplier applied to the mask-prediction loss.
    """

    def __init__(
        self,
        num_inputs,
        n_cats,
        n_nums,
        hidden_size=1024,
        num_subspaces=8,
        embed_dim=128,
        num_heads=8,
        dropout=0,
        feedforward_dim=512,
        emphasis=.75,
        task_weights=None,
        mask_loss_weight=2,
    ):
        super().__init__()
        if task_weights is None:
            # Resolved at construction time rather than at class-definition time.
            task_weights = SETUP['task_weights']
        assert hidden_size == embed_dim * num_subspaces
        self.n_cats = n_cats
        self.n_nums = n_nums
        self.num_subspaces = num_subspaces
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.emphasis = emphasis
        self.task_weights = np.array(task_weights) / sum(task_weights)
        self.mask_loss_weight = mask_loss_weight

        self.excite = torch.nn.Linear(in_features=num_inputs, out_features=hidden_size)
        self.encoder_1 = TransformerEncoder(embed_dim, num_heads, dropout, feedforward_dim)
        self.encoder_2 = TransformerEncoder(embed_dim, num_heads, dropout, feedforward_dim)
        self.encoder_3 = TransformerEncoder(embed_dim, num_heads, dropout, feedforward_dim)

        self.mask_predictor = torch.nn.Linear(in_features=hidden_size, out_features=num_inputs)
        self.reconstructor = torch.nn.Linear(in_features=hidden_size + num_inputs, out_features=num_inputs)

    def divide(self, x):
        """Reshape ``(batch, hidden_size)`` to ``(num_subspaces, batch, embed_dim)``."""
        batch_size = x.shape[0]
        x = x.reshape((batch_size, self.num_subspaces, self.embed_dim)).permute((1, 0, 2))
        return x

    def combine(self, x):
        """Inverse of ``divide``: flatten ``(subspaces, batch, embed)`` to ``(batch, -1)``."""
        batch_size = x.shape[1]
        # -1 lets reshape infer the flattened width; a literal 1 would only
        # work for a tensor holding a single value per row.
        x = x.permute((1, 0, 2)).reshape((batch_size, -1))
        return x

    def forward(self, x):
        """Return ``((x1, x2, x3), (reconstruction, predicted_mask))``.

        ``x1``..``x3`` are the outputs of the three encoder layers, still in
        the ``divide`` layout; the two head outputs have ``num_inputs`` columns.
        """
        x = torch.nn.functional.relu(self.excite(x))

        x = self.divide(x)
        x1 = self.encoder_1(x)
        x2 = self.encoder_2(x1)
        x3 = self.encoder_3(x2)
        x = self.combine(x3)

        predicted_mask = self.mask_predictor(x)
        # The reconstructor sees the encoding together with the predicted mask.
        reconstruction = self.reconstructor(torch.cat([x, predicted_mask], dim=1))
        return (x1, x2, x3), (reconstruction, predicted_mask)

    def split(self, t):
        """Split columns of ``t`` into ``(categorical, numerical)`` parts."""
        return torch.split(t, [self.n_cats, self.n_nums], dim=1)

    def feature(self, x):
        """Return all encoder and head outputs concatenated column-wise.

        The result has ``3 * hidden_size + 2 * num_inputs`` columns: the three
        flattened encoder outputs followed by the reconstruction and the
        predicted mask.
        """
        encoder_outs, head_outs = self.forward(x)
        attn_outs = torch.cat([self.combine(out) for out in encoder_outs], dim=1)
        heads = torch.cat(list(head_outs), dim=1)
        return torch.cat([attn_outs, heads], dim=1)

    def loss(self, x, y, mask, reduction='mean'):
        """Compute the denoising loss for corrupted input ``x`` against clean ``y``.

        Args:
            x: Corrupted input rows.
            y: Clean rows, same shape as ``x``.
            mask: 1 where a cell of ``x`` was corrupted, 0 elsewhere.
            reduction: ``'mean'`` returns one scalar (reconstruction + mask
                loss); any other value returns
                ``[reconstruction_loss, mask_loss]``.
        """
        # NOTE(review): `bce_logits` and `mse` are module-level names that are
        # not part of this excerpt -- presumably aliases of
        # torch.nn.functional.binary_cross_entropy_with_logits and mse_loss.
        _, (reconstruction, predicted_mask) = self.forward(x)
        x_cats, x_nums = self.split(reconstruction)
        y_cats, y_nums = self.split(y)
        # Corrupted cells are weighted by `emphasis`, clean cells by 1 - emphasis.
        w_cats, w_nums = self.split(mask * self.emphasis + (1 - mask) * (1 - self.emphasis))

        cat_loss = self.task_weights[0] * torch.mul(w_cats, bce_logits(x_cats, y_cats, reduction='none'))
        num_loss = self.task_weights[1] * torch.mul(w_nums, mse(x_nums, y_nums, reduction='none'))

        reconstruction_loss = torch.cat([cat_loss, num_loss], dim=1) if reduction == 'none' else cat_loss.mean() + num_loss.mean()
        mask_loss = self.mask_loss_weight * bce_logits(predicted_mask, mask, reduction=reduction)

        return reconstruction_loss + mask_loss if reduction == 'mean' else [reconstruction_loss, mask_loss]
class SwapNoiseMasker(object):
    """Corrupt a batch by swapping cells with the same column of another row.

    Args:
        probas: Swap probability; anything ``np.array`` accepts that
            multiplies against the batch shape.
    """

    def __init__(self, probas):
        self.probas = torch.from_numpy(np.array(probas))

    def apply(self, X):
        """Return ``(corrupted_X, mask)`` for batch ``X``.

        ``mask`` is 1.0 where the corrupted value differs from the original,
        so a swap that lands on an identical value is not counted as corrupted.
        """
        # Draw, per cell, whether it should be swapped. The ones tensor is
        # allocated directly on X's device instead of on CPU and then moved.
        should_swap = torch.bernoulli(self.probas.to(X.device) * torch.ones(X.shape, device=X.device))
        # Swapped cells take their value from a random permutation of the rows.
        corrupted_X = torch.where(should_swap == 1, X[torch.randperm(X.shape[0])], X)
        # The mask we aim to predict: which cells actually changed.
        mask = (corrupted_X != X).float()
        return corrupted_X, mask
DataSet Structure
class FeatureDataset:
    """Map-style dataset pairing feature rows with their targets.

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
        targets: Array indexed as ``targets[idx]``.
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features row, 'y': target}`` as float32 tensors."""
        return {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx], dtype=torch.float),
        }
class TestFeatureDataset:
    """Map-style dataset of feature rows only (no targets), used for inference.

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features row}`` as a float32 tensor."""
        return {'x': torch.tensor(self.features[idx, :], dtype=torch.float)}