#Model Architecture
class Model_1(torch.nn.Module):
    """Dual-pipeline 1-D depthwise-separable convolutional feature extractor.

    ``forward(x, x2)`` runs two independent pipelines:
        x  : pipeline 1, expects (batch, 769, 63)  -> returns (batch, 96, 32)
        x2 : pipeline 2, expects (batch, 2307, 63) -> returns (batch, 288, 32)

    The input sequence length (63) is pinned by the LayerNorm normalized
    shapes: 63 - 9 = 54, 54 - 18 = 36 (dilation 2, k=10), 36 - 4 = 32
    (dilation 4, k=2).
    """

    def __init__(self):
        super().__init__()
        self.batch_size = 128  # NOTE(review): not read anywhere in this class

        # --- Pipeline 1: depthwise conv + pointwise conv, halving channels each stage ---
        self.conv1d_down_1_depth = nn.Conv1d(769, 769, kernel_size=10, stride=1, groups=769, padding=0)
        self.conv1d_down_1_point = nn.Conv1d(769, 384, kernel_size=1, stride=1, padding=0)
        self.bn_1 = nn.LayerNorm([384, 54])
        self.relu = nn.ReLU()
        self.conv1d_down_2_depth = nn.Conv1d(384, 384, kernel_size=10, stride=1, groups=384, dilation=2, padding=0)
        self.conv1d_down_2_point = nn.Conv1d(384, 192, kernel_size=1, stride=1)
        self.bn_2 = nn.LayerNorm([192, 36])
        self.conv1d_down_3_depth = nn.Conv1d(192, 192, kernel_size=2, stride=1, groups=192, dilation=4, padding=0)
        self.conv1d_down_3_point = nn.Conv1d(192, 96, kernel_size=1, stride=1)
        self.bn_3 = nn.LayerNorm([96, 32])

        # --- Pipeline 2: identical topology at 3x the channel widths ---
        self.pip_conv_1d = nn.Conv1d(2307, 2307, kernel_size=10, stride=1, groups=2307, padding=0)
        self.pip_conv_1p = nn.Conv1d(2307, 1152, kernel_size=1, stride=1, padding=0)
        self.bn_pip_1 = nn.LayerNorm([1152, 54])
        self.pip_conv_2d = nn.Conv1d(1152, 1152, kernel_size=10, stride=1, groups=1152, dilation=2, padding=0)
        self.pip_conv_2p = nn.Conv1d(1152, 576, kernel_size=1, stride=1, padding=0)
        self.bn_pip_2 = nn.LayerNorm([576, 36])
        self.pip_conv_3d = nn.Conv1d(576, 576, kernel_size=2, stride=1, groups=576, dilation=4, padding=0)
        self.pip_conv_3p = nn.Conv1d(576, 288, kernel_size=1, stride=1, padding=0)
        self.bn_pip_3 = nn.LayerNorm([288, 32])

        self.drp_1 = nn.Dropout(p=0.2)
        self.drp = nn.Dropout(p=0.5)  # NOTE(review): never used in forward()

    def forward(self, x, x2):
        # Stage 1 (both pipelines): depthwise -> ReLU -> pointwise -> ReLU -> LayerNorm -> dropout
        x = self.relu(self.conv1d_down_1_depth(x))
        x = self.bn_1(self.relu(self.conv1d_down_1_point(x)))
        x = self.drp_1(x)
        x2 = self.relu(self.pip_conv_1d(x2))
        x2 = self.bn_pip_1(self.relu(self.pip_conv_1p(x2)))
        x2 = self.drp_1(x2)
        # Stage 2
        x = self.relu(self.conv1d_down_2_depth(x))
        x = self.bn_2(self.relu(self.conv1d_down_2_point(x)))
        x = self.drp_1(x)
        x2 = self.relu(self.pip_conv_2d(x2))
        x2 = self.bn_pip_2(self.relu(self.pip_conv_2p(x2)))
        x2 = self.drp_1(x2)
        # Stage 3 (no dropout after the final stage)
        x = self.relu(self.conv1d_down_3_depth(x))
        x = self.bn_3(self.relu(self.conv1d_down_3_point(x)))
        x2 = self.relu(self.pip_conv_3d(x2))
        x2 = self.bn_pip_3(self.relu(self.pip_conv_3p(x2)))
        # x: output of pipeline 1, x2: output of pipeline 2
        return x, x2
class Model_3(torch.nn.Module):
    """Gaussian head: maps a 384-dim feature vector to 14 (mean, variance) pairs.

    The 28 output units are split in half; the second half is passed through
    Softplus and shifted by a small constant (10e-7, i.e. 1e-6 — NOTE(review):
    confirm 1e-7 wasn't intended) so the variance stays strictly positive,
    as required by a negative log-likelihood loss.

    NOTE(review): there is no activation between fc_2 and fc_3, so they
    compose into a single linear map — verify this is intentional.
    """

    def __init__(self):
        super().__init__()
        self.batch_size = 128  # unused here; kept for parity with the other models
        self.drp = nn.Dropout(p=0.5)
        self.fc_1 = nn.Linear(384, 96)
        self.fc_2 = nn.Linear(96, 48)
        self.fc_3 = nn.Linear(48, 28)
        self.softplus = nn.Softplus()

    def forward(self, x):
        hidden = self.drp(self.fc_1(x))
        out = self.fc_3(self.fc_2(hidden))
        # First 14 units are the mean, last 14 the raw (unconstrained) variance.
        mean = out[:, :14]
        variance = self.softplus(out[:, 14:]) + 10e-7
        return mean, variance
class Ensemble(torch.nn.Module):
    """Wires the two pipeline outputs of ``model1`` into the ``model2`` head.

    forward(ch1, ch2): runs model1 on both inputs, concatenates the two
    (batch, C, 32) pipeline outputs along channels, average-pools over the
    full length-32 axis, flattens to (batch, C_total), and returns the
    (mean, variance) pair produced by model2.
    """

    def __init__(self, model1, model2, bs):
        super().__init__()
        # Retained for backward compatibility; the reshape below no longer
        # depends on it (previously a mismatched batch crashed unless
        # callers manually patched .batch_size).
        self.batch_size = bs
        self.model_a = model1
        self.model_c = model2
        self.avgpool = nn.AvgPool1d(32, stride=1)

    def forward(self, ch1, ch2):
        x, x2 = self.model_a(ch1, ch2)
        x = torch.cat((x, x2), dim=1)   # (batch, C1 + C2, 32)
        x = self.avgpool(x)             # pool the whole length axis -> (batch, C, 1)
        # Fix: infer the batch dimension from the tensor instead of the
        # hard-coded self.batch_size, so any batch size works at inference.
        x = x.reshape(x.size(0), -1)
        mean, variance = self.model_c(x)
        return mean, variance
# --- Training driver -------------------------------------------------------
# NOTE(review): `train_data`, `val_data`, `path`, `train`, `val`, and
# m1..m5 are defined elsewhere in the project; this chunk assumes them in
# scope. Indentation below is reconstructed from statement order — the
# pasted original had lost its whitespace and would not parse.
train_dl = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=0, drop_last=True)
val_dl = DataLoader(val_data, batch_size=128, shuffle=True, num_workers=0, drop_last=True)

net_1 = Model_1().to(torch.device("cuda"))
net_3 = Model_3().to(torch.device("cuda"))
net = Ensemble(net_1, net_3, 128).to(torch.device("cuda"))
optimizer = optim.Adam(net.parameters(), lr=0.0001)

# Per-epoch / per-batch bookkeeping arrays saved to disk each epoch.
ar_loss = []
batch_loss_ar = []
total_batch_idx = 0
val_data_ar = []
acc_data_ar = []
save_best_val = 0
adcc = np.zeros((1, 14))
track_var = np.zeros((1, 14))
local_dt_sp = np.zeros((1, 44))

for epoch in range(100):
    ar_loss, batch_loss_ar, adcc, track_var = train(net, train_dl, optimizer, epoch, ar_loss, batch_loss_ar)
    val_data_ar, acc_data_ar, local_dt_sp = val(net, val_dl, optimizer, epoch, val_data_ar, acc_data_ar)
    np.save(path + "mlh_ar_loss.npy", ar_loss)
    np.save(path + "mlh_batch_loss_ar.npy", batch_loss_ar)
    np.save(path + "mlh_val_data_ar.npy", val_data_ar)
    np.save(path + "mlh_acc_data_ar.npy", acc_data_ar)
    np.save(path + "mlh_bnf_track_" + str(epoch) + "_var_.npy", track_var)
    # Save the best model (lowest validation loss so far).
    if epoch == 0:
        save_best_val = val_data_ar[-1]
        np.save(path + "mlh_dummy_input_mean_sh.npy", adcc)
    elif save_best_val > val_data_ar[-1]:
        # Debug probe: run the sub-model directly on constant inputs.
        # NOTE(review): two likely hazards here — net_1 was moved to CUDA
        # but torch.ones(...) defaults to CPU (confirm m1..m5 / device
        # handling), and net_1 is still in train() mode, so Dropout stays
        # active during this probe; call net_1.eval() first for a faithful
        # inference-time check.
        debug_1, debug_2 = net_1(torch.ones(m1, m2, m3), torch.ones(m1, m4, m5))
        print(debug_1)  # Do not get nan
        print(debug_2)  # Get nan values , while training its not the case , trains without nan's
        torch.save(
            {'model_dict_1': net_1.state_dict(), 'model_dict_3': net_3.state_dict(), 'model_dict_ens': net.state_dict(),
             'optimizer_dic': optimizer.state_dict(), 'epoch': epoch, 'loss': val_data_ar[-1]},
            path + "mlh_tas_save_best_sh.pt")
        save_best_val = val_data_ar[-1]
    np.save(path + "mlh_bnf_mag_96ms_" + str(epoch) + ".npy", local_dt_sp)
So if I call net_1(torch.ones(m1, m2, m3), torch.ones(m1, m4, m5)), I get NaN in the x2 output, while the x output contains no NaNs.
Notably, while training the same model I get no NaNs in either x or x2.
I also checked the model while running only the second pipeline, and found that the problem occurs only in that pipeline.
The loss function here is the negative log-likelihood loss.
Debugging further, I found that the NaN values first appear in the output of the self.pip_conv_1p layer in pipeline 2.
# Model Loading Script -------------------------------------------------------
# NOTE(review): here `net` is the module that defines the model classes
# (shadowing the Ensemble instance name used in the training script above).
device = torch.device("cpu")
model_1 = net.Model_1().to(device="cpu")
model_3 = net.Model_3().to(device="cpu")
model_4 = net.Ensemble(model_1, model_3, 1).to(device="cpu")
model_4.batch_size = 1  # redundant with the constructor argument; kept for safety
chkp = torch.load(path + "mlh_tas_save_best_sh.pt", map_location=device)
model_1.load_state_dict(chkp['model_dict_1'])
model_3.load_state_dict(chkp['model_dict_3'])
model_4.load_state_dict(chkp['model_dict_ens'])
# Fix: switch to evaluation mode before inference. Without this the Dropout
# layers stay active and randomly perturb activations, so loaded-model
# outputs are neither deterministic nor comparable to training-time
# validation results.
model_4.eval()