Hi. I’m running the code below in an AzureML virtual machine (image: mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04). It is a PyTorch Adam optimization of a mathematical formula, running in CPU mode. At a certain point the process slows down and takes 100 times longer to finish. The point at which it starts slowing down seems to be random, and it depends on the capabilities of the virtual machine I run the code on. What could be the issue here? Has anybody experienced a similar problem? On my laptop and on other virtual machines it works well, so it seems to be related to the AzureML virtual machine, but I’m not sure.
Thanks a lot in advance!
# Fit the per-source parameters in ``qt_d`` plus a scalar ``t_background`` with
# Adam so that the summed per-source prediction matches ``t_target`` under a
# mean-squared-error loss. The best (lowest-loss) parameter values seen so far
# are snapshotted, and the loop stops once ``early_stopping`` consecutive
# iterations fail to improve on the best loss.
#
# The block structure below was reconstructed from a paste that had lost its
# indentation; string quotes were restored from typographic quotes.
#
# NOTE(review): a sudden, hardware-dependent CPU slowdown in a loop like this
# is often caused by denormal (subnormal) floats, e.g. from the exp() term
# underflowing. Calling ``torch.set_flush_denormal(True)`` once before the
# loop is a possible mitigation -- confirm by profiling on the affected VM.
optimizer = optim.Adam(list(qt_d.values()) + [t_background], lr=learning_rate)
optimizer_d[key_dict_d] = optimizer
loss_fn = torch.nn.MSELoss(reduction='mean')

errors = []
cnt_stop = -1
error_optim = np.inf
qt_d_optim = dict()
t_background_optim = 0.0

# Loop invariants, hoisted so they are not rebuilt on every iteration.
zero = torch.tensor(0.0, dtype=dtype, device=device)
sqrt_two_pi = math.sqrt(2 * math.pi)
# NOTE(review): the original read this column once as "methane" and once as
# self.col_methane; this assumes both refer to the same column -- confirm.
methane_mean = df_source_points_d[self.col_methane].mean()

for t in range(n_iters):
    t_pred = torch.full((df_source_points_d_shape,), 0.0, device=device, dtype=dtype)
    if t % 1000 == 0:
        logger.info(f"CURRENT ITERATION: {t}")
        logger.info(f"ERROR OPTIM: {error_optim}")
        logger.info(f"T BACKGROUND OPTIM: {t_background_optim}")

    # Forward pass: background plus one contribution per key.
    t_pred += t_background
    for key in key_dict_l:
        sigma_y = torch.pow(torch.abs(xt_d[key] / x0 + 1e-6), b)
        numer = qt_d[key] * torch.exp(
            -0.5 * torch.pow(yt_d[key] + 1e-6, 2) / torch.pow(a_d[key] * sigma_y, 2)
        ) * m_a
        denom = u_d[key] * sqrt_two_pi * a_d[key] * sigma_y * m_ch4 * omega_a * ppb
        t_pred_aux = numer / denom
        # Replace NaNs with zero so they do not poison the sum.
        t_pred_aux = torch.where(torch.isnan(t_pred_aux), torch.zeros_like(t_pred_aux), t_pred_aux)
        t_pred_aux = t_pred_aux.type(dtype)
        # Points with xt <= 0 contribute nothing.
        t_pred_aux = torch.where(xt_d[key] <= 0.0, zero, t_pred_aux)
        t_pred += t_pred_aux

    loss = loss_fn(t_pred, t_target)
    optimizer.zero_grad(set_to_none=True)

    # Snapshot the parameters that produced the best loss so far. This happens
    # before the optimizer step, so the snapshot matches ``current_error``.
    current_error = loss.item()
    if current_error < error_optim:
        for key in key_dict_l:
            qt_d_optim[key] = qt_d[key].item()
        t_background_optim = t_background.item()
        error_optim = current_error
        cnt_stop = -1
    else:
        cnt_stop += 1
    errors.append(current_error)

    loss.backward()
    optimizer.step()

    # Project the parameters back onto their feasible region without
    # recording the projection in the autograd graph.
    with torch.no_grad():
        for key in key_dict_l:
            qt_d[key].clamp_(min=0.0)

    if cnt_stop >= early_stopping:
        break

    with torch.no_grad():
        # Only the upper bound is enforced here: the clamp runs solely when
        # the background has reached the mean of the methane column.
        if t_background >= methane_mean:
            t_background.clamp_(0.0, methane_mean)