Hello. I am trying to write training code with two models (A, B) and two optimizers (A', B').
Model A is an autoencoder that reconstructs the input data, and its encoder produces a latent variable (z).
Model B takes that latent variable (z) as its new input and performs a classification task.
I set up a separate optimizer for each model, but I get the error below.
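To make the intended data flow concrete, here is a minimal sketch of the two-model / two-optimizer pattern I am aiming for. Everything in it (the toy nn.Linear stand-ins, the names enc/dec/clf, and the detach() on z) is a simplified placeholder or an assumption on my part, not my actual code; the real model and training code follow below.

import torch
import torch.nn as nn
import torch.nn.functional as F

enc = nn.Linear(8, 4)   # stands in for AE_Encoder
dec = nn.Linear(4, 8)   # stands in for AE_Decoder
clf = nn.Linear(4, 3)   # stands in for CausalGCN

opt_ae = torch.optim.Adam(list(enc.parameters()) + list(dec.parameters()), lr=1e-5)
opt_clf = torch.optim.Adam(clf.parameters(), lr=1e-5)

x = torch.randn(16, 8)            # dummy input batch
y = torch.randint(0, 3, (16,))    # dummy class labels

# step 1: update the autoencoder on the reconstruction loss
opt_ae.zero_grad()
z = enc(x)                        # latent variable z
recon = dec(z)
ae_loss = F.mse_loss(recon, x)
ae_loss.backward()
opt_ae.step()

# step 2: update the classifier on z
# Here I detach z so the classification loss does not flow back into the
# encoder. In my real code below I do NOT detach, and I am not sure whether
# that is related to the error.
opt_clf.zero_grad()
logits = clf(z.detach())
clf_loss = F.cross_entropy(logits, y)
clf_loss.backward()
opt_clf.step()

In my real code below, z (node_feat) goes into the classifier without detach(), and both backward() calls use retain_graph=True.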
Model and training code
import time
import torch
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as PyGDataLoader

# AE_Encoder, AE_Decoder, CausalGCN, nll_gaussian, kl_gaussian_sem, DAG_constraint,
# args, adj_A, variable_size, pred_A, train_loader and device are defined elsewhere in my code.

lr = 0.00001
wd = 0.000005
E_thr = 0.03
epoch = 5000

DAG_encoder = AE_Encoder(input_dim=1,                  # x_dims
                         output_dim=args.feature_dim,  # int(z_dims)
                         hidden_dim=args.hidden,       # encoder_hidden
                         init_A=adj_A,
                         G_thr=args.G_thr
                         ).to(device)
DAG_decoder = AE_Decoder(input_dim=args.feature_dim,   # x_dims
                         output_dim=1,                 # int(z_dims)
                         hidden_dim=args.hidden,       # encoder_hidden
                         ).to(device)
CAL = CausalGCN(args.feature_dim, args.num_classes, args).to(device)

DAG_optimizer = optim.Adam(list(DAG_encoder.parameters()) + list(DAG_decoder.parameters()),
                           lr=lr, weight_decay=wd)
CAL_optimizer = optim.Adam(CAL.parameters(), lr=lr, weight_decay=wd)
for e in range(epoch):
    start = time.time()
    CAL.train()
    DAG_encoder.train()
    DAG_decoder.train()

    for table_, label in train_loader:
        table_, label = table_.to(device), label.to(device)
        table_ = Variable(table_)

        # ----- autoencoder (DAG) step -----
        DAG_optimizer.zero_grad()
        node_feat, weighted_init_A, Wa = DAG_encoder(table_)
        weighted_z, output = DAG_decoder(node_feat, weighted_init_A, Wa)

        variance = 0.0
        DAG_NLL_loss = nll_gaussian(output, table_, variance)        # reconstruction accuracy loss
        DAG_KL_loss = kl_gaussian_sem(node_feat)                     # KL loss
        DAG_value = DAG_constraint(weighted_init_A, variable_size)   # DAG constraint
        DAG_cons_loss = args.loss_weight * DAG_value + (
            0.5 * DAG_value * DAG_value +
            100.0 * torch.trace(weighted_init_A * weighted_init_A)
        )
        DAG_loss = DAG_NLL_loss + DAG_KL_loss + DAG_cons_loss

        DAG_loss.backward(retain_graph=True)
        DAG_optimizer.step()

        # ----- build per-sample graphs from the latent z for the classifier -----
        correct_causal = 0
        bsz_graph = []
        edge_index = pred_A.nonzero().t()
        for b in range(min(len(node_feat), args.bsz)):
            pred_graph = Data(feat=node_feat[b].float(), edge_index=edge_index, y=label[b])
            bsz_graph.append(pred_graph)
        graph_loader = PyGDataLoader(bsz_graph, batch_size=args.bsz, shuffle=False)

        # ----- classifier (CAL) step -----
        for graph_ in graph_loader:
            CAL_optimizer.zero_grad()
            subgraph_n_logits, subgraph_c_logits, subgraph_i_logits = CAL(graph_, eval_random=True)

            C_target = graph_.y
            N_target = torch.ones_like(subgraph_n_logits, dtype=torch.float).to(device) / args.num_classes

            N_loss = F.kl_div(subgraph_n_logits, N_target, reduction='batchmean')  # non-causal
            C_loss = F.nll_loss(subgraph_c_logits, C_target.long())                # causal
            I_loss = F.nll_loss(subgraph_i_logits, C_target.long())                # intervention

            CAL_loss = 0.5 * N_loss + 1.0 * C_loss + 0.5 * I_loss
            CAL_loss.backward(retain_graph=True)   # <-- this is where the error is raised
            CAL_optimizer.step()
Error message
--> 152 CAL_loss.backward(retain_graph=True)
153 CAL_optimizer.step()
155 # -------------------------------------- valid ----------------------------------------
File ~/anaconda3/lib/python3.9/site-packages/torch/_tensor.py:396, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
387 if has_torch_function_unary(self):
388 return handle_torch_function(
389 Tensor.backward,
390 (self,),
(...)
394 create_graph=create_graph,
395 inputs=inputs)
--> 396 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File ~/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py:173, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
168 retain_graph = create_graph
170 # The reason we repeat same the comment below is that
171 # some Python versions print out the first line of a multi-line function
172 # calls in the traceback and some print out the last line
--> 173 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
174 tensors, grad_tensors_, retain_graph, create_graph, inputs,
175 allow_unreachable=True, accumulate_grad=True)
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.DoubleTensor [128, 10]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!