I am sorry for a long delay… I encountered weird situation when simplyfing my code for this session. It has a Trained model (Model_A) is used in a cascade chain, to train Model_B.

Q1. My question was : how to set Model_A not be trained while training Model_B, but it must be within the data flow, that is, it should support back propagation for Model_B training.

Q2.Now I encountered a new situation. when the device is set ‘cpu’, it shows no error. I have not checked if it actually works well, but at least it runs without en error.

However, when device is set to ‘cuda’ it shows the error that I originally asked (“cudnn RNN backward can only be called in training mode”)…Of course I have a gpu device.

Please check out my working code and give me any suggestion:

```
```

```
import torch
import torch.nn as nn
import torch.optim as optim
from numpy import *
class Net_1(nn.Module):
def __init__(self, emb_input_size, rnn_input_size, hidden_size, num_layers, dropout_prob, batch_first=True):
super(Net_1, self).__init__()
self.rnn_layer = nn.RNN(rnn_input_size, hidden_size, num_layers,batch_first = batch_first)
self.linear1 = nn.Linear(hidden_size, rnn_input_size) #
def forward(self, x):
batch_size = 50
x_rnn_input = x.view(batch_size,-1,rnn_input_size)
output_rnn, hidden_rnn = self.rnn_layer(x_rnn_input)
output = self.linear1(output_rnn)
return output.view(x.shape[0],-1) # maintain same shape as the input
#####################
# Hyper-parameters for Model_A
#####################
hyper_parameters = [1, 10**2, 1e-2, 8, 3, 0.1]
[num_datasets, num_epochs,learning_rate,hidden_size,num_hidden_layers,dropout_prob] = hyper_parameters
emb_input_size = 0
rnn_input_size = 2
#####################
# Setup Model for Model_A
#####################
Model_A = Net_1(emb_input_size, rnn_input_size, hidden_size, num_hidden_layers, dropout_prob, batch_first=True)
device = torch.device("cuda" if (torch.cuda.is_available())else "cpu")
device = "cpu"
Model_A.to(device)
#####################
# Loss Model for Model_A
#####################
loss_func_A = nn.MSELoss(reduction='sum')
optimizer_A = optim.Adam(params = Model_A.parameters(), lr = learning_rate)
#####################
# Data stream
#####################
x_data = (random.randn(1,1000))
y_data = x_data-0.01*x_data**3
x_chunk = (torch.tensor(x_data,dtype = torch.float)).to(device) # x_data and y_data are numpy float of [1,1000]
y_chunk = (torch.tensor(y_data,dtype = torch.float)).to(device)
#####################
# Training Model_A
#####################
for step2 in range(num_epochs):
optimizer_A.zero_grad()
output_A = Model_A(x_chunk)
loss_A = loss_func_A(output_A.view(x_chunk.shape[0],-1), y_chunk.view(x_chunk.shape[0],-1)) # compare total output with the target
loss_A.backward()
optimizer_A.step()
Model_A.eval() # evaluation mode
#####################
# Hyper-parameters for Model_B
#####################
emb_input_size = 0
rnn_input_size = 2
hyper_parameters = [1, 1*(10**2), 1e-2, 24, 2, 0.1]
[num_datasets, num_epochs,learning_rate,hidden_size,num_hidden_layers,dropout_prob] = hyper_parameters
#####################
# Setup Model B
#####################
Model_B = Net_1(emb_input_size, rnn_input_size, hidden_size, num_hidden_layers, dropout_prob, batch_first=True)
Model_B = Model_B.to(device)
Model_B.train() # set Model_B for training
loss_func_B = nn.MSELoss(reduction='sum')
optimizer_B = optim.Adam(params = Model_B.parameters(), lr = learning_rate)
#####################
# Training for Model B
#####################
for step in range(num_epochs):
optimizer_B.zero_grad()
z_A = Model_B(x_chunk) # x->model_B-> z_A ->model_A (=x)
output_A = Model_A(z_A) # get model_A output
loss_B = loss_func_B(output_A.view(x_chunk.shape[0],-1), x_chunk.view(x_chunk.shape[0],-1)) # compare total output with the target sentence
loss_B.backward()
optimizer_B.step()
Model_A.eval()
Model_B.eval() # stop training
print("#--- Progrma End ---#")
```