Hello,
I got the error “Function ‘ScaledDotProductEfficientAttentionBackward0’ returned nan values in its 0th output.” while training a custom model.
The exception is not triggered immediately; it only appears after the model has been training for many epochs.
At first I thought it was a problem with my input data, so I checked the tensors fed into the model for invalid values (x.isnan().any() or x.isinf().any()) and found none. I also verified that the forward pass and the loss value contain no invalid values.
However, when loss.backward() runs, the gradients of some of the model’s parameters become nan, and I don’t know why. So I enabled torch.autograd.set_detect_anomaly(True) to track down the exception, and it reports “Function ‘ScaledDotProductEfficientAttentionBackward0’ returned nan values in its 0th output.” The failing op is inside a sublayer of TransformerEncoderLayer, which is an official module.
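For reference, the checks I ran look roughly like this (a minimal sketch; model, batch, target, and loss_fn are placeholders for my actual network, training batch, labels, and loss function):

import torch

torch.autograd.set_detect_anomaly(True)  # report which op produced nan/inf during backward

def assert_finite(t, name):
    # the isnan/isinf checks mentioned above
    assert not t.isnan().any(), name + " contains nan"
    assert not t.isinf().any(), name + " contains inf"

assert_finite(batch, "input")            # input data is clean
out = model(batch)
assert_finite(out, "forward output")     # forward pass is clean
loss = loss_fn(out, target)
assert_finite(loss, "loss")              # loss is clean

loss.backward()
# ...yet after backward some parameter gradients are already nan:
for n, p in model.named_parameters():
    if p.grad is not None and p.grad.isnan().any():
        print("nan grad in", n)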
My code snippet is below:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_value_


class PositionalEncodingLayer(nn.Module):
    def __init__(self, d_model, max_len=10000):
        super(PositionalEncodingLayer, self).__init__()
        self.d_model = d_model
        self.max_len = max_len
        self.pos_encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        self.pos_encoding[:, 0::2] = torch.sin(position * div_term)
        self.pos_encoding[:, 1::2] = torch.cos(position * div_term)
        self.pos_encoding = self.pos_encoding.unsqueeze(0)

    def forward(self, x):
        x = x * math.sqrt(self.d_model + 1e-8)
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError("Sequence length exceeds maximum length")
        else:
            pos_enc = self.pos_encoding[:, :seq_len, :]
            x = x + pos_enc.to(x.device)
        return x
class TransformerNet(nn.Module):
    def __init__(
        self, input_size, d_model, output_size, activation: str = "leaky_relu"
    ):
        super(TransformerNet, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation
        self.negative_slope = 0.01
        self.batch_norm = torch.nn.BatchNorm2d(
            num_features=self.input_size, affine=False
        )
        self.encoder_layer = nn.Linear(self.input_size, d_model)
        self.pos_encoding = PositionalEncodingLayer(d_model)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=8,
            dim_feedforward=4 * d_model,
            dropout=0.1,
            activation="gelu",
            layer_norm_eps=1e-05,
            batch_first=True,
        )
        self.fc_layer = nn.Linear(d_model, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, features)
        Returns:
            output (torch.Tensor): (batch_size, output_size)
        """
        # Normalize
        _shape = x.shape  # (batch_size, seq_len, features)
        x = x.contiguous().view(-1, _shape[-1])  # (batch_size * seq_len, features)
        x = x.unsqueeze(-1).unsqueeze(-1)  # (batch_size * seq_len, features, 1, 1)
        x = (
            self.batch_norm(x).squeeze(-1).squeeze(-1)
        )  # shape: (batch_size * seq_len, features)
        x = x.contiguous().view(_shape)  # (batch_size, seq_len, features)
        x = self.encoder_layer(x)  # (batch_size, seq_len, d_model)
        x = self.pos_encoding(x)  # (batch_size, seq_len, d_model)
        seq_len = x.shape[1]
        attn_mask = nn.Transformer.generate_square_subsequent_mask(
            seq_len, self.device
        ).bool()
        # Transformer Encoder Layer
        x = self.transformer_encoder_layer(
            x, src_mask=attn_mask, is_causal=True
        )  # (batch_size, seq_len, d_model)
        x = x[:, -1, :]  # (batch_size, d_model)
        # FC layer
        if self.activation == "relu":
            output = F.relu(self.fc_layer(x))
        elif self.activation == "leaky_relu":
            output = F.leaky_relu(self.fc_layer(x), self.negative_slope)
        else:
            raise ValueError("Unknown activation function " + str(self.activation))
        return output
class FCNet(nn.Module):
    def __init__(self, input_size, output_size, activation: str = "leaky_relu"):
        super(FCNet, self).__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.input_size = input_size
        self.output_size = output_size
        self.fc_layer = nn.Linear(self.input_size, self.output_size)
        self.activation = activation
        self.negative_slope = 0.01
        self.init_weights()

    def zscore(self, x):
        mean = x.mean()
        std = x.std()
        z_score = (x - mean) / (std + 1e-8)
        return z_score

    def init_weights(self):
        for name, param in self.fc_layer.named_parameters():
            if "weight" in name:
                if self.activation == "relu":
                    nn.init.kaiming_normal_(param, nonlinearity=self.activation)
                elif self.activation == "leaky_relu":
                    nn.init.kaiming_normal_(
                        param, a=self.negative_slope, nonlinearity=self.activation
                    )
                else:
                    raise ValueError(
                        "Unknown activation function " + str(self.activation)
                    )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): (batch_size, features)
        Returns:
            output (torch.Tensor): (batch_size, output_size)
        """
        # Z-Score
        x = self.zscore(x)  # shape: (batch_size, features)
        # FC layer
        if self.activation == "relu":
            output = F.relu(self.fc_layer(x))
        elif self.activation == "leaky_relu":
            output = F.leaky_relu(self.fc_layer(x), self.negative_slope)
        else:
            raise ValueError("Unknown activation function " + str(self.activation))
        return output
class QActor(nn.Module):
    def __init__(
        self,
        num_actions: int,
        feature_5_size: int,
        feature_1_size: int,
        feature_4_size: int,
        feature_2_size: int,
        feature_3_size: int,
        feature_extract_size: int = 1024,
        hidden_layers: tuple = (1024,),
        d_model: int = 512,
        activation: str = "leaky_relu",
        **kwargs,
    ):
        super(QActor, self).__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_actions = num_actions
        self.feature_5_size = feature_5_size
        self.feature_1_size = feature_1_size
        self.feature_4_size = feature_4_size
        self.feature_2_size = feature_2_size
        self.feature_3_size = feature_3_size
        self.activation = activation
        self.negative_slope = 0.01
        self.feature_extractor_1 = TransformerNet(
            input_size=self.feature_1_size,
            d_model=d_model,
            output_size=feature_extract_size,
            activation=activation,
        )
        self.feature_extractor_2 = TransformerNet(
            input_size=self.feature_2_size,
            d_model=d_model,
            output_size=feature_extract_size,
            activation=activation,
        )
        self.feature_extractor_3 = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=-1),
            FCNet(
                input_size=self.feature_3_size,
                output_size=512,
                activation=activation,
            ),
            FCNet(
                input_size=512,
                output_size=feature_extract_size,
                activation=activation,
            ),
        )
        self.feature_extractor_4 = FCNet(
            input_size=self.feature_4_size,
            output_size=feature_extract_size,
            activation=activation,
        )
        self.feature_extractor_5 = FCNet(
            input_size=self.feature_5_size,
            output_size=feature_extract_size,
            activation=activation,
        )
        # FC layers
        self.fc_layers = nn.ModuleList()
        input_size = feature_extract_size
        last_hidden_layer_size = input_size
        if hidden_layers is not None:
            nh = len(hidden_layers)
            self.fc_layers.append(nn.Linear(input_size, hidden_layers[0]))
            for i in range(1, nh):
                self.fc_layers.append(nn.Linear(hidden_layers[i - 1], hidden_layers[i]))
            last_hidden_layer_size = hidden_layers[nh - 1]
        self.fc_layers.append(nn.Linear(last_hidden_layer_size, self.num_actions))
        self.init_weights()

    def init_weights(self):
        for name, param in self.fc_layers.named_parameters():
            if "weight" in name:
                if self.activation == "relu":
                    nn.init.kaiming_normal_(param, nonlinearity=self.activation)
                elif self.activation == "leaky_relu":
                    nn.init.kaiming_normal_(
                        param, a=self.negative_slope, nonlinearity=self.activation
                    )
                else:
                    raise ValueError(
                        "Unknown activation function " + str(self.activation)
                    )

    def forward(
        self,
        feature_1: torch.Tensor,
        feature_4: torch.Tensor,
        feature_2: torch.Tensor,
        feature_3: torch.Tensor,
        feature_5: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            feature_1 (torch.Tensor): (batch_size, seq_len, features1)
            feature_5 (torch.Tensor): (batch_size, features2)
        Returns:
        """
        # ---------------------------------------------------------------- #
        feature = self.feature_extractor_1(
            feature_1
        )  # (batch_size, feature_extract_size)
        # ---------------------------------------------------------------- #
        # Create a mask that marks all-zero samples as False and non-zero samples as True
        mask = ~feature_2.eq(0).all(dim=1).all(dim=1)
        # Apply the mask to select the samples to process
        feature_2_state_selected = feature_2[mask]
        feature_2_state_selected_output = self.feature_extractor_2(
            feature_2_state_selected
        )
        feature_2_output = torch.zeros(
            feature_2.shape[0],
            feature_2_state_selected_output.shape[1],
            device=self.device,
        )  # (batch_size, feature_extract_size)
        feature_2_output[mask] = feature_2_state_selected_output
        feature = feature + feature_2_output
        # ---------------------------------------------------------------- #
        mask = ~feature_3.eq(0).all(dim=1).all(dim=1)
        feature_3_selected = feature_3[mask]
        feature_3_selected_output = self.feature_extractor_3(feature_3_selected)
        feature_3_output = torch.zeros(
            feature_3.shape[0], feature_3_selected_output.shape[1], device=self.device
        )  # (batch_size, feature_extract_size)
        feature_3_output[mask] = feature_3_selected_output
        feature = feature + feature_3_output
        # ---------------------------------------------------------------- #
        feature_4_output = self.feature_extractor_4(
            feature_4
        )  # (batch_size, feature_extract_size)
        feature = feature + feature_4_output
        # ---------------------------------------------------------------- #
        feature_5_output = self.feature_extractor_5(
            feature_5
        )  # (batch_size, feature_extract_size)
        feature = feature + feature_5_output
        # ---------------------------------------------------------------- #
        x = feature  # (batch_size, feature_extract_size)
        num_layers = len(self.fc_layers)
        for i in range(0, num_layers - 1):
            if self.activation == "relu":
                x = F.relu(self.fc_layers[i](x))
            elif self.activation == "leaky_relu":
                x = F.leaky_relu(self.fc_layers[i](x), self.negative_slope)
            else:
                raise ValueError("Unknown activation function " + str(self.activation))
        Q = self.fc_layers[-1](x)
        return Q

    def get_batchnorm_params(self):
        params = [
            self.feature_extractor_1.batch_norm.running_mean,
            self.feature_extractor_1.batch_norm.running_var,
            self.feature_extractor_2.batch_norm.running_mean,
            self.feature_extractor_2.batch_norm.running_var,
        ]
        return params
class TransformerAgent(object):
    def __init__(
        self,
        num_actions,
        feature_5_size,
        feature_1_size,
        feature_4_size,
        feature_2_size,
        feature_3_size,
        window_size,
        save_dir,
        actor_class=QActor,
        actor_kwargs={},
        epsilon_initial=1.0,
        epsilon_final=0.05,
        epsilon_steps=1000000,
        batch_size=64,
        gamma=0.99,
        tau_actor=0.01,
        replay_memory_size=2048,
        learning_rate_actor=0.001,
        initial_memory_threshold=0,
        loss_func=F.smooth_l1_loss,
        clip_grad=10.0,
        device="cuda" if torch.cuda.is_available() else "cpu",
        name="TransformerAgent",
        ckpt_path=None,
        seed=None,
    ):
        #######......######
        self.actor = actor_class(
            self.num_actions,
            feature_5_size,
            feature_1_size,
            feature_4_size,
            feature_2_size[1],
            feature_3_size[1] * feature_3_size[0],
            **actor_kwargs,
        ).to(device)
        self.actor_target = actor_class(
            self.num_actions,
            feature_5_size,
            feature_1_size,
            feature_4_size,
            feature_2_size[1],
            feature_3_size[1] * feature_3_size[0],
            **actor_kwargs,
        ).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()
        # l1_smooth_loss performs better, but the original paper used MSE
        self.loss_func = loss_func
        self.actor_optimiser = optim.AdamW(
            self.actor.parameters(), lr=self.learning_rate_actor, weight_decay=0.01
        )
        episodes = epsilon_steps * 10
        self.actor_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.actor_optimiser, T_max=episodes * window_size, eta_min=0.000001
        )

    def _optimize_td_loss(self, pipe_child):
        if self._step < self.batch_size or self._step < self.initial_memory_threshold:
            return
        (
            feature_1,
            feature_4,
            feature_2,
            feature_3,
            feature_5,
            actions,
            rewards,
            next_feature_1,
            next_feature_4,
            next_feature_2,
            next_feature_3,
            next_feature_5,
            terminals,
        ) = self.replay_memory.sample(self.batch_size, random_machine=self.np_random)
        rewards_tensor = torch.from_numpy(rewards).to(self.device).squeeze()
        feature_1_tensor = torch.from_numpy(feature_1).to(self.device)
        feature_4_tensor = torch.from_numpy(feature_4).to(self.device)
        feature_2_tensor = torch.from_numpy(feature_2).to(self.device)
        feature_3_tensor = torch.from_numpy(feature_3).to(self.device)
        feature_5_tensor = torch.from_numpy(feature_5).to(self.device)
        next_feature_1_tensor = torch.from_numpy(next_feature_1).to(self.device)
        next_feature_4_tensor = torch.from_numpy(next_feature_4).to(self.device)
        next_feature_2_tensor = torch.from_numpy(next_feature_2).to(self.device)
        next_feature_3_tensor = torch.from_numpy(next_feature_3).to(self.device)
        next_feature_5_tensor = torch.from_numpy(next_feature_5).to(self.device)
        actions_tensor = torch.from_numpy(actions).to(self.device, dtype=torch.int64)
        with torch.no_grad():
            pred_Q_a = self.actor_target(
                next_feature_1_tensor,
                next_feature_4_tensor,
                next_feature_2_tensor,
                next_feature_3_tensor,
                next_feature_5_tensor,
            )
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
            target = rewards_tensor + self.gamma * Qprime
        q_values = self.actor(
            feature_1_tensor,
            feature_4_tensor,
            feature_2_tensor,
            feature_3_tensor,
            feature_5_tensor,
        )
        y_predicted = q_values.gather(1, actions_tensor.view(-1, 1)).squeeze()
        y_expected = target
        loss_Q = self.loss_func(y_predicted, y_expected)
        self.actor_optimiser.zero_grad()
        ###############################
        loss_Q.backward()
        ###############################
        if self.clip_grad > 0:
            clip_grad_value_(self.actor.parameters(), self.clip_grad)
        self.actor_optimiser.step()
        self.actor_scheduler.step()
Could anyone tell me how to solve this problem? Thanks!!