I’m trying to write a custom layer for the forward pass, however the weights return NaN values (and the grad values are 0) after the first forward pass. I suspect the trainable weights are not updating on the backward pass, and are somehow getting detached from the computational graph, but I don’t know if this is true. Am I creating the weights and biases correctly?
This is my custom layer:
class MultiGraphCNN(nn.Module):
def __init__(self,
input_dim,
output_dim,
num_filters,
activation=None,
use_bias=True,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(MultiGraphCNN, self).__init__()
self.output_dim = output_dim
self.num_filters = num_filters
self.activation = activation
self.use_bias = use_bias
self.kernel_initializer = kernel_initializer
self.bias_initializer = bias_initializer
self.input_dim = input_dim
#self.input_shape = input_shape
#if self.num_filters != int(input_shape[1][-2]/input_shape[1][-1]):
#raise ValueError('num_filters does not match with graph_conv_filters dimensions.')
#self.input_dim = self.input_shape[0][-1]
# #self.input_dim = 1 #Ensures kernel shape is (2,100) as before
kernel_shape = (self.num_filters * self.input_dim, self.output_dim)
self.kernel = nn.Parameter(torch.empty(kernel_shape), requires_grad=True)
# self.kernel = self.add_weight(shape=kernel_shape,
# initializer=self.kernel_initializer,
# name='kernel',
# regularizer=self.kernel_regularizer,
# constraint=self.kernel_constraint)
if self.use_bias:
self.bias = nn.Parameter(torch.empty(self.output_dim, ))
#nn.init.zeros_(self.bias)
# self.bias = self.add_weight(shape=(self.output_dim,),
# initializer=self.bias_initializer,
# name='bias',
# regularizer=self.bias_regularizer,
# constraint=self.bias_constraint)
else:
self.bias = None
# self.reset_parameters()
#
# def reset_parameters(self) -> None:
# # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
# # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
# # https://github.com/pytorch/pytorch/issues/57109
# nn.init.xavier_uniform_(self.kernel)
# #init.kaiming_uniform_(self.weight, a=math.sqrt(5))
# if self.bias is not None:
# fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.kernel)
# bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
# nn.init.uniform_(self.bias, -bound, bound)
#
# self.built = True
def forward(self, inputs):
#print("X SHAPE", inputs[0].shape)
output = graph_conv_op(inputs[0], self.num_filters, inputs[1], self.kernel)
output = output + self.bias
output = F.elu(output)
# if self.use_bias:
# output = K.bias_add(output, self.bias)
# if self.activation is not None:
# output = self.activation(output)
return output
And the forward pass of my Encoder that uses this layer looks like so:
class Encoder(nn.Module):
def __init__(self, hidden_dim, in_features, out_features,num_filters, graph_conv_filters):
super(Encoder, self).__init__()
self.out_features = out_features
self.hidden_dim = hidden_dim
self.num_filters = num_filters
self.graph_conv_filters = graph_conv_filters
self.in_features = in_features
self.MultiGraphCNN_1 = MultiGraphCNN(input_dim=1, output_dim=100, num_filters=self.num_filters,
activation='elu')
#self.MultiGraphCNN_1.weight = nn.Parameter(torch.empty(2, 100))
#torch.nn.init.xavier_uniform_(self.MultiGraphCNN_1.weight)
#self.MultiGraphCNN_1.bias = nn.Parameter(torch.empty(100, ))
#self.MultiGraphCNN_1.bias.data.fill_(0.01)
self.MultiGraphCNN_2 = MultiGraphCNN(input_dim=100, output_dim=100, num_filters=self.num_filters,
activation='elu')
#self.MultiGraphCNN_2.weight = nn.Parameter(torch.empty(200, 100))
#torch.nn.init.xavier_uniform_(self.MultiGraphCNN_2.weight)
#self.MultiGraphCNN_2.bias = nn.Parameter(torch.empty(100, ))
#self.MultiGraphCNN_2.bias.data.fill_(0.01)
self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.out_features[0])
self.fc2 = nn.Linear(in_features=self.out_features[0], out_features=self.out_features[1])
self.fc_mean = nn.Linear(in_features=self.out_features[1], out_features=self.hidden_dim)
self.fc_var = nn.Linear(in_features=self.out_features[1], out_features=self.hidden_dim)
def sampling(self, args):
"""Reparameterization trick by sampling fr an isotropic unit Gaussian.
# Arguments
args (tensor): mean and log of variance of Q(z|X)
# Returns
z (tensor): sampled latent vector
"""
z_mean, z_log_var = args
batch = z_mean.shape[0]
dim = z_mean.shape[1]
epsilon = torch.normal(mean=0.0, std=1.0, size=(batch, dim))
return z_mean + z_log_var * epsilon
def forward(self, inputs):
#inputs = [Attr_train, graph_conv_filters]
x = self.MultiGraphCNN_1(inputs)
x = nn.Dropout(0.1)(x)
x = self.MultiGraphCNN_2([x, inputs[1]])
x = nn.Dropout(0.1)(x)
x = Lambda(lambda x: torch.mean(x, dim=1))(
x) # adding a node invariant layer to make sure output does not depend upon the node order in a graph.
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
# z_mean = Dense(modelArgs["latent_dim"], name='z_mean')(x)
z_mean = self.fc_mean(x)
# z_log_var = Dense(modelArgs["latent_dim"], name='z_log_var')(x)
z_log_var = self.fc_var(x)
# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(self.sampling)([z_mean, z_log_var])
# z = Lambda(self.sampling, output_shape=(modelArgs["latent_dim"],), name='z')([z_mean, z_log_var])
#latent_inputs = Input(shape=(modelArgs["latent_dim"],), name='z_sampling')
return z, z_mean, z_log_var
The first pass works fine, but after the loss.backward() is applied, I find that self.Encoder.MultiGraphCNN_1.kernel is a tensor of NaNs, while self.Encoder.MultiGraphCNN_1.kernel.grad is a tensor of zeros. Any help would be really appreciated. Thank you