Training loss does not changed in rgcn

I saw lots of posts about “loss stay same”, however, they don’t solve my problem.
I found that my gradient at the start always was 0. I don’t know what happen. Could you help me out?

# gradient保持为0,且loss一直不变的原因:
# 1.softmax和nn.CrossEntropy冲突, crossentropy本身自带log-softmax   -----> 最有可能的原因,失败
# 2.with g.local_scope():,感觉这个的原因不大
# 3.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 梯度裁剪防止梯度爆炸,在梯度0的情况下没有意义
# 4.学习率和optimizer都进行了调整,失败。学习率调小调大都没有用,optimizer从Adam换成了SGD+momentum也没有用

import pandas as pd

from sklearn.preprocessing import LabelEncoder
import os
os.environ["DGLBACKEND"] = "pytorch"

from functools import partial
# load graph data
import dgl.data
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F


# 读取 Excel 文件
df = pd.read_csv("/data/data02/zhaokai/data/whole_table_0301_0307.csv")

# whole_table_0301_0307.csv
# case_table_0301_0307.csv
text_columns = ['insured_code', 'car_mark', 'assess_dept_code', 'department_code',
                     'check_department_code', 'indemnity_conclusion', 'person_loss_flag',
                     'veh_clas_code', 'client_type', 'accident_cause_level3', 'renewal_type','report_no']
df[text_columns] = df[text_columns].astype(str)
df.dropna(subset=text_columns, inplace=True)


# 将其他列设置为数值类型
numeric_columns = ['car_age','duty_coefficient','insured_value','policy_sum_estimate',
                   'policy_sum_pay','total_agreed_amount'
                   ]

df[numeric_columns] = df[numeric_columns].astype(float).fillna(0)



# label encoder, 把复杂的number编码成12345,更好的进行edge的连接

columns_to_encode = ['insured_code', 'car_mark', 'assess_dept_code', 'department_code',
                     'check_department_code', 'indemnity_conclusion', 'person_loss_flag',
                     'veh_clas_code', 'client_type', 'accident_cause_level3', 'renewal_type','report_no']

for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Group by
# 节点的数量应该要和节点feature的数量对等
# for car
car_age = df.groupby('car_mark')['car_age'].mean()
insured_value = df.groupby('car_mark')['insured_value'].mean() # 数值型
veh_clas_code = df.groupby('car_mark')['veh_clas_code'].first() # 类别型

# for accident
policy_sum_estimate = df.groupby('report_no')['policy_sum_estimate'].mean() # 数值型
policy_sum_pay = df.groupby('report_no')['policy_sum_pay'].mean() # 数值型
duty_coefficient = df.groupby('report_no')['duty_coefficient'].mean() # 数值型
total_agreed_amount = df.groupby('report_no')['total_agreed_amount'].mean() # 数值型

person_loss_flag = df.groupby('report_no')['person_loss_flag'].first() # 类别型
accident_cause_level3 = df.groupby('report_no')['accident_cause_level3'].first() # 类别型

# for insured
client_type = df.groupby('insured_code')['client_type'].first() # 类别型
renewal_type = df.groupby('insured_code')['renewal_type'].first() # 类别型
indemnity_conclusion = df.groupby('insured_code')['indemnity_conclusion'].first() # 类别型



# ------------------------------------------------ KG building --------------------------------------------------------
def build_knowledge_graph(data):
    g = dgl.heterograph({
        ('insured', 'owns', 'car_mark'): (data['insured_code'], data['car_mark']), # 被保险人owns车
        ('insured', 'assessed by', 'assess_dept_code'): (data['insured_code'], data['assess_dept_code']), # 被保险人被assessed
        ('insured', 'insured by', 'department_code'): (data['insured_code'], data['department_code']), # 被保险人被查勘
        ('insured', 'checked by', 'check_department_code'): (data['insured_code'], data['check_department_code']), # 被保险人checked
        ('insured', 'has', 'accident'): (data['insured_code'], data['report_no']), # 被保人报案,进行保险赔付
    })

    # 添加节点特征
    g.nodes['car_mark'].data['car_age'] = torch.tensor(car_age.values) # 车的年龄
    g.nodes['car_mark'].data['insured_value'] = torch.tensor(insured_value.values) # 车的购置价格
    g.nodes['car_mark'].data['veh_clas_code'] = torch.tensor(veh_clas_code.values) # 车辆大类


    g.nodes['accident'].data['policy_sum_estimate'] = torch.tensor(policy_sum_estimate.values) # 保单总预估价格
    g.nodes['accident'].data['policy_sum_pay'] = torch.tensor(policy_sum_pay.values) # 保单总预估价格
    g.nodes['accident'].data['duty_coefficient'] = torch.tensor(duty_coefficient.values) # 责任系数
    g.nodes['accident'].data['person_loss_flag'] = torch.tensor(person_loss_flag.values) # 是否有人伤 # TYPE
    g.nodes['accident'].data['total_agreed_amount'] = torch.tensor(total_agreed_amount.values) # 定损金额(车辆损失的金额)
    g.nodes['accident'].data['accident_cause_level3'] = torch.tensor(accident_cause_level3.values) # 碰撞具体原因 # TYPE

    # 如果客户在不同的时间存在两种状态,怎么办?代码只能cover到很短的时间
    # 因此把这个特征轨道accident里面,每一种accident对应一种客户的状态
    # 把客户的featuers放到accident里面进行处理??
    g.nodes['insured'].data['client_type'] = torch.tensor(client_type.values) # 客户类型
    g.nodes['insured'].data['renewal_type'] = torch.tensor(renewal_type.values) # 是否续保
    g.nodes['insured'].data['label'] = torch.tensor(indemnity_conclusion.values) # 赔付结论 ### LABEL

    return g

# --------------------------------------------- RGCN ------------------------------------------------------------------
class RGCNLayer(nn.Module):
    def __init__(
        self,
        in_feat,
        out_feat,
        num_rels,
        num_bases=-1,
        bias=None,
        activation=None,
        is_input_layer=False,
    ):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.bias = bias
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels
        # weight bases in equation (3)
        self.weight = nn.Parameter(
            torch.Tensor(self.num_bases, self.in_feat, self.out_feat)
        )
        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(
                torch.Tensor(self.num_rels, self.num_bases)
            )
        # add bias
        if self.bias:
            self.bias = nn.Parameter(torch.Tensor(out_feat))
        # init trainable parameters
        nn.init.xavier_uniform_(
            self.weight, gain=nn.init.calculate_gain("relu")
        )
        if self.num_bases < self.num_rels:
            nn.init.xavier_uniform_(
                self.w_comp, gain=nn.init.calculate_gain("relu")
            )
        if self.bias:
            nn.init.xavier_uniform_(
                self.bias, gain=nn.init.calculate_gain("relu")
            )

    def forward(self, g):
        with g.local_scope():
            if self.num_bases < self.num_rels:
                # generate all weights from bases (equation (3))
                weight = self.weight.view(
                    self.in_feat, self.num_bases, self.out_feat
                )
                weight = torch.matmul(self.w_comp, weight).view(
                    self.num_rels, self.in_feat, self.out_feat
                )
            else:
                weight = self.weight
            if self.is_input_layer:

                def message_func(edges):
                    # for input layer, matrix multiply can be converted to be
                    # an embedding lookup using source node id
                    embed = weight.view(-1, self.out_feat)
                    index = edges.data[dgl.ETYPE] * self.in_feat + edges.src["id"]
                    return {"msg": embed[index] * edges.data["norm"]}

            else:

                def message_func(edges):
                    w = weight[edges.data[dgl.ETYPE]]
                    msg = torch.bmm(edges.src["h"].unsqueeze(1), w).squeeze()
                    msg = msg * edges.data["norm"]
                    return {"msg": msg}

            def apply_func(nodes):
                h = nodes.data["h"]
                if self.bias:
                    h = h + self.bias
                if self.activation:
                    h = self.activation(h)
                return {"h": h}

        g.update_all(message_func, fn.sum(msg="msg", out="h"), apply_func)
class Model(nn.Module):
    def __init__(
        self,
        num_nodes,
        h_dim,
        out_dim,
        num_rels,
        num_bases=-1,
        num_hidden_layers=1,
    ):
        super(Model, self).__init__()
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.num_hidden_layers = num_hidden_layers

        # create rgcn layers
        self.build_model()

        # create initial features
        self.features = self.create_features()

    def build_model(self):
        self.layers = nn.ModuleList()
        # input to hidden
        i2h = self.build_input_layer()
        self.layers.append(i2h)
        # hidden to hidden
        for _ in range(self.num_hidden_layers):
            h2h = self.build_hidden_layer()
            self.layers.append(h2h)
        # hidden to output
        h2o = self.build_output_layer()
        self.layers.append(h2o)

    # initialize feature for each node
    def create_features(self):
        features = torch.arange(self.num_nodes)
        return features

    def build_input_layer(self):
        return RGCNLayer(
            self.num_nodes,
            self.h_dim,
            self.num_rels,
            self.num_bases,
            activation=F.relu,
            is_input_layer=True,
        )

    def build_hidden_layer(self):
        return RGCNLayer(
            self.h_dim,
            self.h_dim,
            self.num_rels,
            self.num_bases,
            activation=F.relu,
        )

    def build_output_layer(self):
        return RGCNLayer(
            self.h_dim,
            self.out_dim,
            self.num_rels,
            self.num_bases,
            activation=partial(F.softmax, dim=1),
            # activation=F.relu,
        )

    def forward(self, g):
        if self.features is not None:
            g.ndata["id"] = self.features
        for layer in self.layers:
            layer(g)
        return g.ndata.pop("h")


def create_mask(graph,category):
    n_nodes = graph.number_of_nodes(category)
    n_train = int(n_nodes * 0.6)
    n_val = int(n_nodes * 0.2)
    train_mask = torch.zeros(n_nodes, dtype=torch.uint8)
    val_mask = torch.zeros(n_nodes, dtype=torch.uint8)
    test_mask = torch.zeros(n_nodes, dtype=torch.uint8)
    train_mask[:n_train] = True
    val_mask[n_train: n_train + n_val] = True
    test_mask[n_train + n_val:] = True
    graph.nodes[category].data["train_mask"] = train_mask
    graph.nodes[category].data["val_mask"] = val_mask
    graph.nodes[category].data["test_mask"] = test_mask
    return train_mask, val_mask, test_mask


g = build_knowledge_graph(df)
print(f"Knowledge Graph Structure is: {g}",'\n\n\n')

category = 'insured'
train_mask, val_mask, test_mask = create_mask(g,category)
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
val_idx = torch.nonzero(val_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()

labels = g.nodes[category].data.pop("label")
num_rels = len(g.canonical_etypes)
num_classes = 5 # classification class
# normalization factor
for cetype in g.canonical_etypes:
    g.edges[cetype].data["norm"] = dgl.norm_by_dst(g, cetype).unsqueeze(1)

category_id = g.ntypes.index(category)

# configurations
n_hidden = 16  # number of hidden units
n_bases = -1  # use number of relations as number of bases
n_hidden_layers = 1  # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 20  # epochs to train
lr = 0.01  # learning rate
l2norm = 0  # L2 norm coefficient

# create graph
g = dgl.to_homogeneous(g, edata=["norm"])
node_ids = torch.arange(g.num_nodes())
target_idx = node_ids[g.ndata[dgl.NTYPE] == category_id]

# create model
model = Model(
    g.num_nodes(),
    n_hidden,
    num_classes,
    num_rels,
    num_bases=n_bases,
    num_hidden_layers=n_hidden_layers,
)


def print_grad(grad):
    print(grad)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
        param.register_hook(print_grad)


# optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(),  lr=lr, weight_decay=l2norm)

print("start training...")
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    logits = model.forward(g)
    logits = logits[target_idx]
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    loss.backward()
    # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 梯度裁剪?

    optimizer.step()

    train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
    train_acc = train_acc.item() / len(train_idx)
    val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
    val_acc = val_acc.item() / len(test_idx)
    print(
        "Epoch {:05d} | ".format(epoch)
        + "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
            train_acc, loss.item()
        )
        + "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
            val_acc, val_loss.item()
        )
    )

The accuracy always be 0.8141, the train loss always be 1.6904, the validations are same