Loss is too high and accuracy is below 20% for semi-supervised classification

I am implementing graph-based semi-supervised classification from the paper by [Kipf and Welling](https://arxiv.org/pdf/1609.02907.pdf).
I am trying to implement graph convolutions. I am using log_softmax as the activation function in the output layer and nll_loss as the loss function. The loss is too high and the outputs are biased toward a single class. My dataset has 7 classes.
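
As I understand it, the layer-wise propagation rule from the paper (the renormalisation trick, with $\tilde{A} = A + I_N$ and $\tilde{D}$ its degree matrix) is:

$$H^{(l+1)} = \sigma\left(\tilde{D}^{-1/2}\,\tilde{A}\,\tilde{D}^{-1/2}\,H^{(l)}\,W^{(l)}\right)$$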

This is the model I have written:

    import numpy as np
    import networkx as nx
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class Graph_convloution(nn.Module):
        def __init__(self, Vertices, Edges, num_classes):
            super(Graph_convloution, self).__init__()
            self.vertices = Vertices
            self.edges = Edges
            self.graph = self.compute_graph(self.vertices, self.edges)
            self.adj = torch.tensor(nx.to_numpy_matrix(self.graph), dtype=torch.double)
            self.a_tilda = self.A_tilda(self.adj)            # A + I
            self.a_cap = self.A_cap_mean_rule(self.a_tilda)  # D^-1 (A + I)
            self.GClayer1 = GClayer(1434, len(Vertices))
            self.GClayer2 = GClayer(len(Vertices), num_classes)
            self.relu = nn.ReLU()
            self.sigmoid = nn.Sigmoid()

        def compute_graph(self, v, e):
            G = nx.Graph()
            G.add_nodes_from(v)
            G.add_edges_from(e)
            return G

        def A_tilda(self, a):
            # add self-loops: A_tilde = A + I
            I = torch.eye(len(self.vertices), dtype=torch.double)
            return a + I

        def A_cap_mean_rule(self, A):
            # mean rule: row-normalise with the inverse degree matrix, D^-1 A_tilde
            degree = torch.sum(A, 1) ** -1
            n = len(degree)
            D = np.zeros((n, n))
            np.fill_diagonal(D, degree)
            # the normalised adjacency is a constant, so it does not need requires_grad
            return torch.matmul(torch.tensor(D, dtype=torch.float64), A)

        def A_cap_spectral_rule(self, A):
            # spectral rule: symmetric normalisation, D^-1/2 A_tilde D^-1/2
            degree = np.sum(A.numpy(), axis=1, dtype=np.double)
            temp = np.sqrt(degree) ** -1
            n = len(temp)
            D = np.zeros((n, n))
            np.fill_diagonal(D, temp)
            D = torch.tensor(D, dtype=torch.float64)
            return torch.matmul(D, torch.matmul(A, D))

        def forward(self, x):
            x = self.GClayer1(x, self.a_cap)
            x = self.relu(x)
            x = self.GClayer2(x, self.a_cap)
            probs = F.softmax(x, dim=1)
            x = F.log_softmax(x, dim=1)
            return x, probs

This is the code for training:

    labeled_len = 140

    def train(model, label_idx, vertices, edges, labels, optimizer, num_epochs, features, num_classes):
        labels = torch.tensor(labels)
        for epoch in range(num_epochs):
            outputs, probs = model(features)  # features here are sparse matrices

            # keep only the outputs of the labelled nodes for the loss
            output = torch.zeros((labeled_len, 7))
            j = 0
            for i in label_idx:
                output[j] = outputs[i]
                j += 1

            if epoch == num_epochs - 1:
                prediction = outputs.argmax(dim=1)
                print(prediction)

            loss = F.nll_loss(output, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(f'Training loss at epoch {epoch+1}:\t{loss.item()}')

Thanks in advance.

I have since found out that the loss was too high because of the random weight initialisation: the initial weight values were too large. The loss is now low and decreasing after every epoch, but to my dismay all the predictions are still restricted to a single class in each iteration.
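
One way to keep the initial weights small is Glorot (Xavier) initialisation, which is what the paper uses. Below is a simplified sketch of a GClayer along those lines; since I did not post my layer code above, the weight shape and call signature here are only assumed to mirror how the model uses it:

    class GClayer(nn.Module):
        def __init__(self, in_features, out_features):
            super(GClayer, self).__init__()
            # weight shape assumed from GClayer(1434, len(Vertices)) in the model above
            self.weight = nn.Parameter(torch.empty(in_features, out_features, dtype=torch.double))
            # Glorot/Xavier initialisation keeps the initial weights small
            nn.init.xavier_uniform_(self.weight)

        def forward(self, x, a_cap):
            # one graph convolution step: A_hat @ X @ W
            return torch.matmul(a_cap, torch.matmul(x, self.weight))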

I am using log_softmax and nll_loss as the activation function and loss function respectively.
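
As a sanity check, log_softmax followed by nll_loss should be numerically equivalent to applying cross_entropy directly to the raw outputs of the last layer, so the pairing itself should be fine (`logits` and `labels` below are just placeholder names for the last-layer outputs and the target classes):

    # logits: raw outputs of the last GC layer (before softmax/log_softmax)
    loss_a = F.nll_loss(F.log_softmax(logits, dim=1), labels)
    loss_b = F.cross_entropy(logits, labels)  # should match loss_a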