Loss convergence failure

I've trained the model for hours, and even much longer, but the loss still doesn't seem to converge. Are the weights getting detached from the graph, breaking the optimization somewhere?

I've set the number of epochs to 10000 here, but I also trained with a much larger number earlier and it didn't seem to make any difference.

import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

n = 1000

# Random dense "adjacency" matrix; the features are the row sums of adj.
adj = torch.randint(high=10, size=(n, n)).float()
features = torch.sum(adj, dim=1).view(-1, 1).float()
adj_comp = torch.randint(low=1, high=100, size=(n,)).float()

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)
        # adj is a dense tensor here, so use a dense matmul
        # (torch.spmm expects a sparse first argument).
        output = torch.mm(adj, support)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

class GCN(nn.Module):
    def __init__(self, nin, nhid1, nhid2, nout):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nin, nhid1)
        self.gc2 = GraphConvolution(nhid1, nhid2)
        self.gc3 = GraphConvolution(nhid2, nout)
        self.dp = nn.Dropout(p=0.5)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.dp(x)
        x = F.relu(self.gc2(x, adj))
        x = self.dp(x)
        # Note: ReLU and dropout are applied to the final (regression) output as well.
        x = F.relu(self.gc3(x, adj))
        x = self.dp(x)
        return torch.sum(x, dim=1)

def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)

    loss_train = criteria(output, adj_comp) 
    loss_train.backward()
    optimizer.step()
    
    if not epoch%1000:
        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'time: {:.4f}s'.format(time.time() - t))


model = GCN(nin=features.shape[1], nhid1=100, nhid2=50, nout=10)

criteria = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.000001)

net_time = time.time()
for epoch in range(10000):
    train(epoch)
    if not epoch%1000:
        print("Net time = {:.4f}s \n".format(time.time() - net_time))

Double post from here.

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

SKX = np.zeros(10000)
for i in range(10000):
    SKX[i] = np.random.randint(100)

SKY = np.zeros(10000)
for i in range(10000):
    SKY[i] = SKX[i] * 5

class NN_new(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(NN_new, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        return self.fc(x.view(-1, 1))

SKnet = NN_new(1,1)

def fit(x, y, net, opt, loss_function, epochs=1000):
    for epoch in range(epochs):
        opt.zero_grad()
        # Use the `net` argument rather than the global SKnet.
        loss = loss_function(net(x.float()), y.float())
        print(epoch, loss.item())
        loss.backward()
        opt.step()

    return loss.item()

loss_function = nn.MSELoss()
opt = optim.RMSprop(SKnet.parameters(), lr = 0.01)
# Reshape the targets to (N, 1) so they match the network output.
fit(torch.tensor(SKX), torch.tensor(SKY).view(-1, 1), SKnet, opt, loss_function)

This model isn't converging either; what could be the problem?
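
One thing I suspect is the scale of the data: the inputs are in [0, 100) and the targets in [0, 500), which is large for an unnormalized MSE. Here is a normalized variant I'm considering (a sketch under that assumption, not a confirmed fix); SKnet2 and opt2 are fresh names I introduced so the comparison stays clean:

# Sketch: rescale inputs and targets before fitting; the divisors are
# simply the known ranges of SKX and SKY, not tuned values.
x = torch.tensor(SKX, dtype=torch.float32) / 100.0
y = torch.tensor(SKY, dtype=torch.float32).view(-1, 1) / 500.0
SKnet2 = NN_new(1, 1)
opt2 = optim.RMSprop(SKnet2.parameters(), lr=0.01)
fit(x, y, SKnet2, opt2, nn.MSELoss(), epochs=1000)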