Net stops training

Hello!
Please help me figure this out. I sketched a simple network and have been trying different configurations with 60 * 5 inputs. None of the models I get learn, or, if I’m very lucky, they stop learning after a few steps. I compared the network parameters at different steps - they are identical. The total weight is also unchanged.

class NeuralNetwork(torch.nn.Module):
    """Stack of Linear + Sigmoid layers bundled with its own SGD optimizer.

    The wrapped ``torch.nn.Sequential`` is stored in ``self.net``. On
    construction the network is restored from ``"data/" + self.name`` when
    that file can be loaded; otherwise it is built from ``dimensions`` and
    initialised by ``init_weights``.
    """

    def crutch(self):
        """Print, per parameter tensor, whether it equals the last snapshot.

        ``True`` means the tensor did not change since the previous call (or
        since construction). The snapshot in ``self.before`` is then replaced
        by the current values.
        """
        after = [param.detach().clone().cpu() for param in self.net.parameters()]
        for old, new in zip(self.before, after):
            print(torch.equal(old, new))
        print(" ")
        self.before = after

    def save(self):
        """Serialize the whole ``self.net`` module to ``"data/" + self.name``."""
        torch.save(self.net, "data/" + self.name)

    def init_weights(self, m):
        """Initialise a Linear layer: weights ~ U(-0.5, 0.5), biases zero.

        Modules that are not ``torch.nn.Linear`` are left untouched, so this
        can be passed directly to ``Module.apply``.
        """
        if isinstance(m, torch.nn.Linear):
            with torch.no_grad():
                m.weight.uniform_(-0.5, 0.5)
                m.bias.fill_(0.0)

    def overallWeight(self):
        """Return the sum of all Linear-layer weights (biases excluded) as a float."""
        result = 0.0
        for m in self.net.modules():
            if isinstance(m, torch.nn.Linear):
                result += m.weight.detach().sum().item()
        return result

    def load(self):
        """Try to restore ``self.net`` from ``"data/" + self.name``.

        Returns:
            True when a module was loaded, False otherwise (a message is
            printed and ``self.net`` is left as it was).
        """
        # SECURITY: torch.load unpickles the file; only load files you trust.
        # NOTE(review): recent PyTorch releases default torch.load to
        # weights_only=True, which may reject a fully pickled nn.Sequential;
        # confirm against the installed version.
        try:
            loaded = torch.load(
                "data/" + self.name,
                map_location=getattr(self, "device", None),
            )
            if not isinstance(loaded, torch.nn.Module):
                raise TypeError("expected a torch.nn.Module, got " + type(loaded).__name__)
        except Exception as exc:
            print("cannot load the model. will use the new one")
            # A missing file is the expected first-run case; anything else is
            # worth showing instead of being silently swallowed.
            if not isinstance(exc, FileNotFoundError):
                print("load error: " + repr(exc))
            return False
        self.net = loaded
        return True

    def __init__(self, dimensions, name, lr=0.9, momentum=0.9):
        """Build (or load) the network and its optimizer.

        Args:
            dimensions: layer sizes, e.g. ``[300, 60, 30, 1]``; each
                consecutive pair becomes one Linear layer followed by Sigmoid.
            name: prefix of the checkpoint file name; the sizes are appended
                as ``_<size>`` parts.
            lr: SGD learning rate. The default is kept for backward
                compatibility, but 0.9 combined with momentum can push
                sigmoid units into saturation, where gradients vanish.
            momentum: SGD momentum.
        """
        super(NeuralNetwork, self).__init__()
        # Use the GPU when present, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = name + "".join("_" + str(part) for part in dimensions)

        # NOTE(review): setTrainingDataCollection is not defined in this
        # snippet; presumably it exists in the full class - confirm.
        self.setTrainingDataCollection(False)
        self.dimensions = dimensions

        if not self.load():
            layers = OrderedDict()
            for i, (n_in, n_out) in enumerate(zip(dimensions, dimensions[1:]), start=1):
                layers[str(i)] = torch.nn.Linear(n_in, n_out, True)
                layers[str(i) + '_func'] = torch.nn.Sigmoid()
            self.net = torch.nn.Sequential(layers).to(self.device)
            self.net.apply(self.init_weights)
        self.optimizer = torch.optim.SGD(self.net.parameters(), lr=lr, momentum=momentum)
        self.optimizerCriterion = torch.nn.MSELoss(reduction='none')
        self.before = [param.detach().clone().cpu() for param in self.net.parameters()]

    def forward(self, X):
        """Run ``X`` through the wrapped sequential network."""
        return self.net(X)

    def backward(self, y, o):
        """Do one optimizer step from prediction ``o`` towards target ``y``."""
        self.optimizer.zero_grad()
        loss = self.optimizerCriterion(o, y)
        # reduction='none' gives one loss value per element, and autograd can
        # only start from a scalar, so average before back-propagating. For a
        # single-element loss the mean is that same value.
        loss.mean().backward()
        self.optimizer.step()

    def train(self, X, y):
        """Forward ``X``, update the weights against ``y``, return the output.

        NOTE(review): this overrides ``torch.nn.Module.train(mode)``, so
        ``Module.eval()`` (which calls ``self.train(False)``) will not work on
        this class; the name is kept for existing callers.
        """
        o = self.forward(X.to(self.device))
        self.backward(y.to(self.device), o)
        return o

Calling crutch 1000 iterations apart prints True for each layer, i.e. no parameter has changed.
The value returned by overallWeight does not change either.
I checked the loss - it contains the correct deviation.

I would be grateful for any hint on what I am missing.

After removing the load method (as the checkpoint file was not available to me), the model seems to train, since crutch prints False:

class NeuralNetwork(torch.nn.Module):
    """Stack of Linear + Sigmoid layers bundled with its own SGD optimizer.

    The wrapped ``torch.nn.Sequential`` is stored in ``self.net`` and is
    always built fresh from ``dimensions`` (this version has no ``load``).
    """

    def crutch(self):
        """Print, per parameter tensor, whether it equals the last snapshot.

        ``True`` means the tensor did not change since the previous call (or
        since construction). The snapshot in ``self.before`` is then replaced
        by the current values.
        """
        after = [param.detach().clone().cpu() for param in self.net.parameters()]
        for old, new in zip(self.before, after):
            print(torch.equal(old, new))
        print(" ")
        self.before = after

    def save(self):
        """Serialize the whole ``self.net`` module to ``"data/" + self.name``."""
        torch.save(self.net, "data/" + self.name)

    def init_weights(self, m):
        """Initialise a Linear layer: weights ~ U(-0.5, 0.5), biases zero.

        Modules that are not ``torch.nn.Linear`` are left untouched, so this
        can be passed directly to ``Module.apply``.
        """
        if isinstance(m, torch.nn.Linear):
            with torch.no_grad():
                m.weight.uniform_(-0.5, 0.5)
                m.bias.fill_(0.0)

    def overallWeight(self):
        """Return the sum of all Linear-layer weights (biases excluded) as a float."""
        result = 0.0
        for m in self.net.modules():
            if isinstance(m, torch.nn.Linear):
                result += m.weight.detach().sum().item()
        return result

    def __init__(self, dimensions, name, lr=0.9, momentum=0.9):
        """Build the network and its optimizer.

        Args:
            dimensions: layer sizes, e.g. ``[300, 60, 30, 1]``; each
                consecutive pair becomes one Linear layer followed by Sigmoid.
            name: prefix of the checkpoint file name; the sizes are appended
                as ``_<size>`` parts.
            lr: SGD learning rate. The default is kept for backward
                compatibility, but 0.9 combined with momentum can push
                sigmoid units into saturation, where gradients vanish.
            momentum: SGD momentum.
        """
        super(NeuralNetwork, self).__init__()
        # Use the GPU when present, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = name + "".join("_" + str(part) for part in dimensions)

        self.dimensions = dimensions

        layers = OrderedDict()
        for i, (n_in, n_out) in enumerate(zip(dimensions, dimensions[1:]), start=1):
            layers[str(i)] = torch.nn.Linear(n_in, n_out, True)
            layers[str(i) + '_func'] = torch.nn.Sigmoid()
        self.net = torch.nn.Sequential(layers).to(self.device)
        print(self.net)
        self.net.apply(self.init_weights)
        self.optimizer = torch.optim.SGD(self.net.parameters(), lr=lr, momentum=momentum)
        self.optimizerCriterion = torch.nn.MSELoss(reduction='none')
        self.before = [param.detach().clone().cpu() for param in self.net.parameters()]

    def forward(self, X):
        """Run ``X`` through the wrapped sequential network."""
        return self.net(X)

    def backward(self, y, o):
        """Do one optimizer step from prediction ``o`` towards target ``y``."""
        self.optimizer.zero_grad()
        loss = self.optimizerCriterion(o, y)
        # reduction='none' gives one loss value per element, and autograd can
        # only start from a scalar, so average before back-propagating. For a
        # single-element loss the mean is that same value.
        loss.mean().backward()
        self.optimizer.step()

    def train(self, X, y):
        """Forward ``X``, update the weights against ``y``, return the output.

        NOTE(review): this overrides ``torch.nn.Module.train(mode)``, so
        ``Module.eval()`` (which calls ``self.train(False)``) will not work on
        this class; the name is kept for existing callers.
        """
        o = self.forward(X.to(self.device))
        self.backward(y.to(self.device), o)
        return o


# Minimal reproduction: build a 1-1-1 network, run a single training step on
# one random sample, then report which parameter tensors stayed unchanged.
model = NeuralNetwork(dimensions=[1, 1, 1], name='')
sample_input = torch.randn(1, 1).cuda()
sample_target = torch.randn(1, 1).cuda()
model.train(sample_input, sample_target)
model.crutch()

Unfortunately, the problem gets worse as the net grows. As a random example, [300, 60, 30, 1] stops changing within the first few dozen iterations, and bigger nets stop even sooner. The load method does nothing if there was no earlier save; in that case a new network is simply created.

Based on the description, it seems the issue is not a detached computation graph, which wouldn’t create gradients at all and thus wouldn’t cause any updates, but apparently vanishing gradients, which might shrink to zero over time; that would explain the initial updates.

Could you verify this by checking the gradients in each iteration and printing their maximum absolute magnitude?