Almost all model parameters become zeros

I’m making a predictor with LSTMCells, but it doesn’t improve no matter what loss function or optimizer I’m using. When I checked the values that my cells receive and what comes out as output, it all seems alright, but I think the problem might be that I’m feeding my model multidimensional data. This is what my model.parameters() looks like after 100 epochs; I’m printing the grad attribute of each parameter:

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]) torch.Size([8, 5])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]) torch.Size([8, 2])
tensor([0., 0., 0., 0., 0., 0., 0., 0.]) torch.Size([8])
tensor([0., 0., 0., 0., 0., 0., 0., 0.]) torch.Size([8])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]) torch.Size([8, 2])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]) torch.Size([8, 2])
tensor([0., 0., 0., 0., 0., 0., 0., 0.]) torch.Size([8])
tensor([0., 0., 0., 0., 0., 0., 0., 0.]) torch.Size([8])
tensor([[     0.0000,    224.0376],
        [     0.0000,    224.9807],
        [     0.0000,    222.3457],
        [     0.0000,    226.0519],
        [     0.0000, 209874.9688]]) torch.Size([5, 2])
tensor([   -224.0376,    -224.9806,    -222.3456,    -226.0518, -209874.8438]) torch.Size([5])

Changing the learning rate only slows down or speeds up the rate at which everything becomes zero.
I’m using a fairly complicated method to feed input into my model:

    def forward(self, x: Tensor, future=0):
        """Run the two stacked LSTM cells over ``x`` and return the prediction(s).

        The last dimension of ``x`` is walked one step at a time. At each step
        the values across the second-to-last dimension are fed to ``lstm1`` as
        one feature row per sample, and ``lstm1``'s hidden state is fed to
        ``lstm2``. Only the state reached after the final step is projected
        through ``linear``; intermediate steps contribute no output.

        Args:
            x: Input whose first dimension is the number of samples and whose
                last dimension is the step axis. Presumably shaped
                ``(n_samples, n_features, n_steps)`` -- TODO confirm against
                the caller.
            future: Number of extra steps to roll forward once the input is
                exhausted. Each prediction is fed back into ``lstm1`` as the
                next input, so ``linear``'s output width has to match
                ``lstm1``'s input width for ``future > 0``.

        Returns:
            The ``1 + future`` predictions concatenated along ``dim=1``.

        Raises:
            ValueError: If ``x`` has no steps along its last dimension.
        """
        if x.size(-1) == 0:
            # With no step to consume there is no hidden state worth
            # projecting; fail here instead of deep inside torch.cat.
            raise ValueError("forward() needs at least one step along the last dimension of x")

        n_samples = x.size(0)
        n_features = x.size(-2)

        def fresh_state():
            # Zero-initialised hidden/cell state, one row per sample.
            return zeros(n_samples, self.n_hidden, dtype=torch.float, device=self.device)

        h_t, c_t = fresh_state(), fresh_state()  # hidden / cell state for lstm1
        h_t2, c_t2 = fresh_state(), fresh_state()  # hidden / cell state for lstm2

        # Slice the step axis once up front instead of re-splitting every
        # channel on every iteration.
        for step in x.unbind(-1):
            # Collapse any leading singleton dimensions so the cell always
            # receives a 2-D (rows, features) batch.
            in_t = step.reshape(-1, n_features)
            h_t, c_t = self.lstm1(in_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))

        # Only the final state is ever emitted, so project it once here.
        output = self.linear(h_t2)
        outputs = [output]

        for _ in range(future):
            # Autoregressive roll-out: the last prediction becomes the input.
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        return torch.cat(outputs, dim=1)

Here is an example of the input: two tensors from around the middle of the for loop (printed in_t):

tensor([[0.9684, 0.9715, 0.9742, 0.9760, 1.2034],
        [0.8043, 0.8114, 0.8204, 0.8042, 0.1811],
        [0.9530, 0.9529, 0.9530, 0.9525, 0.0000],
        ...,
        [1.0169, 1.0169, 1.0169, 1.0169, 0.5000],
        [1.0000, 1.0000, 1.0000, 1.0000, 0.0000],
        [1.0290, 1.0290, 1.0290, 1.0290, 0.0000]])
tensor([[0.9736, 0.9648, 0.9720, 0.9775, 1.9208],
        [0.8140, 0.8232, 0.8164, 0.8061, 0.2725],
        [0.9520, 0.9508, 0.9528, 0.9517, 0.0000],
        ...,
        [1.0169, 1.0169, 1.0169, 1.0169, 0.2500],
        [1.0000, 1.0000, 1.0000, 1.0000, 0.0000],
        [1.0290, 1.0290, 1.0290, 1.0290, 0.0000]])

The loss function evaluates everything quite well from what I was able to test.
And lastly, my train method:

# Fragment of a training method (the enclosing `def` is not shown): builds the
# model, then runs `epochs` passes of SGD over the batches from getSamples().
net = LSTMModule(self.hid, self.device)
        # Cast the model to float; presumably this matches the torch.float
        # states created in forward() -- TODO confirm.
        net.float()
        # Presumably restores previously saved weights when a file named
        # self.name exists -- net.load() is defined elsewhere, verify there.
        if os.path.exists(self.name):
            net.load(self.name)

        # Training setup: mean-squared-error loss, SGD with momentum 0.3.
        lossFun = nn.MSELoss()
        optimizer = optim.SGD(net.parameters(), lr=self.learningRate, momentum=0.3)

        # samples / outputs are iterated in lockstep below, one (input, target)
        # batch pair per optimisation step.
        samples, outputs = self.getSamples(period, tckList)

        # Training
        for epoch in range(epochs):
            # Running loss for this epoch; reset at the start of every epoch.
            totalLoss = 0.0
            for (batchS, batchO) in zip(samples, outputs):
                # The positional True is zero_grad's set_to_none flag: grads
                # are reset to None rather than filled with zeros.
                optimizer.zero_grad(True)
                prediction = net(batchS)
                loss = lossFun(prediction, batchO)
                # NOTE(review): this adds the loss tensor itself, so totalLoss
                # becomes a tensor; loss.item() would keep it a plain float.
                totalLoss += loss
                print(loss)
                # NOTE(review): backward() returns None, so this prints "None";
                # the call is kept for its side effect of computing gradients.
                print(loss.backward())
                optimizer.step()