Fusion Model does not converge

I am training a model in which I fuse features using a weighted average, but the loss is not converging. The model code is given below.

import torch
import torch.nn as nn


class Fusion3D(nn.Module):
    """Fuse two TSDF volumes by weighted-averaging their learned features.

    A shared, densely concatenated encoder turns each single-channel volume
    into an 81-channel feature volume (1 -> 3 -> 9 -> 27 -> 81 channels).
    The two feature volumes are averaged using the supplied weights, reduced
    back to one channel by four decoder blocks (81 -> 27 -> 9 -> 3 -> 1) and
    squashed with ``Tanh``.

    Args:
        bn: If true (default), every convolution in the encoder/decoder
            blocks is followed by ``BatchNorm3d`` and has no bias. If false,
            the BatchNorm layers are omitted and the convolutions use a bias
            instead.
    """

    def __init__(self, bn=True):
        super().__init__()

        # Must be set before the blocks are built: the block builders read it.
        self.bn = bool(bn)

        # Each encoder doubles the channels; feature_extractor() then
        # concatenates the block input, so channels triple at every stage.
        self.block1 = self.encoder(in_channels=1)  # Feature: input = 1, output = 2 + 1 = 3
        self.block2 = self.encoder(in_channels=3)  # Feature: input = 2 + 1, output = 9
        self.block3 = self.encoder(in_channels=9)  # Feature: input = 6 + 3, output = 27
        self.block4 = self.encoder(in_channels=27)  # Feature: input = 18 + 9, output = 81

        # feature reducer block
        self.block5 = self.decoder(81)  # Feature: input = 81, output = 27
        self.block6 = self.decoder(27)  # Feature: input = 27, output = 9
        self.block7 = self.decoder(9)  # Feature: input = 9, output = 3
        self.block8 = self.decoder(3)  # Feature: input = 3, output = 1

        self.final_tsdf = self.final_tsdf_decoder(1)  # Feature: input = 1, output = 1 (global tsdf)

    def _conv_unit(self, in_channels, out_channels, kernel_size):
        """Return the layers of one conv -> (BatchNorm) -> LeakyReLU unit."""
        layers = [
            nn.Conv3d(
                in_channels,
                out_channels,
                kernel_size=kernel_size,
                stride=1,
                padding=kernel_size // 2,  # keeps the spatial size for odd kernels
                # A bias is redundant when BatchNorm follows; needed otherwise.
                bias=not self.bn,
            )
        ]
        if self.bn:
            layers.append(nn.BatchNorm3d(out_channels))
        layers.append(nn.LeakyReLU(0.15))
        return layers

    def encoder(self, in_channels):
        """Build an encoder block mapping ``in_channels`` to ``2 * in_channels``."""
        return nn.Sequential(
            *self._conv_unit(in_channels, in_channels, kernel_size=3),
            *self._conv_unit(in_channels, in_channels * 2, kernel_size=3),
        )

    def decoder(self, in_channels):
        """Build a decoder block mapping ``in_channels`` to ``in_channels // 3``."""
        reduced = in_channels // 3
        return nn.Sequential(
            *self._conv_unit(in_channels, reduced, kernel_size=3),
            *self._conv_unit(reduced, reduced, kernel_size=1),
        )

    def final_tsdf_decoder(self, in_channels):
        """Build the output head: a 1x1x1 convolution followed by ``Tanh``."""
        return nn.Sequential(
            nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0),
            nn.Tanh(),
        )

    def feature_extractor(self, tsdf_vol):
        """Encode a single-channel volume into an 81-channel feature volume.

        After every encoder block the block's input is concatenated to its
        output along the channel dimension (dim=1).
        """
        features = tsdf_vol
        for block in (self.block1, self.block2, self.block3, self.block4):
            features = torch.cat([block(features), features], dim=1)

        return features

    def forward(self, input_tsdf_vol, input_tsdf_weight, global_tsdf_vol, global_tsdf_weights):
        """Fuse the input volume into the global volume.

        Args:
            input_tsdf_vol: New TSDF volume; must have one channel,
                i.e. shape (N, 1, D, H, W).
            input_tsdf_weight: Weight of the new volume; must broadcast
                against the (N, 81, D, H, W) feature volume.
            global_tsdf_vol: Current global TSDF volume, same layout as
                ``input_tsdf_vol``.
            global_tsdf_weights: Weight of the global volume, same layout as
                ``input_tsdf_weight``.

        Returns:
            Tuple ``(tsdf, total_weight)``: the fused single-channel volume
            (Tanh output) and the sum of the two weights.
        """
        # The same encoder (shared parameters) is applied to both volumes.
        input_tsdf_feat = self.feature_extractor(input_tsdf_vol)
        global_tsdf_feat = self.feature_extractor(global_tsdf_vol)

        total_weight = input_tsdf_weight + global_tsdf_weights
        weighted_sum = input_tsdf_feat * input_tsdf_weight + global_tsdf_feat * global_tsdf_weights
        # The epsilon avoids NaN where both weights are zero.
        accumulated_features = weighted_sum / (total_weight + 1e-6)

        d1 = self.block5(accumulated_features)
        d2 = self.block6(d1)
        d3 = self.block7(d2)
        d4 = self.block8(d3)

        tsdf = self.final_tsdf(d4)
        return tsdf, total_weight


if __name__ == "__main__":
    # Smoke test: feed one random volume as every argument and print the
    # shapes of the fused TSDF and the accumulated weight.
    volume = torch.rand((1, 1, 25, 29, 21))
    model = Fusion3D(bn=True)
    fused_tsdf, fused_weight = model(volume, volume, volume, volume)
    for result in (fused_tsdf, fused_weight):
        print(result.shape)

Each batch contains multiple data items, which I fuse iteratively while also backpropagating the loss. In other words, at each step I fuse the current data at index k with the fused result of all data up to index k-1. Because of this incremental fusion, I use a batch size of 1 and update the weights only after 8 epochs by accumulating the gradients. I can see the loss decreasing within one batch, but when a new batch starts it jumps suddenly and then decreases again, and this pattern continues. Also, my input and output data are 3D volumes whose values are dominated by 1, with a small fraction lying between -1 and 1. I am using the L1 loss function for training, and I am training the model in an end-to-end fashion.