Why are losses not stable for the densenet implemented?

Hi I am using a time series data classification using densenet with two classes 0/1. The accuracy is 56% on test, 51% training and 48% on test for 100 epochs. following is the densenet architecture with 12 layers :

class BasicBlock(nn.Module):
    
    def __init__(self, in_planes, out_planes, dropRate=0.0):
        super(BasicBlock, self).__init__()
        self.bn1 = nn.BatchNorm1d(in_planes)
        self.relu = nn.SELU(inplace=True)
        self.conv1 = nn.Conv1d(in_planes, out_planes, kernel_size=3, stride=1,
                               padding=1, bias=True)##bias=False)
        self.droprate = dropRate
        
    def forward(self, x):
        out = self.conv1(self.relu(self.bn1(x)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, training=self.training)
        return torch.cat([x, out], 1) 

class BottleneckBlock(nn.Module):
    
    def __init__(self, in_planes, out_planes, dropRate=0.0):
        super(BottleneckBlock, self).__init__()
        inter_planes = out_planes * 4
        self.bn1 = nn.BatchNorm1d(in_planes)
        self.relu = nn.SELU(inplace=True)
        self.conv1 = nn.Conv1d(in_planes, inter_planes, kernel_size=1, stride=1,
                               padding=0, bias = True)##bias=False)
        self.bn2 = nn.BatchNorm1d(inter_planes)
        self.conv2 = nn.Conv1d(inter_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=True)

        self.droprate = dropRate
        
    def forward(self, x):
        out = self.conv1(self.relu(self.bn1(x)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, inplace=False, training=self.training)
        out = self.conv2(self.relu(self.bn2(out)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, inplace=False, training=self.training)
        return torch.cat([x, out], 1)

class TransitionBlock(nn.Module):
    
    def __init__(self, in_planes, out_planes, dropRate=0.0):
        super(TransitionBlock, self).__init__()
        self.bn1 = nn.BatchNorm1d(in_planes)
        ##self.relu = nn.ReLU(inplace=True)
        self.relu = nn.SELU(inplace=True)
        self.conv1 = nn.Conv1d(in_planes, out_planes, kernel_size=1, stride=1,
                               padding=0, bias=True)
        self.droprate = dropRate
        
    def forward(self, x):
        out = self.conv1(self.relu(self.bn1(x)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, inplace=False, training=self.training)
        return F.avg_pool1d(out, 2)

class DenseBlock(nn.Module):
    def __init__(self, nb_layers, in_planes, growth_rate, block, dropRate=0.0):
        super(DenseBlock, self).__init__()
        self.layer = self._make_layer(block, in_planes, growth_rate, nb_layers, dropRate)
    def _make_layer(self, block, in_planes, growth_rate, nb_layers, dropRate):
        layers = []
        for i in range(nb_layers):
            layers.append(block(in_planes+i*growth_rate, growth_rate, dropRate))
        return nn.Sequential(*layers)
    def forward(self, x):
        return self.layer(x)


class DenseNet(nn.Module): 
    def __init__(self, depth, num_classes, growth_rate=12,
                 reduction=0.5, bottleneck=True, dropRate=0.0): #reduction = 0.5 is compression and used 1x1 before 3x3
        #therefore, Dense-BC architecture it is, k:growth_rate = 12  
        super(DenseNet, self).__init__()
        in_planes = 2 * growth_rate
        n = (depth - 4) / 3
        if bottleneck == True:
            n = n/2
            block = BottleneckBlock
        else:
            block = BasicBlock
        n = int(n)
        print('n:',n)
        # 1st conv before any dense block
        #self.conv1 = nn.Conv1d(3, in_planes, kernel_size=3, stride=1,
        #                       padding=1, bias=False)
        self.conv1 = nn.Conv1d(1, in_planes, kernel_size=3, stride=1,
                               padding=1, bias=True)##bias=False)
        # 1st block
        self.block1 = DenseBlock(n, in_planes, growth_rate, block, dropRate)
        in_planes = int(in_planes+n*growth_rate)
        self.trans1 = TransitionBlock(in_planes, int(math.floor(in_planes*reduction)), dropRate=dropRate)
        in_planes = int(math.floor(in_planes*reduction))
        # 2nd block
        self.block2 = DenseBlock(n, in_planes, growth_rate, block, dropRate)
        in_planes = int(in_planes+n*growth_rate)
        ##self.trans2 = TransitionBlock(in_planes, int(math.floor(in_planes*reduction)), dropRate=dropRate)
        ##in_planes = int(math.floor(in_planes*reduction))
        # 3rd block
        ##self.block3 = DenseBlock(n, in_planes, growth_rate, block, dropRate)
        ##in_planes = int(in_planes+n*growth_rate)
        
        # global average pooling and classifier
        self.bn1 = nn.BatchNorm1d(in_planes)
        self.relu = nn.SELU(inplace=True)
        self.in_planes = in_planes

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.bias.data.zero_()
    
    def getInPlanes(self):
        return self.in_planes
    
    
    def forward(self, x):
        out = self.conv1(x)
        out = self.block1(out)
        out = self.trans1(out)
        out = self.block2(out)
        out = self.trans2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        out = F.avg_pool1d(out, 8)#(input,kernel_size)
        ##out = F.adaptive_avg_pool1d(out,8)
        out = out.view(-1, self.in_planes)
        return out


class Network(nn.Module):   
    def __init__(self,depth = 40,num_classes=2,growth_rate = 12,dropRate = 0.0):
        super(Network, self).__init__()
        print('depth:',depth)
        self.DenseNet1=DenseNet(depth = depth,num_classes=num_classes,growth_rate = growth_rate,dropRate=dropRate)#3,num_classes=2)
        self.DenseNet2=DenseNet(depth = depth,num_classes=num_classes,growth_rate = growth_rate,dropRate=dropRate)
        self.DenseNet3=DenseNet(depth = depth,num_classes=num_classes,growth_rate = growth_rate,dropRate=dropRate)
        self.DenseNet4=DenseNet(depth = depth,num_classes=num_classes,growth_rate = growth_rate,dropRate=dropRate)
        self.DenseNet5=DenseNet(depth = depth,num_classes=num_classes,growth_rate = growth_rate,dropRate=dropRate)
        self.in_planes = self.DenseNet1.getInPlanes()
        #print('in_planes : ',self.in_planes)
        self.fc = nn.Linear(self.in_planes*5, num_classes)
        self.softmax=nn.Softmax()
        
        
    def forward(self,x):
        batch_size, no_of_series, no_of_days = x.shape
        
        close=np.reshape(x[:,0],(batch_size,1,no_of_days))
        out1=self.DenseNet1(close)
        #print('*******************************************************')
        #print('out1 size : ',out1.shape)
        
        _open = np.reshape(x[:,1],(batch_size,1,no_of_days))
        out2=self.DenseNet2(_open)
        
        high = np.reshape(x[:,2],(batch_size,1,no_of_days))
        out3=self.DenseNet3(high)
        
        low = np.reshape(x[:,3],(batch_size,1,no_of_days))
        out4=self.DenseNet4(low)
        
        volume = np.reshape(x[:,4],(batch_size,1,no_of_days))
        out5=self.DenseNet5(volume)
        gp = torch.stack((out1,out2,out3,out4,out5),1)
        gp = gp.view(gp.size(0), -1)
        #print("*******************stacked tensors***************")
        #print(gp)
        ##print('stacked tensor\'s shape : ', gp.shape)
        x=self.fc(gp)
        ##return x 
        y=self.softmax(x)
        return y


def train_model(epoch, model, optimizer, train_loader,cuda):
    model.train()
    t0 = time.time()
    #test_loss = 0
    correct=0
    equals=0
    total_correct=0
    final_loss = 0
    for batch_idx, (X,labels) in enumerate(train_loader):
        Data,labels = map(lambda x: Variable(x), [X,labels])
        optimizer.zero_grad()
        data=Data
        output = model(data)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('device : ', device)
        final_output=output.to(device)
        #print(labels.size(0))
        num_samples = labels.size(0)
        ##print('predicted labels',final_output.squeeze())
        ##print('Actual labels',labels.squeeze())
        loss = F.cross_entropy(final_output, labels, size_average=False)
        final_loss += loss
        ##print('Loss : ',loss)
        loss.backward()
        optimizer.step()
        avg_loss = loss / num_samples
        print('Train Epoch: {} Batch: {} [{}/{} ({:.2f}%, time:{:.2f}s)]\tLoss: {:.6f}'.format(
                epoch, batch_idx, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), time.time() - t0,
                avg_loss.data))
        #t0 = time.time()
        pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
        labels = labels.to("cpu")
        correct = pred.eq(labels.data.view_as(pred))
        equals  = torch.sum(correct)
        total_correct+=equals
        t0 = time.time()
    samples=train_loader.dataset.total_samples
    loss /= samples
    final_loss /= samples
    accuracy =  total_correct.item()/samples
    print('Training Accuracy : ',accuracy*100)
    print('Training Loss : ',final_loss.item())
    if epoch%20 == 0 or epoch ==1 or epoch == epochs: 
        epochs_list.append(epoch)
        train_loss.append(final_loss.item())
        train_accuracies.append(accuracy*100)
        path = 'some path'
        torch.save(model,path+str(epoch)+'.pth')
        torch.save(model,path)
        print('model saved')



def validate(epoch,model, validation_loader,optimizer):
    model.eval()
    test_loss = 0
    correct = 0
    equals=0
    total_correct=0
    for batch_id,(X,labels) in enumerate(validation_loader):
        #if cuda:
        #    Data ,labels= map(lambda x:x.cuda(),[X,labels])
        Data,labels = map(lambda x: Variable(x), [X,labels])
        optimizer.zero_grad()
        data=Data
        output = model(data)
        output = output.squeeze()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        final_val_output=output.to(device)
        labels = labels.squeeze()
        test_loss += F.cross_entropy(final_val_output, labels, size_average=False)  # sum up batch loss
        pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
        labels = labels.to("cpu")
        correct = pred.eq(labels.data.view_as(pred))
        equals  = torch.sum(correct)
        total_correct+=equals
    samples=validation_loader.dataset.total_samples
    test_loss /= samples
    accuracy =  total_correct.item()/samples
    if epoch%20 == 0 or epoch ==1 or epoch == epochs: 
        validation_loss.append(test_loss.item())
        val_accuracies.append(accuracy*100)
    
    print('Accuracy : ',accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(
        test_loss, total_correct,samples,100 * accuracy))
    print("==============================================")
    return "{:.4f}%".format(100. * total_correct / samples), accuracy



def test(test_loader):
    path = 'some path'
    print('path:',path)
    model2 = torch.load(path)
    print('model loaded')
    model2.eval()
    correct = 0
    total = 0
    cnt = 0
    with torch.no_grad():
        for data in test_loader:
            cnt+=1
            test_data, labels = data
            outputs = model(test_data)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('cnt:',cnt)
    print('Test Accuracy: {:.4f}%'.format(100 * correct / total))


X_train,Y_train,X_val,Y_val,X_test,Y_test = splitData(feat_wise_data,labels_new)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
epochs=100
batch_size=8
lr=0.0005##0.001##0.0005##0.001##0.0005##0.05#0.001
momentum=0.9
cuda=False
seed=1
log_interval=300
torch.manual_seed(seed)


model=Network(depth = 15,dropRate=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=2, verbose=True,  threshold_mode='abs', threshold=0.01, min_lr=1e-6)



for epoch in range(1, epochs + 1):
    train_model(epoch, model, optimizer, train_loader,cuda=True)
    acc_str, acc = validate(epoch,model,validation_loader,optimizer)
    scheduler.step(acc)
test(test_loader)

My plots for losses are coming out as :

Loss4

unable to understand why is this happening? I have tried changing learning rates, batch size, epochs but nothing has worked, dropout rates. Losses and accuracies are not stable as well as good since there are only two classes and this seems to give random model. Please suggest as I am new to Deep learning and pytorch.