CNN does not learn and only outputs 0

I am trying to train a CNN to identify arrhythmias in ECG (normal, atrial fibrillation, other, noisy). At first, the loss was reaming the same, I tried changing the loss function and the loss started declining by only .001. I soon found that my net was only outputting zeros.

I am using Google Colab (https://colab.research.google.com/drive/1fR29ffY9iJW-to7fgsjRVmAKo4jtImP1#scrollTo=cl0jdeW9IlUf) - w/ CrossEntropyLoss function.

Here is my code (w/ MSELoss function):

import os
import cv2
import numpy as np
from tqdm import tqdm
from scipy.io import loadmat
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

if torch.cuda.is_available():
 device = torch.device("cuda:0")
 print("Running on GPU -", device )
else:
 device = torch.device("cpu")
 print("Running on CPU -", device )


REBUILD_DATA = True # processing data

class ECG_DATA():
 ECG_LENGTH = 3000
 LABEL_SIZE = 485
 DATA = "physionet.org/files/challenge-2017/1.0.0/training/"
 NORMAL = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-normal" 
 AF = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-af"
 OTHER = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-other"
 NOISY = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-noisy"
 LABELS = {NORMAL: 0, AF: 1, OTHER:2, NOISY: 3}
 trainingData = []
 dataCount = {NORMAL: 0, AF: 0, OTHER: 0, NOISY: 0}

 def make_training_data(self):
   for records in self.LABELS:
     with open(records) as label:
       for ecgFile in tqdm(label):
         ecg = loadmat(self.DATA+ecgFile[:-1]+".mat")["val"][0].tolist()

         if records == self.NOISY:
           #self.zero_padding(ecg)
           for x in range(self.ECG_LENGTH, len(ecg), self.ECG_LENGTH):
             if self.dataCount[records] <= self.LABEL_SIZE and x <= len(ecg):
               self.trainingData.append([np.array(ecg[x-self.ECG_LENGTH:x]), np.eye(len(self.LABELS))[self.LABELS[records]]])
               self.dataCount[records] += 1

         elif self.dataCount[records] <= self.LABEL_SIZE and self.ECG_LENGTH <= len(ecg):
           self.trainingData.append([np.array(ecg[:self.ECG_LENGTH]), np.eye(len(self.LABELS))[self.LABELS[records]]])
           self.dataCount[records] += 1    
   
   print(self.dataCount)
   np.random.shuffle(self.trainingData)
   np.save("training_Data.npy", self.trainingData)
 
 def zero_padding(self, ecg):
   ecg += [0] * (self.ECG_LENGTH-(len(ecg)%self.ECG_LENGTH))
class Net(nn.Module):
   def __init__(self):
       super().__init__() # just run the init of parent class (nn.Module)
       self.conv1 = nn.Conv1d(1, 32, 5) # input is 1 image, 32 output channels, 5x5 kernel / window
       self.conv2 = nn.Conv1d(32, 64, 5) # input is 32, bc the first layer output 32. Then we say the output will be 64 channels, 5x5 kernel / window
       self.conv3 = nn.Conv1d(64, 128, 5)

       x = torch.randn(1,3000).view(-1,1,3000)
       self._to_linear = None
       self.convs(x)

       self.fc1 = nn.Linear(self._to_linear, 512) #flattening.
       self.fc2 = nn.Linear(512, 4) # 512 in, 2 out bc we're doing 2 classes (dog vs cat).

   def convs(self, x):
       x = F.max_pool1d(F.relu(self.conv1(x)), 1) # adjust shape of pooling?
       x = F.max_pool1d(F.relu(self.conv2(x)), 1) # x = F.max_pool1d(F.relu(self.conv1(x)), (2, 2))
       x = F.max_pool1d(F.relu(self.conv3(x)), 1)

       if self._to_linear is None:
           self._to_linear = x[0].shape[0]*x[0].shape[1]
       return x

   def forward(self, x):
       x = self.convs(x)
       x = x.view(-1, self._to_linear)  # .view is reshape ... this flattens X before 
       x = F.relu(self.fc1(x))
       x = self.fc2(x) # bc this is our output layer. No activation here.
       return F.softmax(x, dim=1)


net = Net().to(device)
print(net)

if REBUILD_DATA:
 ECG = ECG_DATA()
 ECG.make_training_data()

training_data = np.load("training_Data.npy", allow_pickle=True)
print(len(training_data))

optimizer = optim.Adam(net.parameters(), lr = 0.01)
loss_function = nn.MSELoss().to(device)

X = torch.Tensor([i[0] for i in training_data])
y = torch.Tensor([i[1] for i in training_data])

VAL_PCT = 0.1
val_size = int(len(X)*VAL_PCT)
print(val_size)

train_X = X[:-val_size]
train_y = y[:-val_size]

test_X = X[-val_size:]
test_y = y[-val_size:]
print(len(train_X), len(test_X))

BATCH_SIZE = 100
EPOCHS = 1
plot = []

for epoch in range(EPOCHS):
   for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
       #print(f"{i}:{i+BATCH_SIZE}")
       batch_X = train_X[i:i+BATCH_SIZE].view(-1,1,3000).to(device)
       batch_y = train_y[i:i+BATCH_SIZE].to(device)

       net.zero_grad()

       outputs = net(batch_X)
       loss = loss_function(outputs, batch_y)
       loss.backward()
       optimizer.step()    # Does the update
   
   plot.append([epoch, float(loss)])
   print(f"\nEpoch: {epoch}. Loss: {loss}")

plot = list(map(list, zip(*plot)))
plt.plot(plot[0], plot[1])

What do you think the problem is @ptrblck?

I would try to use a loss function built for classification, e.g. nn.CrossEntropyLoss.
To do that, just swap this criterion with nn.MSELoss, remove the softmax in your model, and make sure your target tensors contain the class index for each sample.
I’m not sure, how batch_y is defined at the moment, but if it’s one-hour encoded, just call target = torch.argmax(batch_y, 1) and use target to compute the loss.

In the Colab code I used the the nn.CrossEntropyLoss function, but forgot to remove the softmax function. The loss changed when removing the softmax function, but remained the same when testing with multiple epochs. I tried another dataset, and the same thing happened.
The code so far…

import os
import cv2
import numpy as np
from tqdm import tqdm
from scipy.io import loadmat
import torch
import matplotlib.pyplot as plt


if torch.cuda.is_available():
  device = torch.device("cuda:0")
  print("Running on GPU -", device )
else:
  device = torch.device("cpu")
  print("Running on CPU -", device )

# to run on gpu .to(device)

REBUILD_DATA = False # processing data

class ECG_DATA():
  ECG_LENGTH = 3000
  LABEL_SIZE = 485
  DATA = "physionet.org/files/challenge-2017/1.0.0/training/"
  NORMAL = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-normal" 
  AF = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-af"
  OTHER = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-other"
  NOISY = "physionet.org/files/challenge-2017/1.0.0/training/RECORDS-noisy"
  LABELS = {NORMAL: 0, AF: 1, OTHER:2, NOISY: 3}
  trainingData = []
  dataCount = {NORMAL: 0, AF: 0, OTHER: 0, NOISY: 0}

  def make_training_data(self):
    for records in self.LABELS:
      with open(records) as label:
        for ecgFile in tqdm(label):
          ecg = loadmat(self.DATA+ecgFile[:-1]+".mat")["val"][0].tolist()

          
          if records == self.NOISY:
            #self.zero_padding(ecg)
            for x in range(self.ECG_LENGTH, len(ecg), self.ECG_LENGTH):
              if self.dataCount[records] <= self.LABEL_SIZE and x <= len(ecg):
                self.trainingData.append([np.array(ecg[x-self.ECG_LENGTH:x]), self.LABELS[records]])
                self.dataCount[records] += 1

          elif self.dataCount[records] <= self.LABEL_SIZE and self.ECG_LENGTH <= len(ecg):
            self.trainingData.append([np.array(ecg[:self.ECG_LENGTH]), self.LABELS[records]])
            self.dataCount[records] += 1
          
          # try with this is overfitting first
          '''
          if len(ecg)%self.ECG_LENGTH != 0:
            self.zero_padding(ecg)

          for x in range(self.ECG_LENGTH, len(ecg), self.ECG_LENGTH):
            if self.dataCount[records] <= self.LABEL_SIZE:
              self.trainingData.append([np.array(ecg[x-self.ECG_LENGTH:x]), np.eye(len(self.LABELS))[self.LABELS[records]]])
              self.dataCount[records] += 1
            else:
              break
          '''
            
    
    print(self.dataCount)
    np.random.shuffle(self.trainingData)
    np.save("ECG_DATA/training_Data.npy", self.trainingData)
  
  def zero_padding(self, ecg):
    ecg += [0] * (self.ECG_LENGTH-(len(ecg)%self.ECG_LENGTH))

if REBUILD_DATA:
  ECG = ECG_DATA()
  ECG.make_training_data()

training_data = np.load("ECG_DATA/training_Data.npy", allow_pickle=True)
print(len(training_data))

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__() 
        self.conv1 = nn.Conv1d(1, 32, 5) 
        self.conv2 = nn.Conv1d(32, 64, 5) 
        self.conv3 = nn.Conv1d(64, 128, 5)

        x = torch.randn(1, 1,3000).view(-1,1,3000)
        self._to_linear = None
        self.convs(x)

        self.fc1 = nn.Linear(self._to_linear, 512)
        self.fc2 = nn.Linear(512, 4) 

    def convs(self, x):
        x = F.max_pool1d(F.relu(self.conv1(x)), 1) 
        x = F.max_pool1d(F.relu(self.conv2(x)), 1) 
        x = F.max_pool1d(F.relu(self.conv3(x)), 1)

        if self._to_linear is None:
            self._to_linear = x[0].shape[0]*x[0].shape[1]
        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)  
        x = F.relu(self.fc1(x))
        x = self.fc2(x) 
        return x


net = Net().to(device)
print(net)

import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr = 0.01)
loss_function = nn.CrossEntropyLoss().to(device)

X = torch.Tensor([i[0] for i in training_data])
y = torch.Tensor([np.argmax(i[1]) for i in training_data])
 
VAL_PCT = 0.1
val_size = int(len(X)*VAL_PCT)
print(val_size)

train_X = X[:-val_size]
train_y = y[:-val_size]

test_X = X[-val_size:]
test_y = y[-val_size:]
print(y)

import matplotlib.pyplot as plt

BATCH_SIZE = 100
EPOCHS = 5
plot = []

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1,1,3000).to(device)
        batch_y = train_y[i:i+BATCH_SIZE].to(device)
        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y.long())
        
        loss.backward()
        optimizer.step()
    
    plot.append([epoch, float(loss)])
    print(f"\nEpoch: {epoch}. Loss: {loss}")

plot = list(map(list, zip(*plot)))
plt.plot(plot[0], plot[1])

correct = 0
total = 0
with torch.no_grad():
    for i in tqdm(range(len(test_X))):
        real_class = test_y[i].to(device)
        net_out = net(test_X[i].view(-1,1,3000).to(device))[0]  # returns a list
        predicted_class = torch.argmax(net_out)
        if predicted_class == real_class:
            correct += 1
        total += 1
print("Accuracy: ", round(correct/total, 3))

I cannot see anything obviously wrong besides some minor issues:

  • your F.max_pool1d won’t do anything, as you are using a kernel size and stride of 1
  • could you use torch.tensor (lowercase t) to create X and y? Your code should work, but the usage of torch.Tensor is not recommended, as it might return an uninitialized tensor

Could you use a small data sample (e.g. just 10 observations) and try to overfit your model?
If this doesn’t work, there might be some bugs in the code I’m missing.

Thank you, the problem was because I was originally inputing the target classes wrong into the loss function.