Constant Prediction in CNN

Ok, so I have a model to predict the class of an image, cat or dog. I get 95% accuracy in training. But for some reason, I'm stuck with a constant output when I try to predict a single image.

I read similar topics on the forum, but they haven't contributed much in my case.

Below you can find all the info. Please help ^^

What I tried so far:

  • Changing epochs from 5 to 20.
  • Changing lr from 0.001 to 0.01 and 0.0001.
  • Implementing the model with both dropout regularization and batch normalization.
  • Changing the test pictures.
  • I am thinking of switching the input images to color, in case the problem is somehow the range of values; but that would be a different model as well, and I want to know what the problem is in this one.
  • Changing the last activation layer to torch.sigmoid.
  • Reducing the batch size to 30…
    Still no change…

Info:

Dataset : https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip

Model:

import os
import cv2
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import matplotlib.pyplot as plt
from matplotlib import style

MODEL_NAME = f"model-{time.asctime()}"  # gives a dynamic model name, to help keep things from getting messy over time

REBUILD_DATA = False  # set to True to build the training data once, then back to False unless you want to change something in it

IMG_SIZE = 50
PATH = "model/model.pt"
BATCH_SIZE = 100
EPOCHS = 20

class DogsVSCats():
    IMG_SIZE = 50
    CATS = "PetImages/Cat"
    DOGS = "PetImages/Dog"
    TESTING = "PetImages/Testing"
    LABELS = {CATS: 0, DOGS: 1}
    training_data = []

    catcount = 0
    dogcount = 0

    def make_training_data(self):
        for label in self.LABELS:
            print(label)
            for f in tqdm(os.listdir(label)):
                if "jpg" in f:
                    try:
                        path = os.path.join(label, f)
                        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                        img = cv2.resize(img, (self.IMG_SIZE, self.IMG_SIZE))
                        self.training_data.append([np.array(img), np.eye(2)[
                            self.LABELS[label]]])  # np.eye(2)[i] builds a one-hot label, e.g. np.eye(2)[1] -> [0., 1.]

                        if label == self.CATS:
                            self.catcount += 1
                        elif label == self.DOGS:
                            self.dogcount += 1

                    except Exception as e:
                        pass  # skip unreadable/corrupt images
                        # print(label, f, str(e))

        np.random.shuffle(self.training_data)
        np.save("training_data.npy", self.training_data)  # saved as an object array; load with allow_pickle=True
        print('Cats:', self.catcount)
        print('Dogs:', self.dogcount)



class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
#        self.bn1 = nn.BatchNorm2d(num_features=32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv2d(32, 64, 5)
#        self.bn2 = nn.BatchNorm2d(num_features=64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv3 = nn.Conv2d(64, 128, 5)

        x = torch.randn(50, 50).view(-1, 1, 50, 50)
        self._to_linear = None
        self.convs(x)
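        # the dummy pass above computes self._to_linear, the flattened size of the conv output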

        self.fc1 = nn.Linear(self._to_linear, 512)
        self.bn1 = nn.BatchNorm1d(num_features=512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.fc2 = nn.Linear(512, 2)
        self.dropout = nn.AlphaDropout(p=0.3)

    def convs(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))

        if self._to_linear is None:
            self._to_linear = x[0].shape[0] * x[0].shape[1] * x[0].shape[2]
        return x


    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.fc2(x)
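        # note: dropout here is applied to the final logits, right before the softmax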
        x = self.dropout(x)
        return F.softmax(x, dim=1)

if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc.
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

net = Net().to(device)

if REBUILD_DATA:
    dogsvcats = DogsVSCats()
    dogsvcats.make_training_data()

training_data = np.load("training_data.npy", allow_pickle=True)
print(len(training_data))

optimizer = optim.Adam(net.parameters(), lr=0.01)
loss_function = nn.MSELoss()
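# note: MSE loss on softmax outputs with one-hot targets; nn.CrossEntropyLoss on raw logits is the more common choice for classification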

X = torch.Tensor([i[0] for i in training_data]).view(-1, 50, 50)
X = X / 255.0
y = torch.Tensor([i[1] for i in training_data])

VAL_PCT = 0.1
val_size = int(len(X) * VAL_PCT)
print(val_size)

train_X = X[:-val_size]
train_y = y[:-val_size]

test_X = X[-val_size:]
test_y = y[-val_size:]

print(len(train_X))
print(len(test_X))


def fwd_pass(X_, y_, train_=False):
    if train_:
        net.zero_grad()

    outputs = net(X_)
    matches = [torch.argmax(i) == torch.argmax(j) for i, j in zip(outputs, y_)]
    acc = matches.count(True) / len(matches)
    loss = loss_function(outputs, y_)

    if train_:
        loss.backward()
        optimizer.step()

    return acc, loss


style.use("ggplot")

print(MODEL_NAME)


def train(net_):
    with open("model_4.log", "a") as f:
        for epoch in range(EPOCHS):
            for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
                batch_X = train_X[i:i + BATCH_SIZE].view(-1, 1, 50, 50)
                batch_y = train_y[i:i + BATCH_SIZE]

                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                acc, loss = fwd_pass(batch_X, batch_y, train_=True)

                # print(f"Acc: {round(float(acc),2)}  Loss: {round(float(loss),4)}")
                # f.write(f"{MODEL_NAME},{round(time.time(),3)},train,{round(float(acc),2)},{round(float(loss),4)}\n")
                # just to show the above working, and then get out:
                if i % 50 == 0:
                    val_acc, val_loss = test(size=100)
                    f.write(
                        f"{MODEL_NAME},{round(time.time(), 3)},{round(float(acc), 2)},{round(float(loss), 4)},"
                        f"{round(float(val_acc), 2)},{round(float(val_loss), 4)},{epoch}\n")
                    torch.save({
                        'dropout cnn model': MODEL_NAME,
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    }, PATH)

def test(size=32):
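    # note: net.eval() is never called here, so dropout and batch norm stay in training mode during evaluation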
    X, y = test_X[:size], test_y[:size]
    val_acc, val_loss = fwd_pass(X.view(-1, 1, 50, 50).to(device), y.to(device))
    return val_acc, val_loss


def create_acc_loss_graph(model_name):
    contents = open("model_4.log", "r").read().split("\n")

    times = []
    accuracies = []
    losses = []

    val_accs = []
    val_losses = []

    for c in contents:
        if model_name in c:
            name, timestamp, acc, loss, val_acc, val_loss, epoch = c.split(",")

            times.append(float(timestamp))
            accuracies.append(float(acc))
            losses.append(float(loss))

            val_accs.append(float(val_acc))
            val_losses.append(float(val_loss))

    fig = plt.figure()

    ax1 = plt.subplot2grid((2, 1), (0, 0))
    ax2 = plt.subplot2grid((2, 1), (1, 0), sharex=ax1)

    ax1.plot(times, accuracies, label="acc")
    ax1.plot(times, val_accs, label="val_acc")
    ax1.legend(loc=2)
    ax2.plot(times, losses, label="loss")
    ax2.plot(times, val_losses, label="val_loss")
    ax2.legend(loc=2)
    plt.show()

Replies:

Hi,

This is a little strange, because you are getting desirable outputs while training the model, and it only happens at the testing stage. So, literally, if you passed test images during training but intentionally zeroed the gathered gradients before updating weights, you should get correct predictions. Because of that, I think you might have implemented passing values to the test function wrong, or it's something concerning the code rather than the math/logic.

Try not to use your current code after the training stage; just input a single-instance batch once from the training set and another from the test set:

sample = ...  # get a single-instance batch
model.eval()
with torch.no_grad():
    output = model(sample)

And check if the values are reasonable.

Best

CP = torch.load(PATH)
net.load_state_dict(CP['model_state_dict'])
optimizer.load_state_dict(CP['optimizer_state_dict'])
net.eval()

sample = train_X[0:0 + BATCH_SIZE].view(-1, 1, 50, 50)
sample = sample.to(device)
with torch.no_grad():
    output = net(sample)
    print(output, "=?", y)

output:

tensor([[2.0367e-11, 1.0000e+00],
        [2.8479e-02, 9.7152e-01],
        [1.4075e-14, 1.0000e+00],
        [9.9997e-01, 3.3164e-05],
        [5.6208e-08, 1.0000e+00],
        [1.0000e+00, 1.1212e-07],
        [1.3695e-03, 9.9863e-01],
        [6.8686e-02, 9.3131e-01],
        [5.6444e-09, 1.0000e+00],
        [1.7498e-04, 9.9983e-01],
        [2.0622e-13, 1.0000e+00],
        [1.8598e-21, 1.0000e+00],
        [8.6885e-01, 1.3115e-01],
        [4.4393e-01, 5.5607e-01],
        [9.9974e-01, 2.5576e-04],
        [3.4825e-09, 1.0000e+00],
        [8.9833e-02, 9.1017e-01],
        [2.6258e-04, 9.9974e-01],
        [9.9997e-01, 2.8542e-05],
        [1.5574e-01, 8.4426e-01],
        [7.8556e-06, 9.9999e-01],
        [3.8838e-05, 9.9996e-01],
        [9.1441e-01, 8.5585e-02],
        [9.9998e-01, 2.0286e-05],
        [3.4435e-04, 9.9966e-01],
        [3.5586e-05, 9.9996e-01],
        [1.3090e-11, 1.0000e+00],
        [3.3178e-06, 1.0000e+00],
        [8.9947e-02, 9.1005e-01],
        [3.9299e-06, 1.0000e+00],
        [1.4769e-01, 8.5231e-01],
        [1.5370e-04, 9.9985e-01],
        [8.2074e-01, 1.7926e-01],
        [9.9904e-01, 9.6462e-04],
        [2.9493e-03, 9.9705e-01],
        [7.6622e-03, 9.9234e-01],
        [4.7667e-01, 5.2333e-01],
        [1.0248e-01, 8.9752e-01],
        [1.2424e-11, 1.0000e+00],
.....

This gives fairly varied outputs, though not very successful ones! When I applied it to images one by one, not in batches, it gave:

tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[1., 0.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')
tensor([[1.0000e+00, 1.7080e-21]], device='cuda:0')
...

However, this is not the effect of with torch.no_grad(); the outputs are the same without it. I notice now that one or two entries in the output list are different, and they are actually different numbers when testing in batches, so I doubt this is because of the code.

Actually, I did not say that it is due to torch.no_grad or anything else; I thought you might be using something in a wrong way. For the first output you provided (train data), here are the predicted labels:

tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])

And it seems OK to me. Although I wanted you to do the same experiment with a small batch of test data too, to see the logits there.

Sorry, tired-head

#path = "D:/projects/Neural_Networks/coursera_v2/coursera/samples/sample_image.JPG"
#print(os.path.exists(path))
CP = torch.load(PATH)
net.load_state_dict(CP['model_state_dict'])
optimizer.load_state_dict(CP['optimizer_state_dict'])
net.eval()

sample = test_X[0:0 + BATCH_SIZE].view(-1, 1, 50, 50)
sample = sample.to(device)
#with torch.no_grad():
output = net(sample)
print(output, "=?", y)

output:

tensor([[0.5262, 0.4738],
        [0.4902, 0.5098],
        [0.5168, 0.4832],
        [0.5221, 0.4779],
        [0.5194, 0.4806],
        [0.5202, 0.4798],
        [0.5314, 0.4686],
        [0.4758, 0.5242],
        [0.5112, 0.4888],
        [0.5211, 0.4789],
        [0.4424, 0.5576],
        [0.5285, 0.4715],
        [0.5028, 0.4972],
        [0.5260, 0.4740],
        [0.5188, 0.4812],
        [0.4901, 0.5099],
        [0.4978, 0.5022],
        [0.5263, 0.4737],
        [0.5042, 0.4958],
        [0.5131, 0.4869],
        [0.4867, 0.5133],
        [0.5095, 0.4905],
        [0.5131, 0.4869],
        [0.5183, 0.4817],
        [0.5271, 0.4729],
        [0.4828, 0.5172],
        [0.3326, 0.6674],
        [0.5277, 0.4723],
        [0.5032, 0.4968],
        [0.5120, 0.4880],
        [0.4935, 0.5065],
        [0.5211, 0.4789],
        [0.5047, 0.4953],
        [0.5116, 0.4884],
        [0.4575, 0.5425],
        [0.4901, 0.5099],
        [0.5274, 0.4726],
        [0.3905, 0.6095],
        [0.5228, 0.4772],
        [0.5022, 0.4978],
        [0.5184, 0.4816],
        [0.4686, 0.5314],
        [0.5269, 0.4731],
        [0.5301, 0.4699],
        [0.3842, 0.6158],
        [0.5019, 0.4981],
        [0.3743, 0.6257],
        [0.5181, 0.4819],
        [0.5249, 0.4751],
        [0.5182, 0.4818],
        [0.5230, 0.4770],
        [0.5179, 0.4821],
        [0.3982, 0.6018],
        [0.3678, 0.6322],
        [0.5241, 0.4759],
        [0.4905, 0.5095],
        [0.5161, 0.4839],
        [0.5165, 0.4835],
        [0.4989, 0.5011],
        [0.4563, 0.5437],
        [0.5252, 0.4748],
        [0.5284, 0.4716],
        [0.4954, 0.5046],
        [0.3056, 0.6944],
        [0.4673, 0.5327],
        [0.3595, 0.6405],
        [0.5051, 0.4949],
        [0.5013, 0.4987],
        [0.5112, 0.4888],
        [0.2500, 0.7500],
        [0.5266, 0.4734],
        [0.5135, 0.4865],
        [0.5213, 0.4787],
        [0.5234, 0.4766],
        [0.5277, 0.4723]], device='cuda:0', grad_fn=<SoftmaxBackward>) =? tensor([[0., 1.],
        [1., 0.],
        [0., 1.],
        ...,
        [0., 1.],
        [1., 0.],
        [1., 0.]])

And the way you got the labels, I guess, is by simply observing which side is greater, am I right?
Then these go like this (as an array):

[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0...]
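
In code, that would be just:

preds = torch.argmax(output, dim=1)  # index of the larger probability in each row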

Where to go from here?
Also, thanks for your interest. ^^

EDIT: with torch.no_grad():

tensor([[0.4923, 0.5077],
        [0.4956, 0.5044],
        [0.4919, 0.5081],
        [0.4925, 0.5075],
        [0.4923, 0.5077],
        [0.4915, 0.5085],
        [0.4930, 0.5070],
        [0.5804, 0.4196],
        [0.4914, 0.5086],
        [0.4954, 0.5046],
        [0.5091, 0.4909],
        [0.4925, 0.5075],
        [0.4925, 0.5075],
        [0.4929, 0.5071],
        [0.4925, 0.5075],
        [0.4914, 0.5086],
        [0.4926, 0.5074],
        [0.4922, 0.5078],
        [0.4933, 0.5067],
        [0.4881, 0.5119],
        [0.4916, 0.5084],
        [0.4918, 0.5082],
        [0.4915, 0.5085],
        [0.4920, 0.5080],
        [0.4923, 0.5077],
        [0.4914, 0.5086],
        [0.5260, 0.4740],
        [0.4927, 0.5073],
        [0.4921, 0.5079],
        [0.4933, 0.5067],
        [0.4920, 0.5080],
        [0.4932, 0.5068],
        [0.4915, 0.5085],
        [0.4943, 0.5057],
        [0.5046, 0.4954],
        [0.4854, 0.5146],
        [0.4970, 0.5030],
        [0.5419, 0.4581],
        [0.4924, 0.5076],
        [0.4933, 0.5067],
        [0.4943, 0.5057],
        [0.4934, 0.5066],
        [0.4925, 0.5075],
        [0.4931, 0.5069],
        [0.5168, 0.4832],
        [0.4916, 0.5084],
        [0.5217, 0.4783],
        [0.4922, 0.5078],
        [0.4937, 0.5063],

Wow, everything is strange to me now.
torch.no_grad() should not change the output, as it just prevents gradients from being tracked; it does not update the weights.
Also, yes, I used argmax to decide which class was chosen, since softmax returns probabilities from the logits.
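
As a quick sanity check (assuming sample is prepared and on the device as before), something like this should show that no_grad only disables gradient tracking, while it is eval() that changes the dropout/batch norm behaviour:

net.eval()  # switches dropout and batch norm to inference behaviour
with torch.no_grad():
    out1 = net(sample)  # computed without building an autograd graph
out2 = net(sample)  # same values in eval mode, just with a grad_fn
print(torch.allclose(out1, out2))  # should print True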

I really cannot figure out why this is happening.
Have you found any solution so far?

No :((. I will try preparing a dataset from one single picture, the same way I prepared mine, and load it, in case that is somehow the issue.

Hey, the problem was that when predicting on a single image, I did not scale the image, which is nothing but a single line of code: x = x / 255.0

Now this works fantastically! Thanks a lot for your instructions too ^^.
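
For anyone else hitting this, a minimal sketch of the fixed single-image prediction (the image path is just a placeholder), mirroring the training preprocessing above:

img = cv2.imread("my_test_image.jpg", cv2.IMREAD_GRAYSCALE)  # placeholder path
img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
x = torch.Tensor(img).view(-1, 1, IMG_SIZE, IMG_SIZE)
x = x / 255.0  # the missing scaling step that caused the constant output
x = x.to(device)

net.eval()
with torch.no_grad():
    output = net(x)
    print(torch.argmax(output, dim=1))  # 0 = cat, 1 = dog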


Great!
So I was partially wrong in thinking that you had some coding problem, and partially right, because the model was training just fine and the only issue was the way you tested.

But in your first post you did scale your input data X. Am I wrong?


Yes, I did.

I did not scale when I applied the model to one image; that's why the batch outputs were fine. Thx ^^