Net won't learn - what should I check next?

I’m working with a colleague on a net to classify basic shapes, and we’re stuck: the net won’t learn. I’m attempting to predict among 4 categories (shapes), and the loss consistently converges to 1.38xx, which is exactly what you’d expect from a net that isn’t learning at all when using nn.CrossEntropyLoss(), since -ln(0.25) ≈ 1.386 (i.e. the net is predicting a uniform 25% for each of the 4 classes).

Here is the code to make the dataset:

# 1_make_dataset.py

import os
import numpy as np
import cv2
import random
from tqdm import tqdm

# shape name => class index; note that class index + 3 = the polygon's vertex count
SHAPE_MAP = {"triangle": 0, "square": 1, "pentagon": 2, "hexagon": 3}
# COLOR_MAP: key => color name (string), value => 2-element list, 0th element is the OpenCV (BGR) color, 1th element is the color index
COLOR_MAP = {"red": [(0, 0, 255), 0], "green": [(0, 255, 0), 1], "blue": [(255, 0, 0), 2]}

# number of samples to generate per split
NUM_TRAIN_SAMPS = 50000
NUM_TEST_SAMPS = 10000

# output directories, relative to the current working directory (created if missing)
TRAIN_DATA_DIR_LOC = os.path.join(os.getcwd(), 'train_data')
TEST_DATA_DIR_LOC = os.path.join(os.getcwd(), 'test_data')

def main():
    """Build the synthetic shape dataset: a train split and a test split of
    random shape images, each with a per-image csv ground-truth file."""
    print('\nstarting . . .\n')

    # make sure both output directories exist before writing anything
    for dir_loc in (TRAIN_DATA_DIR_LOC, TEST_DATA_DIR_LOC):
        os.makedirs(dir_loc, exist_ok=True)

    print('\nwriting training data . . .')
    write_images(NUM_TRAIN_SAMPS, TRAIN_DATA_DIR_LOC)
    print('\nwriting test data . . .')
    write_images(NUM_TEST_SAMPS, TEST_DATA_DIR_LOC)

    print('\ndone !!\n')

def write_images(num_samps, dir_loc):
    """Write num_samps random shape samples into dir_loc.

    Sample i is stored as '<i>.jpg' (the image) plus '<i>.csv' (one line of
    "shape,color,center_x,center_y" ground truth).
    """
    for sample_idx in tqdm(range(num_samps)):
        image, shape, color, center_x, center_y = draw_random_shape()

        # image goes to <index>.jpg
        cv2.imwrite(os.path.join(dir_loc, f'{sample_idx}.jpg'), image)

        # matching ground truth goes to <index>.csv
        with open(os.path.join(dir_loc, f'{sample_idx}.csv'), 'w') as f:
            f.write(','.join([shape, color, center_x, center_y]))

def draw_random_shape():
    """Render one random shape on a 224x224 black canvas.

    The shape (triangle/square/pentagon/hexagon) and color (red/green/blue)
    are chosen uniformly at random, the center is placed at least 50 px from
    every edge, and a small white filled circle marks the center.

    Returns (image, shape_name, color_name, center_x_str, center_y_str).
    """
    # NOTE: the call order of the RNGs below is significant for reproducibility
    shape = np.random.choice(list(SHAPE_MAP.keys()))
    color = np.random.choice(list(COLOR_MAP.keys()))

    # black 224x224 BGR canvas
    image = np.zeros((224, 224, 3), dtype=np.uint8)

    # random shape center, kept far enough from the edges to stay on-canvas
    center_x = random.randint(50, 174)
    center_y = random.randint(50, 174)

    # circumradius of the polygon, i.e. distance from the center to each vertex
    # (despite the name used elsewhere, this is not the polygon's side length)
    radius = 40

    # vertex count = class index + 3 (triangle=3 ... hexagon=6); sample
    # num_vertices + 1 evenly spaced angles over [0, 2*pi] and drop the
    # duplicated endpoint
    num_vertices = SHAPE_MAP[shape] + 3
    angles = np.linspace(0, 2 * np.pi, num_vertices + 1)[:-1]
    vertices = np.stack((center_x + np.cos(angles) * radius,
                         center_y + np.sin(angles) * radius), axis=1).astype(int)

    # draw the filled polygon, then a white filled dot marking its center
    cv2.fillPoly(image, pts=[vertices.reshape((-1, 1, 2))], color=COLOR_MAP[color][0])
    image = cv2.circle(image, (center_x, center_y), 10, color=(255, 255, 255), thickness=-1)

    return image, shape, color, str(center_x), str(center_y)

# entry point: run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()

Which produces this dataset of basic shapes that are either a triangle, square, pentagon, or hexagon:

I’ve reviewed the dataset and I can attest it’s well-balanced. The .csv files look like this:

Which is exactly how they should be. The intent of the color classification and object location data is that eventually I’d like to add 2 more heads, one for color classification and one for detecting object location, and perhaps even a 4th head eventually to regress the object width and height, but for now I’m trying to get just the classification working.

Here is the training code:

# 2_train.py

import os
import glob
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import torchvision
import numpy as np
import cv2
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
from termcolor import colored

# shape name => class index (targets for nn.CrossEntropyLoss)
SHAPE_MAP = {"triangle": 0, "square": 1, "pentagon": 2, "hexagon": 3}
# COLOR_MAP: key => color name (string), value => 2-element list, 0th element is the OpenCV (BGR) color, 1th element is the color index
COLOR_MAP = {"red": [(0, 0, 255), 0], "green": [(0, 255, 0), 1], "blue": [(255, 0, 0), 2]}

# training hyper-parameters
NUM_EPOCHS = 10
BATCH_SIZE = 16
LOG_FREQ = 10  # print the loss every LOG_FREQ steps

# dataset directories produced by 1_make_dataset.py
TRAIN_DATA_DIR_LOC = os.path.join(os.getcwd(), 'train_data')
TEST_DATA_DIR_LOC = os.path.join(os.getcwd(), 'test_data')

class ShapeDataset(Dataset):
    """Dataset of shape samples read from a directory of paired <i>.jpg / <i>.csv files."""
    def __init__(self, data_dir_loc):
        self.data_dir_loc = data_dir_loc

        # NOTE(review): glob.glob returns files in arbitrary, OS-dependent order,
        # and the two lists below come from two independent glob calls — nothing
        # guarantees that index i of image_file_locs and index i of csv_file_locs
        # refer to the same sample. Sort both lists (or derive each csv path from
        # its image path) before relying on index-based pairing.
        self.image_file_locs = glob.glob(os.path.join(data_dir_loc, '*.jp*g'))
        self.csv_file_locs = glob.glob(os.path.join(data_dir_loc, '*.csv'))

        # un-comment these lines to try training on only 1 image
        # self.image_file_locs = [self.image_file_locs[0]]
        # self.csv_file_locs = [self.csv_file_locs[0]]

        # un-comment these lines to try training on 10 images
        # self.image_file_locs = self.image_file_locs[0:10]
        # self.csv_file_locs = self.csv_file_locs[0:10]

        # un-comment these lines to try training on 100 images
        # self.image_file_locs = self.image_file_locs[0:100]
        # self.csv_file_locs = self.csv_file_locs[0:100]

        # this only checks the counts match, not that the pairing is correct
        assert len(self.image_file_locs) == len(self.csv_file_locs)
    # end function

    def __len__(self):
        # number of samples = number of image files found
        return len(self.image_file_locs)
    # end function

    def __getitem__(self, idx):
        # open image and convert to tensor
        image_file_loc = self.image_file_locs[idx]
        image = Image.open(image_file_loc)
        image = torchvision.transforms.ToTensor()(image)

        # read csv file: one line of "shape,color,center_x,center_y"
        csv_file_loc = self.csv_file_locs[idx]
        with open(csv_file_loc, 'r') as f:
            line = f.readline().rstrip()
            tokens = line.split(',')
            assert len(tokens) == 4, 'error, for file ' + csv_file_loc + ', len(tokens) = ' + str(len(tokens)) + ', should be 4'
            shape, color, center_x, center_y = tokens
        # end with

        # convert shape and color to index values and PyTorch tensors
        shape = torch.tensor(SHAPE_MAP[shape], dtype=torch.int64)
        # NOTE(review): COLOR_MAP[color][0] is the 3-element BGR color, not the
        # class index — per the COLOR_MAP comment the index is element [1].
        # Harmless while only the shape label is trained on, but wrong once a
        # color-classification head is added.
        color = torch.tensor(COLOR_MAP[color][0], dtype=torch.int64)

        # convert x and y to float tensors
        # ToDo: what about x/y and height/width swap here ??
        # ToDo: should these be integers or floats ??
        center_x = torch.tensor(float(center_x), dtype=torch.float32)
        center_y = torch.tensor(float(center_y), dtype=torch.float32)

        return image, shape, color, center_x, center_y
    # end function
# end class

class Net(nn.Module):
    """Small VGG-style CNN for 4-way shape classification.

    Two conv blocks (conv -> conv -> 2x max-pool), then a two-layer fully
    connected head. Input is (bs, 3, 224, 224); output is (bs, 4) raw logits
    suitable for nn.CrossEntropyLoss (no softmax applied here).
    """

    def __init__(self):
        super(Net, self).__init__()
        # block 0: 3 -> 128 channels, spatial size preserved by 'same' padding
        self.block0_conv0 = nn.Conv2d(in_channels=3, out_channels=128, kernel_size=3, stride=1, padding='same')
        self.block0_conv1 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding='same')

        # block 1: 128 -> 256 channels
        self.block1_conv0 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding='same')
        self.block1_conv1 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding='same')

        # classifier head: flattened (256, 56, 56) feature map -> 4 shape logits
        self.fc0 = nn.Linear(in_features=256 * 56 * 56, out_features=256)
        self.fc1 = nn.Linear(256, 4)

    def forward(self, x):
        """Map a (bs, 3, 224, 224) batch to (bs, 4) shape logits."""
        # block 0: (bs, 3, 224, 224) -> (bs, 128, 112, 112)
        x = F.relu(self.block0_conv0(x))
        x = F.max_pool2d(F.relu(self.block0_conv1(x)), kernel_size=2)

        # block 1: -> (bs, 256, 56, 56)
        x = F.relu(self.block1_conv0(x))
        x = F.max_pool2d(F.relu(self.block1_conv1(x)), kernel_size=2)

        # flatten everything except the batch dimension: -> (bs, 256 * 56 * 56 = 802816)
        x = torch.flatten(x, start_dim=1)

        # head: -> (bs, 256) -> (bs, 4)
        return self.fc1(F.relu(self.fc0(x)))

def main():
    """Train the custom Net to classify shape category on the synthetic dataset."""
    # avoid scientific notation in torch / numpy printouts
    torch.set_printoptions(sci_mode=False)
    np.set_printoptions(suppress=True)

    # prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = ShapeDataset(TRAIN_DATA_DIR_LOC)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    # instantiate the net and move it to the chosen device
    net = Net().to(device)
    print(net)

    # cross-entropy over the 4 shape logits; Adam with the default-ish 1e-3 lr
    cross_entropy_loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    for epoch in range(NUM_EPOCHS):
        net.train()
        epoch_train_losses = []

        for step, (images, gt_shapes, gt_colors, gt_center_xs, gt_center_ys) in enumerate(tqdm(train_data_loader)):
            # only the images and shape labels are needed on-device for now
            images, gt_shapes = images.to(device), gt_shapes.to(device)

            optimizer.zero_grad()                            # clear gradients from the previous step
            preds = net(images)                              # forward pass -> (bs, 4) logits
            loss = cross_entropy_loss_fn(preds, gt_shapes)   # compute classification loss
            loss.backward()                                  # back-propagate
            optimizer.step()                                 # apply the weight update

            epoch_train_losses.append(loss.item())
            if step % LOG_FREQ == 0:
                print(f'epoch {epoch}, step {step}, loss = {loss.item()}')

        avg_loss = sum(epoch_train_losses) / len(epoch_train_losses)
        print(f'epoch {epoch}, train loss = {avg_loss:.4f}')

    print('\nfinished training !!\n')

# entry point: run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()

The loss quickly gets to 1.38 (-ln(0.25)) and never changes.

In case the net itself was the problem, I replaced it with torchvision’s ResNet50, which should easily be expressive enough for such a simple task:

# 2_train_resnet50.py

import os
import glob
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import torchvision
import numpy as np
import cv2
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
from termcolor import colored

# shape name => class index (targets for nn.CrossEntropyLoss)
SHAPE_MAP = {"triangle": 0, "square": 1, "pentagon": 2, "hexagon": 3}
# COLOR_MAP: key => color name (string), value => 2-element list, 0th element is the OpenCV (BGR) color, 1th element is the color index
COLOR_MAP = {"red": [(0, 0, 255), 0], "green": [(0, 255, 0), 1], "blue": [(255, 0, 0), 2]}

# training hyper-parameters
NUM_EPOCHS = 10
BATCH_SIZE = 16
LOG_FREQ = 10  # print the loss every LOG_FREQ steps

# dataset directories produced by 1_make_dataset.py
TRAIN_DATA_DIR_LOC = os.path.join(os.getcwd(), 'train_data')
TEST_DATA_DIR_LOC = os.path.join(os.getcwd(), 'test_data')

class ShapeDataset(Dataset):
    """Dataset of shape samples read from a directory of paired <i>.jpg / <i>.csv files."""
    def __init__(self, data_dir_loc):
        self.data_dir_loc = data_dir_loc

        # NOTE(review): glob.glob returns files in arbitrary, OS-dependent order,
        # and the two lists below come from two independent glob calls — nothing
        # guarantees that index i of image_file_locs and index i of csv_file_locs
        # refer to the same sample. Sort both lists (or derive each csv path from
        # its image path) before relying on index-based pairing.
        self.image_file_locs = glob.glob(os.path.join(data_dir_loc, '*.jp*g'))
        self.csv_file_locs = glob.glob(os.path.join(data_dir_loc, '*.csv'))

        # un-comment these lines to try training on only 1 image
        # self.image_file_locs = [self.image_file_locs[0]]
        # self.csv_file_locs = [self.csv_file_locs[0]]

        # this only checks the counts match, not that the pairing is correct
        assert len(self.image_file_locs) == len(self.csv_file_locs)
    # end function

    def __len__(self):
        # number of samples = number of image files found
        return len(self.image_file_locs)
    # end function

    def __getitem__(self, idx):
        # open image and convert to tensor
        image_file_loc = self.image_file_locs[idx]
        image = Image.open(image_file_loc)
        image = torchvision.transforms.ToTensor()(image)

        # read csv file: one line of "shape,color,center_x,center_y"
        csv_file_loc = self.csv_file_locs[idx]
        with open(csv_file_loc, 'r') as f:
            line = f.readline().rstrip()
            tokens = line.split(',')
            assert len(tokens) == 4, 'error, for file ' + csv_file_loc + ', len(tokens) = ' + str(len(tokens)) + ', should be 4'
            shape, color, center_x, center_y = tokens
        # end with

        # convert shape and color to index values and PyTorch tensors
        shape = torch.tensor(SHAPE_MAP[shape], dtype=torch.int64)
        # NOTE(review): COLOR_MAP[color][0] is the 3-element BGR color, not the
        # class index — per the COLOR_MAP comment the index is element [1].
        # Harmless while only the shape label is trained on, but wrong once a
        # color-classification head is added.
        color = torch.tensor(COLOR_MAP[color][0], dtype=torch.int64)

        # convert x and y to float tensors
        # ToDo: what about x/y and height/width swap here ??
        # ToDo: should these be integers or floats ??
        center_x = torch.tensor(float(center_x), dtype=torch.float32)
        center_y = torch.tensor(float(center_y), dtype=torch.float32)

        return image, shape, color, center_x, center_y
    # end function
# end class

def main():
    """Train a ResNet50 (classifier head replaced with a 4-way shape head) on
    the synthetic shape dataset."""
    # avoid scientific notation in torch / numpy printouts
    torch.set_printoptions(sci_mode=False)
    np.set_printoptions(suppress=True)

    # prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = ShapeDataset(TRAIN_DATA_DIR_LOC)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    # ResNet50 with default (random) initialization — no weights arg supplied
    net = torchvision.models.resnet50()
    print(net)

    # swap the ImageNet 1000-way classifier for a 4-way shape head
    net.fc = torch.nn.Sequential(
        torch.nn.Linear(in_features=2048, out_features=4)
    )
    net.to(device)

    # cross-entropy over the 4 shape logits; Adam with lr 1e-3
    cross_entropy_loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    for epoch in range(NUM_EPOCHS):
        net.train()
        epoch_train_losses = []

        for step, (images, gt_shapes, gt_colors, gt_center_xs, gt_center_ys) in enumerate(tqdm(train_data_loader)):
            # only the images and shape labels are needed on-device for now
            images, gt_shapes = images.to(device), gt_shapes.to(device)

            optimizer.zero_grad()                            # clear gradients from the previous step
            preds = net(images)                              # forward pass -> (bs, 4) logits
            loss = cross_entropy_loss_fn(preds, gt_shapes)   # compute classification loss
            loss.backward()                                  # back-propagate
            optimizer.step()                                 # apply the weight update

            epoch_train_losses.append(loss.item())
            if step % LOG_FREQ == 0:
                print(f'epoch {epoch}, step {step}, loss = {loss.item()}')

        avg_loss = sum(epoch_train_losses) / len(epoch_train_losses)
        print(f'epoch {epoch}, train loss = {avg_loss:.4f}')

    print('\nfinished training !!\n')

# entry point: run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()

But I get the same result (loss stuck on 1.38xx).

Here are some things I’ve tried to troubleshoot things so far:

-Compared line by line with a working MNIST PyTorch net
-Tried all the suggestions in this forum and also on Stack Overflow for “my net won’t learn” or similar posts
-Changing the learning rate
-Changing the optimizer
-Writing my own loss function
-Checking the dataset
-Changing the net to the PyTorch built-in ResNet50 (mentioned above)

Regarding the commented-out code in ShapeDataset: if I use a training dataset size of 1 or 10, the loss almost immediately drops to very close to zero. If I use a training dataset size of 100, it stays at 1.38, the same as with the full 50,000-item dataset.

What else should I check?

I had a bug in the 2 train scripts above. glob.glob returns files in arbitrary order, and I was not sorting the image and csv file lists after loading, so index i of the image list did not correspond to index i of the csv list — the net was being trained on images paired with the wrong labels, which is why the loss never moved off chance level. This updated version with the ResNet50 option works:

# 1_make_dataset.py

import os
import numpy as np
import cv2
import random
from tqdm import tqdm

# shape name => class index; note that class index + 3 = the polygon's vertex count
SHAPE_MAP = {"triangle": 0, "square": 1, "pentagon": 2, "hexagon": 3}
# COLOR_MAP: key => color name (string), value => 2-element list, 0th element is the OpenCV (BGR) color, 1th element is the color index
COLOR_MAP = {"red": [(0, 0, 255), 0], "green": [(0, 255, 0), 1], "blue": [(255, 0, 0), 2]}

# number of samples to generate per split
NUM_TRAIN_SAMPS = 50000
NUM_TEST_SAMPS = 10000

# output directories, relative to the current working directory (created if missing)
TRAIN_DATA_DIR_LOC = os.path.join(os.getcwd(), 'train_data')
TEST_DATA_DIR_LOC = os.path.join(os.getcwd(), 'test_data')

def main():
    """Build the synthetic shape dataset: a train split and a test split of
    random shape images, each with a per-image csv ground-truth file."""
    print('\nstarting . . .\n')

    # make sure both output directories exist before writing anything
    for dir_loc in (TRAIN_DATA_DIR_LOC, TEST_DATA_DIR_LOC):
        os.makedirs(dir_loc, exist_ok=True)

    print('\nwriting training data . . .')
    write_images(NUM_TRAIN_SAMPS, TRAIN_DATA_DIR_LOC)
    print('\nwriting test data . . .')
    write_images(NUM_TEST_SAMPS, TEST_DATA_DIR_LOC)

    print('\ndone !!\n')

def write_images(num_samps, dir_loc):
    """Write num_samps random shape samples into dir_loc.

    Sample i is stored as '<i>.jpg' (the image) plus '<i>.csv' (one line of
    "shape,color,center_x,center_y" ground truth).
    """
    for sample_idx in tqdm(range(num_samps)):
        image, shape, color, center_x, center_y = draw_random_shape()

        # image goes to <index>.jpg
        cv2.imwrite(os.path.join(dir_loc, f'{sample_idx}.jpg'), image)

        # matching ground truth goes to <index>.csv
        with open(os.path.join(dir_loc, f'{sample_idx}.csv'), 'w') as f:
            f.write(','.join([shape, color, center_x, center_y]))

def draw_random_shape():
    """Render one random shape on a 224x224 black canvas.

    The shape (triangle/square/pentagon/hexagon) and color (red/green/blue)
    are chosen uniformly at random, the center is placed at least 50 px from
    every edge, and a small white filled circle marks the center.

    Returns (image, shape_name, color_name, center_x_str, center_y_str).
    """
    # NOTE: the call order of the RNGs below is significant for reproducibility
    shape = np.random.choice(list(SHAPE_MAP.keys()))
    color = np.random.choice(list(COLOR_MAP.keys()))

    # black 224x224 BGR canvas
    image = np.zeros((224, 224, 3), dtype=np.uint8)

    # random shape center, kept far enough from the edges to stay on-canvas
    center_x = random.randint(50, 174)
    center_y = random.randint(50, 174)

    # circumradius of the polygon, i.e. distance from the center to each vertex
    # (despite the name used elsewhere, this is not the polygon's side length)
    radius = 40

    # vertex count = class index + 3 (triangle=3 ... hexagon=6); sample
    # num_vertices + 1 evenly spaced angles over [0, 2*pi] and drop the
    # duplicated endpoint
    num_vertices = SHAPE_MAP[shape] + 3
    angles = np.linspace(0, 2 * np.pi, num_vertices + 1)[:-1]
    vertices = np.stack((center_x + np.cos(angles) * radius,
                         center_y + np.sin(angles) * radius), axis=1).astype(int)

    # draw the filled polygon, then a white filled dot marking its center
    cv2.fillPoly(image, pts=[vertices.reshape((-1, 1, 2))], color=COLOR_MAP[color][0])
    image = cv2.circle(image, (center_x, center_y), 10, color=(255, 255, 255), thickness=-1)

    return image, shape, color, str(center_x), str(center_y)

# entry point: run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()

and train script with ResNet50:

# 2_train_resnet50.py

import os
import glob
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import torchvision
import numpy as np
import cv2
import random
import natsort
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
from termcolor import colored

# shape name => class index (targets for nn.CrossEntropyLoss)
SHAPE_MAP = {"triangle": 0, "square": 1, "pentagon": 2, "hexagon": 3}
# COLOR_MAP: key => color name (string), value => 2-element list, 0th element is the OpenCV (BGR) color, 1th element is the color index
COLOR_MAP = {"red": [(0, 0, 255), 0], "green": [(0, 255, 0), 1], "blue": [(255, 0, 0), 2]}

# training hyper-parameters
NUM_EPOCHS = 10
BATCH_SIZE = 16
LOG_FREQ = 10  # print the loss every LOG_FREQ steps

# dataset directories produced by 1_make_dataset.py
TRAIN_DATA_DIR_LOC = os.path.join(os.getcwd(), 'train_data')
TEST_DATA_DIR_LOC = os.path.join(os.getcwd(), 'test_data')

class ShapeDataset(Dataset):
    """Dataset of synthetic shape samples.

    Each sample i is stored in data_dir_loc as two files: an image '<i>.jpg'
    and a ground-truth csv '<i>.csv' containing one line of
    "shape,color,center_x,center_y".

    __getitem__ returns a 5-tuple:
        image    -- (3, 224, 224) float tensor (torchvision ToTensor output)
        shape    -- int64 scalar tensor, shape class index (see SHAPE_MAP)
        color    -- int64 scalar tensor, color class index (see COLOR_MAP)
        center_x -- float32 scalar tensor
        center_y -- float32 scalar tensor
    """
    def __init__(self, data_dir_loc):
        self.data_dir_loc = data_dir_loc

        self.image_file_locs = glob.glob(os.path.join(data_dir_loc, '*.jp*g'))
        self.csv_file_locs = glob.glob(os.path.join(data_dir_loc, '*.csv'))

        # glob returns files in arbitrary order; sort both lists by the numeric
        # file stem so that index i of each list refers to the same sample
        # (files are named '<i>.jpg' / '<i>.csv'). This stdlib numeric sort
        # replaces the previous third-party natsort dependency.
        self.image_file_locs.sort(key=self._numeric_stem)
        self.csv_file_locs.sort(key=self._numeric_stem)

        # un-comment these lines to try training on only 1 image
        # self.image_file_locs = [self.image_file_locs[0]]
        # self.csv_file_locs = [self.csv_file_locs[0]]

        assert len(self.image_file_locs) == len(self.csv_file_locs)
    # end function

    @staticmethod
    def _numeric_stem(file_loc):
        # '<dir>/123.jpg' -> 123; used as a sort key (assumes files are named '<int>.<ext>')
        return int(os.path.splitext(os.path.basename(file_loc))[0])
    # end function

    def __len__(self):
        # number of samples = number of image files found
        return len(self.image_file_locs)
    # end function

    def __getitem__(self, idx):
        # open image and convert to a (3, 224, 224) float tensor
        image_file_loc = self.image_file_locs[idx]
        image = Image.open(image_file_loc)
        image = torchvision.transforms.ToTensor()(image)

        # read the single ground-truth line: "shape,color,center_x,center_y"
        csv_file_loc = self.csv_file_locs[idx]
        with open(csv_file_loc, 'r') as f:
            line = f.readline().rstrip()
            tokens = line.split(',')
            assert len(tokens) == 4, 'error, for file ' + csv_file_loc + ', len(tokens) = ' + str(len(tokens)) + ', should be 4'
            shape, color, center_x, center_y = tokens
        # end with

        # convert shape and color names to class-index tensors
        shape = torch.tensor(SHAPE_MAP[shape], dtype=torch.int64)
        # bug fix: element [1] of each COLOR_MAP value is the color class index;
        # element [0] is the BGR drawing color (the previous code returned the
        # BGR tuple, which would break a future color-classification head)
        color = torch.tensor(COLOR_MAP[color][1], dtype=torch.int64)

        # center coordinates as float tensors (for a future localization head)
        center_x = torch.tensor(float(center_x), dtype=torch.float32)
        center_y = torch.tensor(float(center_y), dtype=torch.float32)

        return image, shape, color, center_x, center_y
    # end function
# end class

def main():
    """Train a ResNet50 (classifier head replaced with a 4-way shape head) on
    the synthetic shape dataset."""
    # avoid scientific notation in torch / numpy printouts
    torch.set_printoptions(sci_mode=False)
    np.set_printoptions(suppress=True)

    # prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = ShapeDataset(TRAIN_DATA_DIR_LOC)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    # ResNet50 with default (random) initialization — no weights arg supplied
    net = torchvision.models.resnet50()
    print(net)

    # swap the ImageNet 1000-way classifier for a 4-way shape head
    net.fc = torch.nn.Sequential(
        torch.nn.Linear(in_features=2048, out_features=4)
    )
    net.to(device)

    # cross-entropy over the 4 shape logits; Adam with lr 1e-3
    cross_entropy_loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    for epoch in range(NUM_EPOCHS):
        net.train()
        epoch_train_losses = []

        for step, (images, gt_shapes, gt_colors, gt_center_xs, gt_center_ys) in enumerate(tqdm(train_data_loader)):
            # only the images and shape labels are needed on-device for now
            images, gt_shapes = images.to(device), gt_shapes.to(device)

            optimizer.zero_grad()                            # clear gradients from the previous step
            preds = net(images)                              # forward pass -> (bs, 4) logits
            loss = cross_entropy_loss_fn(preds, gt_shapes)   # compute classification loss
            loss.backward()                                  # back-propagate
            optimizer.step()                                 # apply the weight update

            epoch_train_losses.append(loss.item())
            if step % LOG_FREQ == 0:
                print(f'epoch {epoch}, step {step}, loss = {loss.item()}')

        avg_loss = sum(epoch_train_losses) / len(epoch_train_losses)
        print(f'epoch {epoch}, train loss = {avg_loss:.4f}')

    print('\nfinished training !!\n')

# entry point: run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()

Next I need to take a look at my custom net and determine why that was not sufficient.