Gradients too small and training loss almost constant

I have written my own dataloader, and I am trying to train a network to solve jigsaw puzzles. The training loss stays almost constant and the gradients are very small. What could be the possible reason?
Also, the volatile GPU-Util reported by nvidia-smi is 0 most of the time.
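
This is roughly how I am inspecting the gradients after the backward pass (a minimal sketch; the grad_norms helper name is just for illustration):

def grad_norms(model):
    # Print the L2 norm of each parameter's gradient; call right after loss.backward()
    for name, p in model.named_parameters():
        if p.grad is not None:
            print(name, p.grad.norm().item())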

This is how I am doing the training:

steps = args.iter_start
# my custom loader class (not torch.utils.data.DataLoader)
train_data = DataLoader(video_path=train_path, batch_size=args.batch, frame_size=args.img_shape,
                        classes=args.classes, jigsaw_frames=args.jigsaw_frame)
total_combinations = train_data.get_total_combinations()
batches_per_epoch = total_combinations // args.batch

for epoch in range(args.epochs):
    # rewind the custom loader to the beginning of the data each epoch
    train_data.frame_counter = 0
    train_data.video_counter = 0

    
    lr = adjust_learning_rate(optimizer, epoch, init_lr=args.lr, step=20, decay=0.1)


    for batch_count in range(batches_per_epoch):

        images, labels, original = train_data.get_next_batch(batch_size=args.batch)
        # Variable is a no-op on PyTorch >= 0.4; tensors can be passed directly
        images, labels = Variable(images).cuda(), Variable(labels).cuda()

        # Forward + Backward + Optimize
        outputs = net(images)

        # measure batch accuracy (outputs and labels are already on the GPU)
        prec1, prec5 = compute_accuracy(outputs.data, labels.data, topk=(1, 5))
        acc = prec1.item()

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if steps % 20 == 0:
            print('[Epoch: %2d/%2d], [Steps: %5d], [LR %.5f], [Loss: %1.3f], [Accuracy %2.2f%%]' %
                  (epoch + 1, args.epochs, steps, lr, loss.item(), acc))

        
        steps += 1
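
For reference, adjust_learning_rate is a plain step-decay helper along these lines (a sketch of what mine amounts to; with step=20 and decay=0.1 it divides the learning rate by 10 every 20 epochs):

def adjust_learning_rate(optimizer, epoch, init_lr=0.1, step=30, decay=0.1):
    # step decay: lr = init_lr * decay^(epoch // step)
    lr = init_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr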

The network is as follows:

import torch
import torch.nn as nn


class Network(nn.Module):

    def __init__(self, classes=1000):
        super(Network, self).__init__()

        # AlexNet-style convolutional trunk, shared across the 9 tiles
        self.conv = nn.Sequential()
        self.conv.add_module('conv1_s1', nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0))
        self.conv.add_module('relu1_s1', nn.ReLU(inplace=True))
        self.conv.add_module('pool1_s1', nn.MaxPool2d(kernel_size=3, stride=2))
        self.conv.add_module('lrn1_s1', LRN(local_size=5, alpha=0.0001, beta=0.75))  # custom LRN module, defined elsewhere

        self.conv.add_module('conv2_s1', nn.Conv2d(96, 256, kernel_size=5, padding=2, groups=2))
        self.conv.add_module('relu2_s1', nn.ReLU(inplace=True))
        self.conv.add_module('pool2_s1', nn.MaxPool2d(kernel_size=3, stride=2))
        self.conv.add_module('lrn2_s1', LRN(local_size=5, alpha=0.0001, beta=0.75))

        self.conv.add_module('conv3_s1', nn.Conv2d(256, 384, kernel_size=3, padding=1))
        self.conv.add_module('relu3_s1', nn.ReLU(inplace=True))

        self.conv.add_module('conv4_s1', nn.Conv2d(384, 384, kernel_size=3, padding=1, groups=2))
        self.conv.add_module('relu4_s1', nn.ReLU(inplace=True))

        self.conv.add_module('conv5_s1', nn.Conv2d(384, 256, kernel_size=3, padding=1, groups=2))
        self.conv.add_module('relu5_s1', nn.ReLU(inplace=True))
        self.conv.add_module('pool5_s1', nn.MaxPool2d(kernel_size=3, stride=2))

        # per-tile fully connected layers
        self.fc6 = nn.Sequential()
        self.fc6.add_module('fc6_s1', nn.Linear(256 * 6 * 6, 4096))
        self.fc6.add_module('relu6_s1', nn.ReLU(inplace=True))
        self.fc6.add_module('drop6_s1', nn.Dropout(p=0.5))

        self.fc7 = nn.Sequential()
        self.fc7.add_module('fc7_s1', nn.Linear(4096, 4096))
        self.fc7.add_module('relu7_s1', nn.ReLU(inplace=True))
        self.fc7.add_module('drop7_s1', nn.Dropout(p=0.5))

        # head over the concatenated embeddings of all 9 tiles
        self.fc8 = nn.Sequential()
        self.fc8.add_module('fc8', nn.Linear(9 * 4096, 4096))
        self.fc8.add_module('relu8', nn.ReLU(inplace=True))
        self.fc8.add_module('drop8', nn.Dropout(p=0.5))

        self.classifier = nn.Sequential()
        self.classifier.add_module('fc9', nn.Linear(4096, classes))
        self.apply(weights_init)

    def forward(self, x):
        # x: (B, T, C, H, W) with T == 9 jigsaw tiles per sample
        B, T, C, H, W = x.size()
        x = x.transpose(0, 1)  # -> (T, B, C, H, W), so x[i] is the batch of i-th tiles

        x_list = []
        for i in range(9):
            z = self.conv(x[i])
            z = self.fc6(z.view(B, -1))
            z = self.fc7(z)
            z = z.view([B, 1, -1])
            x_list.append(z)

        x = torch.cat(x_list, 1)  # (B, 9, 4096)
        x = self.fc8(x.view(B, -1))
        x = self.classifier(x)

        return x

def weights_init(model):
    # Xavier init for conv/linear weights, small positive bias
    if type(model) in [nn.Conv2d, nn.Linear]:
        nn.init.xavier_normal_(model.weight.data)
        nn.init.constant_(model.bias.data, 0.1)
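
As a sanity check on the shapes, a forward pass like the following should go through (a minimal sketch; it assumes 227x227 tiles, since that is the input size for which pool5 produces the 256x6x6 feature map that fc6 expects, and it assumes the custom LRN module is importable):

net = Network(classes=100).cuda()
dummy = torch.randn(2, 9, 3, 227, 227).cuda()  # B=2, T=9 tiles of 3x227x227
out = net(dummy)
print(out.size())  # torch.Size([2, 100])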