RuntimeError: weight tensor should be defined either for all or no classes

Hey everyone. I'm getting this error while training my segmentation model:

Traceback (most recent call last):
  File "train.py", line 163, in <module>
    output, total_loss = model.eval_net_with_loss(model, image, gt, class_weights, device)
  File "/home/luan/Documentos/Codigos/doc_det/lib/vgg_unet_aspp_detection.py", line 206, in eval_net_with_loss
    loss = torch.nn.functional.nll_loss(softmax, gt, ignore_index=-1, weight=weights)
  File "/home/luan/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2527, in nll_loss
    return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: weight tensor should be defined either for all or no classes

My code:

import datetime as dt
import dateutil.parser
import os
import pathlib as pl
from argparse import ArgumentParser
# ### external deps
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader
# ### local deps
from lib.data_utils import SegmentationDataset
from lib.evaluation import TrainingMetrics
from lib.vgg_unet_aspp_detection import UNetVgg

print("Loaded!")

parser = ArgumentParser()
parser.add_argument("-t", "--timestamp", type=dateutil.parser.parse, default=dt.datetime.now(), help="ISO formatted datetime")
args = parser.parse_args()

c_time = args.timestamp.replace(second=0, microsecond=0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gt_folder_train = ['Data/Training0', 'Data/Training1', 'Data/Training2']
gt_folder_val = ['Data/Validation']
model_dir = pl.Path("checkpoints").expanduser().absolute()
model_dir.mkdir(parents=True, exist_ok=True)

model_name = f'{model_dir}/detect_document-fixCnhHalf_{c_time.isoformat()}.pth'
mscoco_path = pl.Path('/home/luan/coco/images/val2017').expanduser().absolute()
patience = 80
plot_val = True
plot_train = False
plot_train_starting = True
save_dir = pl.Path(f"plots/{c_time.isoformat()}_train").expanduser().absolute()

max_epochs = 1000

batches_per_update = 8

# Width x Height - MUST be divisible by 32
resolution = [640, 640]
max_side = 640
class_weights = [1, 0.5]
nClasses = 1
#class_weights = [1]
#nClasses = 1

# Color in RGB
class_to_color = {'etiqueta': (255, 0, 0), 'background': (0, 0, 255)}
class_to_id = {'etiqueta': 0, 'background': 1}
id_to_class = {v: k for k, v in class_to_id.items()}

print("Set up!")

# ### Dataloaders
def _init_loader_seed(worker_id):
    # torch.initial_seed() differs per DataLoader worker, so each worker gets its own NumPy seed
    np.random.seed(torch.initial_seed() % 2**32)

train_dataset = SegmentationDataset(gt_folder_train, mscoco_path, True, class_to_id, resolution, True, max_side=max_side)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=8, drop_last=False, worker_init_fn=_init_loader_seed)

val_dataset = SegmentationDataset(gt_folder_val, mscoco_path, False, class_to_id, resolution, max_side=max_side)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=8, drop_last=False, worker_init_fn=_init_loader_seed)


# ### Plot and save one epoch of the training dataset?
if plot_train_starting:
    plot_dir = save_dir / "pre_train"
    plot_dir.mkdir(parents=True, exist_ok=True)

    print("saving pre train samples")

    for i_batch, sample_batched in enumerate(train_loader):
        image_np = sample_batched['image_original'].cpu().numpy()
        gt = sample_batched['gt'].cpu().numpy()
        img_names = sample_batched['image_name']

        for sample_i in range(gt.shape[0]):
            current_gt = gt[sample_i, ...]
            current_img = image_np[sample_i, ...]
            img_name = img_names[sample_i]

            # paint every pixel of each class with that class's RGB color
            color_label = np.zeros((current_img.shape[0], current_img.shape[1], 3), dtype=np.uint8)
            for class_id, class_name in id_to_class.items():
                color_label[current_gt == class_id] = class_to_color[class_name]

            file_img = f"{plot_dir}/{img_name.rsplit('.', 1)[0]}.png"
            file_gt = f"{plot_dir}/{img_name.rsplit('.', 1)[0]}_gt.png"

            alpha_blend = (current_img * 0.5 + color_label * 0.5).astype(np.uint8)

            cv2.imwrite(file_img, alpha_blend[..., ::-1])
            cv2.imwrite(file_gt, color_label[..., ::-1])

# ### Start training...
# ### Network
model = UNetVgg(nClasses, device)
model.init_params()
model.to(device)

# ### Optimization (hyper)parameters
core_lr = 0.005
base_lr = 0.005
base_vgg_weight, base_vgg_bias, core_weight, core_bias = UNetVgg.get_params_by_kind(model, 2)

optimizer = torch.optim.SGD([{'params': base_vgg_bias, 'lr': base_lr},
                             {'params': base_vgg_weight, 'lr': base_lr},
                             {'params': core_bias, 'lr': core_lr},
                             {'params': core_weight, 'lr': core_lr, 'weight_decay': 0.00005}], momentum=0.9)

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=60, verbose=True, factor=0.5)

eval_classes_train = ['etiqueta']
train_metrics = TrainingMetrics({valid_c: class_to_id[valid_c] for valid_c in eval_classes_train},
                                metric_best='mean_iou')

eval_classes_val = ['etiqueta']
val_metrics = TrainingMetrics({valid_c: class_to_id[valid_c] for valid_c in eval_classes_val}, metric_best='mean_iou')



best_val_acc = -1
best_epoch = 0

g_i_train = 0
        
for epoch in range(max_epochs):
    print('Epoch %d starting...' % (epoch+1))
    
    model.train()
    mean_loss = 0.0
    optimizer.zero_grad()
    
    for i_batch, sample_batched in enumerate(train_loader):
        print(f"batch: {i_batch} / {len(train_loader)}")
    
        image = sample_batched['image'].to(device)
        image_np = sample_batched['image_original'].cpu().numpy()
        gt = sample_batched['gt'].to(device)
        img_names = sample_batched['image_name']

        output, total_loss = model.eval_net_with_loss(model, image, gt, class_weights, device)
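        # scale the loss so gradients accumulate over batches_per_update mini-batches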
        total_loss = total_loss / batches_per_update
        total_loss.backward()

Any ideas what the problem might be?

The error message says it directly: the weight tensor passed to nll_loss must have exactly one entry per output class, or be omitted entirely. In your script, class_weights = [1, 0.5] has two entries, but the model is built with nClasses = 1, so the network produces a single class channel while the loss receives two weights.

Since class_to_id defines two classes (etiqueta and background), set nClasses = 2 so the number of output channels matches the weight tensor. Your commented-out pair (class_weights = [1] with nClasses = 1) would also be consistent, though with a single class there is nothing left to weight.

One more thing worth checking: nll_loss expects log-probabilities, so if the softmax variable inside eval_net_with_loss holds a plain softmax, switch it to log_softmax (or call cross_entropy on the raw logits, which combines both steps).
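
Here is a minimal sketch of the mismatch, assuming only a standard (N, C, H, W) segmentation output; the shapes and random tensors below are made up for illustration:

import torch
import torch.nn.functional as F

N, H, W = 1, 4, 4
gt = torch.randint(0, 2, (N, H, W))  # labels: 0 = etiqueta, 1 = background
weights = torch.tensor([1.0, 0.5])   # one weight per class

# nClasses = 1: a single output channel but two weights -> raises
# "RuntimeError: weight tensor should be defined either for all or no classes"
one_class = F.log_softmax(torch.randn(N, 1, H, W), dim=1)
# F.nll_loss(one_class, gt, ignore_index=-1, weight=weights)  # fails

# nClasses = 2: one channel per class, matching the two weights -> works
two_classes = F.log_softmax(torch.randn(N, 2, H, W), dim=1)
loss = F.nll_loss(two_classes, gt, ignore_index=-1, weight=weights)
print(loss)

So in your config block the fix is just:

class_weights = [1, 0.5]
nClasses = 2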