Hey everyone. I got this error:
Traceback (most recent call last):
File "train.py", line 163, in <module>
output, total_loss = model.eval_net_with_loss(model, image, gt, class_weights, device)
File "/home/luan/Documentos/Codigos/doc_det/lib/vgg_unet_aspp_detection.py", line 206, in eval_net_with_loss
loss = torch.nn.functional.nll_loss(softmax, gt, ignore_index=-1, weight=weights)
File "/home/luan/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2527, in nll_loss
return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: weight tensor should be defined either for all or no classes
My code:
import datetime as dt
import dateutil.parser
import os
import pathlib as pl
from argparse import ArgumentParser
# ### external deps
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader
# ### local deps
from lib.data_utils import SegmentationDataset
from lib.evaluation import TrainingMetrics
from lib.vgg_unet_aspp_detection import UNetVgg
print("Loaded!")
parser = ArgumentParser()
parser.add_argument("-t", "--timestamp", type=dateutil.parser.parse, default=dt.datetime.now(), help="Iso Formated date time")
args = parser.parse_args()
c_time = args.timestamp.replace(second=0, microsecond=0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gt_folder_train = ['Data/Training0', 'Data/Training1', 'Data/Training2']
gt_folder_val = ['Data/Validation']
model_dir = pl.Path("checkpoints").expanduser().absolute()
model_dir.mkdir(parents=True, exist_ok=True)
model_name = f'{model_dir}/detect_document-fixCnhHalf_{c_time.isoformat()}.pth'
mscoco_path = pl.Path('/home/luan/coco/images/val2017').expanduser().absolute()
patience = 80
plot_val = True
plot_train = False
plot_train_starting = True
save_dir = pl.Path(f"plots/{c_time.isoformat()}_train").expanduser().absolute()
max_epochs = 1000
batchs_per_update = 8
# Width x Height - MUST be divisible by 32
resolution = [640, 640]
max_side = 640
class_weights = [1, 0.5]
nClasses = 1
#class_weights = [1]
#nClasses = 1
# Color in RGB
class_to_color = {'etiqueta': (255, 0, 0), 'background': (0, 0, 255)}
class_to_id = {'etiqueta': 0, 'background': 1}
id_to_class = {v: k for k, v in class_to_id.items()}
print("Set up!")
# ### Dataloaders
def _init_loader_seed(worker_id):
np.random.seed(torch.initial_seed() % 2**32)
train_dataset = SegmentationDataset(gt_folder_train, mscoco_path, True, class_to_id, resolution, True, max_side = max_side)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=8, drop_last=False, worker_init_fn = _init_loader_seed)
val_dataset = SegmentationDataset(gt_folder_val, mscoco_path, False, class_to_id, resolution, max_side = max_side)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=8, drop_last=False, worker_init_fn = _init_loader_seed)
# ### Plot and save an epoch of the training dataset ?
if plot_train_starting:
plot_dir = save_dir / "pre_train"
plot_dir.mkdir(parents=True, exist_ok=True)
print("saving pre train samples")
for i_batch, sample_batched in enumerate(train_loader):
image_np = sample_batched['image_original'].cpu().numpy()
gt = sample_batched['gt'].cpu().numpy()
img_names = sample_batched['image_name']
for sample_i in range(gt.shape[0]):
current_gt = gt[sample_i, ...]
current_img = image_np[sample_i, ...]
img_name = img_names[sample_i]
color_label = np.zeros((current_img.shape[0], current_img.shape[1], 3))
for key, val in id_to_class.items():
color_label[current_gt] = class_to_color[val]
file_img = f"{plot_dir}/{img_name.rsplit('.')[0]}.png"
file_gt = f"{plot_dir}/{img_name.rsplit('.')[0]}_gt.png"
alpha_blend = current_img * 0.5 + color_label.astype(np.uint8) * 0.5
cv2.imwrite(file_img, alpha_blend[..., ::-1])
cv2.imwrite(file_gt, color_label[..., ::-1].astype(np.uint8))
# plt.figure()
# plt.imshow((current_img/255) * 0.5 + (color_label/255) * 0.5)
# plt.show()
#
# plt.figure()
# plt.imshow(color_label.astype(np.uint8))
# plt.show()
# print(end="")
# ### Start training...
# ### Network
model = UNetVgg(nClasses, device)
model.init_params()
model.to(device)
# ### Optimization (hyper)parameters
core_lr = 0.005
base_lr = 0.005
base_vgg_weight, base_vgg_bias, core_weight, core_bias = UNetVgg.get_params_by_kind(model, 2)
optimizer = torch.optim.SGD([{'params': base_vgg_bias, 'lr': base_lr},
{'params': base_vgg_weight, 'lr': base_lr},
{'params': core_bias, 'lr': core_lr},
{'params': core_weight, 'lr': core_lr, 'weight_decay': 0.00005}], momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=60, verbose=True, factor=0.5)
eval_classes_train = ['etiqueta']
train_metrics = TrainingMetrics({valid_c: class_to_id[valid_c] for valid_c in eval_classes_train},
metric_best='mean_iou')
eval_classes_val = ['etiqueta']
val_metrics = TrainingMetrics({valid_c: class_to_id[valid_c] for valid_c in eval_classes_val}, metric_best='mean_iou')
best_val_acc = -1
best_epoch = 0
g_i_train = 0
for epoch in range(max_epochs):
print('Epoch %d starting...' % (epoch+1))
model.train()
mean_loss = 0.0
optimizer.zero_grad()
for i_batch, sample_batched in enumerate(train_loader):
print(f"batch: {i_batch} / {len(train_loader)}")
image = sample_batched['image'].to(device)
image_np = sample_batched['image_original'].cpu().numpy()
gt = sample_batched['gt'].to(device)
img_names = sample_batched['image_name']
output, total_loss = model.eval_net_with_loss(model, image, gt, class_weights, device)
total_loss = total_loss / batchs_per_update
total_loss.backward()
Any ideas what it may be?