Hi guys!
I try to train a classifier based on COCO dataset. I have few classes and after every epoch I am checking f1 and mae. The problem is that always after one iteration output values are kinda random and later (after 2-5 iterations) always every sample is classified as the same class (for example when in the batch I have 100 elements in 100 classes as a result I will predict 10000 elements of class 0). I can’t see any mistakes in my code. My loss function works fine so where is the problem?
Like I said it’s based on COCO dataset so maybe I chose wrong learning rate or optimizer? I will be glad for any help or suggestions.
def train(train_loader, model, optimizer, epoch, device):
model.train()
loss_monitor = AverageMeter()
lr_scheduler = None
if epoch == 0:
warmup_factor = 1.0 / 1000
warmup_iters = min(1000, len(train_loader) - 1)
lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
with tqdm(train_loader) as _tqdm:
for x, y in _tqdm:
x = x.to(device)
for key, value in y.items():
y[key] = torch.tensor(value).to(device)
y_list = []
for i in range(0, len(x)):
y_list.append(y)
outputs = model(x, y_list)
cur_loss = outputs["my_loss"].detach().item()
# measure accuracy and record loss
sample_num = x.size(0)
loss_monitor.update(cur_loss, sample_num)
# compute gradient and do step
optimizer.zero_grad()
(outputs["my_loss"]).backward()
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
_tqdm.set_postfix(
OrderedDict(stage="train", epoch=epoch, loss=loss_monitor.avg),
)
return loss_monitor.avg # unnecessary
def validate(val_loader, model, epoch, device):
model.eval()
preds = []
gt = []
with torch.no_grad():
with tqdm(val_loader) as _tqdm:
for x, y in _tqdm:
x = x.to(device)
for key, value in y.items():
y[key] = torch.tensor(value).to(device)
gt.append(y["value"].cpu().numpy())
outputs = model(x)
for output in outputs:
pred = np.argmax(
output["value"][0].cpu().numpy()
) # just changes format
preds.append(pred)
_tqdm.set_postfix(OrderedDict(stage="val", epoch=epoch),)
mae = calculate_mae(gt, np.array(preds)) # my own functions but works well - that's not the problem
f1 = calculate_f1(gt, preds)
return mae, f1
def main():
start_epoch = 0
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == torch.device("cuda"):
cudnn.benchmark = True
val_dataset = LoadDataset("val") # normal dataset maker - works fine
train_dataset = LoadDataset("train")
model = PornRCNN.create_resnet_50()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
model = model.to(device)
model.set_my_loss_fn(my_loss)
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
num_epoch = 100
for epoch in range(start_epoch, num_epoch):
val_loader = DataLoader(
val_dataset, batch_size=24, shuffle=False, num_workers=0
)
train_loader = DataLoader(
train_dataset, batch_size=24, shuffle=False, num_workers=0
)
train_loss = train(train_loader, model, optimizer, epoch, device)
mae, f1 = validate(val_loader, model, epoch, device)
"""later I just check if mae or f1 is better then before and save model"""