Same model, weights, and input data, but the outputs are different

Hello,

I have a simple fully connected network (FCN) that I am training.

Within a single epoch, the outputs produced during training (under model.train()) are different from the outputs I get when I run the same batches through the model again after the training loop has finished.

The outputs also differ when I switch from model.train() to model.eval().

###############################################
Below is my training code:

def train(model, train_set, valid_set, save, criterion, optimizer, epoch, prev_time):
    """Train *model* for one epoch, then run two diagnostic passes over the
    training set: one still in train mode, one in eval mode.

    Args:
        model:     network to optimize (assumed already on CUDA — TODO confirm).
        train_set: iterable of (inputs, labels) batches.
        valid_set: unused here; kept so the caller's interface is unchanged.
        save:      logging/checkpoint helper (plot_*, update, check_point).
        criterion: loss function applied to (outputs, labels).
        optimizer: optimizer stepping the model's parameters.
        epoch:     current epoch index, used for the progress display.
        prev_time: timestamp of the previous batch, used for the ETA estimate.

    NOTE(review): outputs printed during the first (real training) pass come
    from a model whose weights change after every optimizer.step(), so they
    cannot match a second forward pass over the same data once the epoch is
    done.  For a network with no dropout/batch-norm, the train-mode and
    eval-mode diagnostic passes should agree with each other.
    """
    mLoss = AverageMeter()
    mAcc = AverageMeter()
    force_acc = ForceAcc()

    save.plot_reset()
    model.train()

    # ---- real training pass (weights updated every batch) ----------------
    # The original code also accumulated `training_data`/`training_output`
    # tensors here; they were never read afterwards and kept the autograd
    # graph of every batch alive (GPU memory leak), so they are dropped.
    for batch_idx, samples in enumerate(train_set):
        inputs = samples[0].cuda()
        labels = samples[1].cuda()

        outputs = model(inputs)
        loss = criterion(outputs.view(labels.shape[0], 1), labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        mLoss.update(loss.item())
        force_acc.detection(outputs, labels)
        mAcc.update(force_acc.acc)

        prev_time = _progress(epoch, batch_idx, len(train_set),
                              mAcc.avg, mLoss.avg, prev_time)
        save.plot_append(inputs, outputs, labels)

    save.update(epoch, mAcc.avg)
    if not args.split_shuffle:
        save.check_point(model)
    save.plot(name="real_training")
    save.plot_reset()

    # ---- diagnostic pass #1: same data, train mode, no weight updates ----
    prev_time = _diagnostic_pass(model, train_set, save, criterion,
                                 force_acc, epoch, prev_time, "train")

    # ---- diagnostic pass #2: same data, eval mode ------------------------
    model.eval()
    _diagnostic_pass(model, train_set, save, criterion,
                     force_acc, epoch, prev_time, "valid")


def _diagnostic_pass(model, data_set, save, criterion, force_acc, epoch,
                     prev_time, plot_name):
    """Forward every batch of *data_set* without updating weights, log the
    running averages, and save a plot under *plot_name*.

    Returns the updated `prev_time` timestamp so ETA stays continuous.
    """
    mLoss = AverageMeter()
    mAcc = AverageMeter()

    # no_grad(): only forward results are needed here.  The original code
    # built (and, via torch.cat on live outputs, retained) an autograd graph
    # for every batch, wasting GPU memory for no benefit.
    with torch.no_grad():
        for batch_idx, samples in enumerate(data_set):
            inputs = samples[0].cuda()
            labels = samples[1].cuda()

            outputs = model(inputs)
            loss = criterion(outputs.view(labels.shape[0], 1), labels)

            mLoss.update(loss.item())
            force_acc.detection(outputs, labels)
            mAcc.update(force_acc.acc)

            prev_time = _progress(epoch, batch_idx, len(data_set),
                                  mAcc.avg, mLoss.avg, prev_time)
            save.plot_append(inputs, outputs, labels)

    save.update(epoch, mAcc.avg)
    save.plot(name=plot_name)
    save.plot_reset()
    return prev_time


def _progress(epoch, batch_idx, n_batches, acc_avg, loss_avg, prev_time):
    """Write the in-place progress/ETA line; return the current timestamp."""
    batches_done = epoch * n_batches + batch_idx
    batches_left = args.epochs * n_batches - batches_done
    time_left = datetime.timedelta(seconds=batches_left * (time.time() - prev_time))

    sys.stdout.write(
        "\r[Train][Epoch %d/%d] [Batch %d/%d] [mAcc: %f] [mLoss: %f] ETA: %s"
        % (
            epoch,
            args.epochs,
            batch_idx,
            n_batches,
            acc_avg * 100,
            loss_avg,
            time_left
        )
    )
    return time.time()

###############################################
Below is an example of the output:

[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 50.963112] [mLoss: 0.016647] ETA: 0:00:04.060802314480
[INFO] Epoch3’s inference image saved
[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 64.160314] [mLoss: 0.013769] ETA: 0:00:01.336590
[INFO] Epoch3’s inference image saved
[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 64.160314] [mLoss: 0.013769] ETA: 0:00:02.207977
[INFO] Epoch3’s inference image saved

###############################################
Below is my network code:

class PoseForceNet_FCN(nn.Module):
    """Four-layer fully connected regressor: `dof` inputs -> 1 output.

    NOTE(review): there is no activation between the linear layers, so the
    whole network collapses to a single affine map of the input; re-enable
    the commented-out ReLU if a non-linear model is intended.
    """

    # Restored `__init__` dunders (the pasted code read `init`, which is
    # never invoked by the constructor protocol).
    def __init__(self):
        super(PoseForceNet_FCN, self).__init__()

        # `dof` is presumably a module-level constant (input dimensionality)
        # — TODO confirm against the rest of the file.
        self.fc1 = nn.Linear(dof, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 1)

        # self.relu = nn.ReLU()

    def forward(self, v):
        """Return a (batch, 1) prediction for input `v` of shape (batch, dof)."""
        v = self.fc1(v)
        v = self.fc2(v)
        v = self.fc3(v)
        band = self.fc4(v)

        return band

#########################################
I haven't been able to find a solution to this problem for a month.
Thanks for your help.

I’m not sure I understand the issue completely. Could you describe which values you are comparing and what the expectation would be?

Thanks for your reply.

After posting the question, I found a serious mistake in my code.

If the problem persists even after I fix that mistake, I will ask again.

Thanks again for your reply!