Hello,
I have a simple fully-connected (FCN) network that I am training.
Within a single epoch, the outputs recorded during training (under model.train()) differ from the outputs of a fresh forward pass made after the batch loop has finished.
Also, when I switch from model.train() to model.eval(), the outputs differ again.
###############################################
below is training code:
def _progress(epoch, batch_idx, n_batches, mAcc, mLoss, prev_time):
    """Print the single-line progress/ETA display; return the new timestamp.

    ETA is estimated from the wall-clock time of the last batch times the
    number of batches remaining across all epochs.
    """
    batches_done = epoch * n_batches + batch_idx
    batches_left = args.epochs * n_batches - batches_done
    time_left = datetime.timedelta(seconds=batches_left * (time.time() - prev_time))
    sys.stdout.write(
        "\r[Train][Epoch %d/%d] [Batch %d/%d] [mAcc: %f] [mLoss: %f] ETA: %s"
        % (
            epoch,
            args.epochs,
            batch_idx,
            n_batches,
            mAcc.avg * 100,
            mLoss.avg,
            time_left
        )
    )
    return time.time()


def _inference_pass(model, loader, save, criterion, force_acc, epoch, prev_time,
                    plot_name, reset_after):
    """Run one forward-only pass over ``loader``, logging loss/accuracy.

    The model's train/eval mode is whatever the caller set it to; this
    helper only does inference.  Returns the updated ``prev_time``.
    """
    mLoss = AverageMeter()
    mAcc = AverageMeter()
    # Fix: pure inference must not build autograd graphs.  Without no_grad()
    # every batch's graph was kept alive (the outputs were stored), growing
    # GPU memory over the pass.
    with torch.no_grad():
        for batch_idx, samples in enumerate(loader):
            inputs = samples[0].cuda()
            labels = samples[1].cuda()
            outputs = model(inputs)
            loss = criterion(outputs.view(labels.shape[0], 1), labels)
            mLoss.update(loss.item())
            force_acc.detection(outputs, labels)
            mAcc.update(force_acc.acc)
            prev_time = _progress(epoch, batch_idx, len(loader), mAcc, mLoss, prev_time)
            save.plot_append(inputs, outputs, labels)
    save.update(epoch, mAcc.avg)
    save.plot(name=plot_name)
    if reset_after:
        save.plot_reset()
    return prev_time


def train(model, train_set, valid_set, save, criterion, optimizer, epoch, prev_time):
    """Run one epoch: an optimisation pass over ``train_set`` followed by two
    forward-only passes over the same data (one in train mode, one in eval
    mode), plotting each pass under a different name.

    NOTE(review): outputs logged *during* the optimisation pass can never
    match the post-epoch passes — the weights are updated after every batch,
    so early batches were scored by an older model.  That is expected, not a
    bug.  A train-vs-eval difference additionally appears whenever the model
    contains layers with mode-dependent behaviour (Dropout, BatchNorm).

    Parameters mirror the original call site; ``valid_set`` is accepted but
    not used in this visible code — TODO confirm against callers.
    Returns None.
    """
    force_acc = ForceAcc()
    save.plot_reset()

    # ---- optimisation pass ----
    model.train()
    mLoss = AverageMeter()
    mAcc = AverageMeter()
    for batch_idx, samples in enumerate(train_set):
        inputs = samples[0].cuda()
        labels = samples[1].cuda()
        outputs = model(inputs)
        # Removed: the original also concatenated every batch's inputs and
        # outputs into growing tensors that were never read; the output
        # concatenation kept each batch's autograd graph alive.
        loss = criterion(outputs.view(labels.shape[0], 1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        mLoss.update(loss.item())
        force_acc.detection(outputs, labels)
        mAcc.update(force_acc.acc)
        prev_time = _progress(epoch, batch_idx, len(train_set), mAcc, mLoss, prev_time)
        save.plot_append(inputs, outputs, labels)
    save.update(epoch, mAcc.avg)
    if not args.split_shuffle:
        save.check_point(model)
    save.plot(name="real_training")
    save.plot_reset()

    # ---- forward-only pass, still in train mode ----
    prev_time = _inference_pass(model, train_set, save, criterion, force_acc,
                                epoch, prev_time, plot_name="train",
                                reset_after=True)

    # ---- forward-only pass, eval mode ----
    model.eval()
    _inference_pass(model, train_set, save, criterion, force_acc,
                    epoch, prev_time, plot_name="valid",
                    reset_after=False)
###############################################
below is example of output:
[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 50.963112] [mLoss: 0.016647] ETA: 0:00:04.060802314480
[INFO] Epoch3’s inference image saved
[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 64.160314] [mLoss: 0.013769] ETA: 0:00:01.336590
[INFO] Epoch3’s inference image saved
[Train][Epoch 3/5] [Batch 1079/1080] [mAcc: 64.160314] [mLoss: 0.013769] ETA: 0:00:02.207977
[INFO] Epoch3’s inference image saved
###############################################
below is my network code:
class PoseForceNet_FCN(nn.Module):
    """Fully-connected regressor: ``in_features`` -> 128 -> 128 -> 128 -> 1.

    Fixes relative to the pasted original:
    - ``def init`` / ``super(...).init()`` restored to the real dunder
      ``__init__`` (the paste stripped the underscores; the class could not
      be constructed as written).
    - ReLU activations reinstated between the hidden layers.  Without a
      nonlinearity, four stacked ``nn.Linear`` layers collapse to a single
      affine map, so the extra depth added no capacity.
    - The hard-coded global ``dof`` is now an optional ``in_features``
      parameter (defaults to the global, so existing call sites still work).
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible fallback to the module-level constant the
            # original relied on.
            in_features = dof
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 1)
        self.relu = nn.ReLU()

    def forward(self, v):
        """Return the predicted force band, shape ``(batch, 1)``."""
        v = self.relu(self.fc1(v))
        v = self.relu(self.fc2(v))
        v = self.relu(self.fc3(v))
        band = self.fc4(v)
        return band
#########################################
I haven't been able to find a solution to this problem for a month.
Thanks in advance for your help.