I created the same data loader and data iterator for Keras, PyTorch, and MXNet with a VGG network; the weights are randomly initialized in all three cases and no pretrained model is used. But PyTorch's loss stops going down at around 200, while at the same time Keras's and MXNet's losses drop to about 20.
The PyTorch code is below. I can't believe everything in it is right; could you help me find the problem?
import copy
import time

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from cnn_finetune import make_model  # assuming the cnn_finetune package, whose make_model matches this signature

INPUT_SIZE = (375, 375)  # inferred from the 375x375 activations in the summary below

class myModel(nn.Module):
    def __init__(self):
        super(myModel, self).__init__()
        # VGG16 convolutional backbone, randomly initialized (pretrained=False)
        self.features = list(make_model('vgg16', num_classes=1, pretrained=False, input_size=INPUT_SIZE).children())[0]
        self.globalavg = nn.AdaptiveAvgPool2d(1)
        # Regression head: 512 -> 1024 -> 1
        self.head = nn.Sequential(nn.Linear(512, 1024, bias=True),
                                  nn.ReLU(),
                                  nn.Linear(1024, 1, bias=True))

    def forward(self, x):
        x = self.features(x)
        x = self.globalavg(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, 512)
        x = self.head(x)
        return x

model = myModel()
model = model.cuda()
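To make sure the head is wired up correctly, here is a quick shape check of a forward pass (a minimal sketch, not part of the training code; it assumes 375x375 inputs as the summary below suggests):

# Sanity check: a dummy forward pass should yield one value per sample.
dummy = torch.randn(2, 3, 375, 375).cuda()
with torch.no_grad():
    out = model(dummy)
print(out.shape)  # expected: torch.Size([2, 1])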
# training code adapted from the fine-tuning tutorial on pytorch.org
def train_model(model, dataloaders, criterion, optimizer, num_epochs=21, is_inception=False):
    since = time.time()
    val_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in tqdm(range(num_epochs)):
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            bar = tqdm(dataloaders[phase])
            for inputs, labels in bar:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for Inception: in training it has an auxiliary output,
                    # and the loss sums the final and auxiliary outputs; in testing only
                    # the final output is considered.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                bar.set_description('loss: {:.4f}'.format(loss.item()))
                # statistics
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # load best model weights (note: best_model_wts is never updated above,
    # so this restores the snapshot taken before training)
    model.load_state_dict(best_model_wts)
    return model, val_acc_history
dataloaders = {'train':train_dl, 'val':valid_dl}
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_model(model, dataloaders, nn.MSELoss(reduction='mean'), optim.Adam(model.parameters(),lr=0.001), 21)
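For reference, nn.MSELoss expects the target to have the same shape as the input. The model returns shape (batch, 1), so if the labels come out of the DataLoader with shape (batch,), the subtraction broadcasts to (batch, batch) and the reported loss is not what it looks like (recent PyTorch versions emit a UserWarning about this). A sketch of the check, using one batch from the train loader:

# Illustrative shape check before training; not part of the original code.
inputs, labels = next(iter(train_dl))
inputs = inputs.to(device)
labels = labels.to(device)
with torch.no_grad():
    outputs = model(inputs)
print(outputs.shape, labels.shape)
# If labels are (batch,) while outputs are (batch, 1), align them:
labels = labels.float().view(-1, 1)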
The network summary is as follows:
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 64, 375, 375] 1,792
ReLU-2 [-1, 64, 375, 375] 0
Conv2d-3 [-1, 64, 375, 375] 36,928
ReLU-4 [-1, 64, 375, 375] 0
MaxPool2d-5 [-1, 64, 187, 187] 0
Conv2d-6 [-1, 128, 187, 187] 73,856
ReLU-7 [-1, 128, 187, 187] 0
Conv2d-8 [-1, 128, 187, 187] 147,584
ReLU-9 [-1, 128, 187, 187] 0
MaxPool2d-10 [-1, 128, 93, 93] 0
Conv2d-11 [-1, 256, 93, 93] 295,168
ReLU-12 [-1, 256, 93, 93] 0
Conv2d-13 [-1, 256, 93, 93] 590,080
ReLU-14 [-1, 256, 93, 93] 0
Conv2d-15 [-1, 256, 93, 93] 590,080
ReLU-16 [-1, 256, 93, 93] 0
MaxPool2d-17 [-1, 256, 46, 46] 0
Conv2d-18 [-1, 512, 46, 46] 1,180,160
ReLU-19 [-1, 512, 46, 46] 0
Conv2d-20 [-1, 512, 46, 46] 2,359,808
ReLU-21 [-1, 512, 46, 46] 0
Conv2d-22 [-1, 512, 46, 46] 2,359,808
ReLU-23 [-1, 512, 46, 46] 0
MaxPool2d-24 [-1, 512, 23, 23] 0
Conv2d-25 [-1, 512, 23, 23] 2,359,808
ReLU-26 [-1, 512, 23, 23] 0
Conv2d-27 [-1, 512, 23, 23] 2,359,808
ReLU-28 [-1, 512, 23, 23] 0
Conv2d-29 [-1, 512, 23, 23] 2,359,808
ReLU-30 [-1, 512, 23, 23] 0
MaxPool2d-31 [-1, 512, 11, 11] 0
AdaptiveAvgPool2d-32 [-1, 512, 1, 1] 0
Linear-33 [-1, 1024] 525,312
ReLU-34 [-1, 1024] 0
Linear-35 [-1, 1] 1,025
================================================================
Total params: 15,241,025
Trainable params: 15,241,025
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 1.61
Forward/backward pass size (MB): 606.82
Params size (MB): 58.14
Estimated Total Size (MB): 666.57
----------------------------------------------------------------
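The summary above appears to come from the torchsummary package (its output format matches). For reference, a sketch of how to reproduce it, assuming a 375x375 input:

# Reproduce the layer summary above (assumes the torchsummary package).
from torchsummary import summary
summary(model, input_size=(3, 375, 375))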