What is the difference between these TensorFlow and PyTorch classification networks? The PyTorch network does not generalize

I am new to PyTorch and am trying to implement a simple binary (1/0) image classification network. I already get good results (~90%) with a TensorFlow/TFLearn implementation and am trying to reproduce it in PyTorch. However, I get high training accuracy but very low test accuracy with the PyTorch version. Both implementations are below.

What could be the reason for the difference? Is it due to a difference in the two network implementations or in the training procedure, and if so, what is it?

TensorFlow/Tflearn implementation that works well:

import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

# ConvNet on stacks of grayscale images
def apply_3d_cnn_single_view(train_data, train_labels, test_data, w=100, h=100, NS=6):
    train_labels = to_categorical(train_labels, nb_classes=2)

    da = tflearn.data_augmentation.ImageAugmentation()
    da.add_random_blur(sigma_max=1.5)

    net = tflearn.input_data(shape=[None, h, w, NS], data_augmentation=da)

    net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.max_pool_2d(net, 3, 2)  # 50x50

    net = tflearn.conv_2d(net, 64, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.conv_2d(net, 64, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.max_pool_2d(net, 3, 3)  # 16x16

    net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
    net = tflearn.max_pool_2d(net, 3, 3)  # 5x5

    net = tflearn.fully_connected(net, 32, activation='relu', regularizer="L2")
    net = tflearn.fully_connected(net, 2, activation='softmax', regularizer="L2")
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001, loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    model.fit(train_data, train_labels, n_epoch=100, show_metric=True, batch_size=10)

    pre_labels = model.predict_label(test_data)
    print('pre_labels:', pre_labels)

    tf.reset_default_graph()

    return category_to_class_labels(pre_labels)

PyTorch implementation, which gives low/zero test accuracy:

import random

import scipy.ndimage
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data as torchdata
from torch.autograd import Variable


def random_blur_img(image, sigma_max=1.5):
    # Blur the image with probability 0.5, using a random sigma
    if bool(random.getrandbits(1)):
        sigma = random.uniform(0., sigma_max)
        return scipy.ndimage.filters.gaussian_filter(image, sigma)
    return image


class DataSet(torchdata.Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        image = self.data[index]
        label = self.labels[index]

        if self.transform:
            image = self.transform(image)

        return image, label  # torch cross entropy loss requires label ids (0...C-1)

# CNN Model
class CNN(nn.Module):
    def __init__(self, num_channels=3, net='small'):
        super(CNN, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(num_channels, 32, kernel_size=3, padding=1, dilation=1),
            #nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, dilation=1),
            #nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(3, 2))

        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1, dilation=1),
            #nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, dilation=1),
            #nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(3, 3))

        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=3, padding=1, dilation=1),
            nn.ReLU(),
            #nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, dilation=1),
            #nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(3, 3))

        self.fc1 = nn.Linear(5 * 5 * 32, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)

        return out

    def train_net(self, train_data, train_labels, lrate=0.001, epochs=100, batch_size=10, shuffle=True):
        self.cuda()
        self.train()

        # Loss and Optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=lrate)

        dataset = DataSet(train_data, train_labels, transform=random_blur_img)
        data_loader = torchdata.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=1)

        timer = Timer()
        for epoch in range(epochs):
            timer.tic()
            correct = 0         # number of correct predictions
            epoch_loss = 0.0    # total loss for the current epoch
            for i, data in enumerate(data_loader):
                batch, labels = data

                inputs = Variable(batch.float().cuda())
                vlabels = Variable(labels.cuda())

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.forward(inputs)
                loss = criterion(outputs, vlabels)
                loss.backward()
                optimizer.step()

                predicted = outputs.data.max(1)[1]  # get the index of the max log-probability
                correct += predicted.cpu().eq(labels.cpu()).sum()

                epoch_loss += loss.data[0]

            cls_accuracy = 100 * (float(correct) / len(dataset))
            runtime = timer.toc()
            print('Epoch %d  accuracy : %.2f  loss : %f \t time : %f' % (epoch, cls_accuracy, epoch_loss, runtime))


    def test_net(self, test_data, test_labels, batch_size=10):
        self.cuda()
        self.eval()

        dataset = DataSet(test_data, test_labels)
        data_loader = torchdata.DataLoader(dataset, batch_size=batch_size, num_workers=1)

        pre_labels = []
        for i, data in enumerate(data_loader):
            batch, labels = data

            inputs = Variable(batch.float().cuda())
            outputs = self.forward(inputs)

            predicted = outputs.data.max(1)[1]  # get the index of the max log-probability
            pre_labels += list(predicted.cpu().numpy().flatten())

        return pre_labels

def apply_3d_cnn(train_data, train_labels, test_data, test_labels, w=100, h=100, NS=6):

  network = CNN(num_channels=6)

  timer = Timer()
  timer.tic()

  network.train_net(train_data, train_labels, lrate=0.0001, epochs=100, batch_size=10, shuffle=True)

  runtime = timer.toc()
  print('Training time: %f'%(runtime))

  timer.tic()
  pre_labels = network.test_net(test_data, test_labels, batch_size=10)
  runtime = timer.toc()
  print('Test time: %f'%(runtime))

  report_class_performance(test_labels, pre_labels)

Notes:

  • It was hard to indent the code here; you can assume the indentation is correct.
  • The PyTorch network converges very quickly: 100% accuracy on the training set within 30 epochs, whereas the TF network reaches ~97% in 30 epochs and never hits exactly 100%.
  • Running the trained PyTorch network on the training set gives 100%, so the evaluation code itself does not seem to be the problem.
  • I am aware of the different input layouts (H×W×C in TF, C×H×W in PyTorch) and prepare the input data accordingly (see the sketch right after this list).
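
Roughly, the layout conversion mentioned above looks like this (a simplified sketch; the helper name is made up for illustration):

import numpy as np

# Sketch: convert a batch from the TF layout (N, H, W, C)
# to the PyTorch layout (N, C, H, W).
def to_nchw(batch_nhwc):
    return np.ascontiguousarray(np.transpose(batch_nhwc, (0, 3, 1, 2)))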

Is there a difference between the two network implementations or training procedures that could cause this problem?


It seems the difference is the L2 regularization, which prevents overfitting in the TF implementation.
What is an easy way to add L2 regularization to a PyTorch layer? Or should I try other regularization techniques, like dropout?

L2 regularization in PyTorch is available in all the optimizers, through the weight_decay parameter.
I modified the code accordingly.
optimizer = torch.optim.Adam(self.parameters(), lr=lrate, weight_decay=0.001)
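
Another option would be to add the penalty to the loss explicitly, which also makes it easy to apply it only to the weight tensors (skipping the biases), similar to tflearn's per-layer regularizer. A rough, untested sketch (l2_lambda is an assumed value; it would replace weight_decay rather than be combined with it):

def l2_penalty(model):
    # Sum of squared 'weight' parameters (biases excluded)
    return sum((p ** 2).sum() for name, p in model.named_parameters()
               if 'weight' in name)

# inside the training loop of train_net above:
# loss = criterion(outputs, vlabels) + l2_lambda * l2_penalty(self)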

I tried different values of weight_decay (0.0001, 0.001, 0.01). There is slight improvement, but the test accuracy is still far worse than with TF.
For reference, the default weight_decay in tflearn is 0.001: http://tflearn.org/layers/conv/
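
Dropout, mentioned above as another regularization option, would be another thing to try. A minimal sketch of a dropout-regularized classifier head for the CNN above (p=0.5 is only an assumed value):

import torch.nn as nn

# Sketch: a replacement for the plain fc1/fc2 layers of the CNN above,
# adding dropout and a ReLU on the hidden layer (the tflearn
# fully_connected(net, 32) layer also uses a ReLU activation).
classifier = nn.Sequential(
    nn.Dropout(p=0.5),
    nn.Linear(5 * 5 * 32, 32),
    nn.ReLU(),
    nn.Linear(32, 2),
)

In forward(), the flattened tensor would then be passed through this block instead of fc1 and fc2.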

I am running into the same problem.