What is the difference between these TensorFlow and PyTorch classification networks? PyTorch network does not work

I am new to PyTorch and am trying to implement a simple binary (1/0) image classification network in it. I already get good results (~90 %) with a TensorFlow/TFLearn implementation and am trying to reproduce it in PyTorch. However, in PyTorch I get high training accuracy but very low test accuracy. The two implementations are shown below.

What could be the reason for the difference? Is it because of a difference in the two network implementations or training procedure, if so, what is it?

TensorFlow/Tflearn implementation that works well:

# ConvNet on stacks of grayscale images
def apply_3d_cnn_single_view(train_data, train_labels, test_data, w=100, h=100, NS=6):
  """Train the TFLearn ConvNet on image stacks and predict labels for test_data.

  The network is three conv-conv-maxpool stages (32, 64, 32 filters) followed
  by a 32-unit ReLU layer and a 2-way softmax. Every conv and fully connected
  layer is built with regularizer="L2", and random Gaussian blur
  (sigma_max=1.5) is attached to the input layer as data augmentation.

  Args:
    train_data: training images, fed to an input layer of shape
      [None, h, w, NS] (channels last).
    train_labels: class ids; converted here to 2-class one-hot vectors.
    test_data: images to classify after training.
    w: image width used for the input layer shape.
    h: image height used for the input layer shape.
    NS: number of slices (channels) per image stack.

  Returns:
    The predicted labels for test_data, mapped through
    category_to_class_labels().
  """
  train_labels = to_categorical(train_labels, nb_classes=2)

  da = tflearn.data_augmentation.ImageAugmentation()
  da.add_random_blur(sigma_max=1.5)

  net = tflearn.input_data(shape=[None, h, w, NS], data_augmentation=da)

  net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.max_pool_2d(net, 3, 2)  # 50x50

  # NOTE(review): the sizes below are the author's estimates; with TFLearn's
  # default padding they are presumably 17x17 and 6x6 -- TODO confirm.
  net = tflearn.conv_2d(net, 64, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.conv_2d(net, 64, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.max_pool_2d(net, 3, 3)  # 16x16

  net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.conv_2d(net, 32, (3, 3), activation='relu', regularizer="L2")
  net = tflearn.max_pool_2d(net, 3, 3)  # 5x5

  net = tflearn.fully_connected(net, 32, activation='relu', regularizer="L2")
  net = tflearn.fully_connected(net, 2, activation='softmax', regularizer="L2")
  net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                           loss='categorical_crossentropy')

  model = tflearn.DNN(net)
  model.fit(train_data, train_labels, n_epoch=100, show_metric=True, batch_size=10)

  pre_labels = model.predict_label(test_data)
  # Call-style print with %-formatting emits the same text on Python 2 and 3.
  print('pre_labels: %s' % (pre_labels,))

  # Drop the graph so a later call builds its network from scratch.
  tf.reset_default_graph()

  return category_to_class_labels(pre_labels)

PyTorch implementation, which gives low/zero test accuracy:

def random_blur_img(image, sigma_max=1.5):
  """Randomly Gaussian-blur an image; used as a data-augmentation transform.

  A fair coin flip decides whether to blur at all. When blurring, sigma is
  drawn uniformly from [0, sigma_max].

  Args:
    image: array-like image accepted by scipy.ndimage.gaussian_filter.
    sigma_max: upper bound for the randomly drawn blur sigma.

  Returns:
    The input object itself when no blur is applied, otherwise a new blurred
    array of the same shape.
  """
  if not random.getrandbits(1):
    return image

  sigma = random.uniform(0., sigma_max)
  # Use the public scipy.ndimage entry point; the scipy.ndimage.filters
  # namespace is deprecated.
  # NOTE: a scalar sigma is applied along every axis, so for a multi-channel
  # array the blur also mixes neighbouring channels.
  return scipy.ndimage.gaussian_filter(image, sigma)

 class DataSet(torchdata.Dataset):
   def __init__(self, data, labels, transform=None):
     self.data = data
     self.labels = labels
     self.transform = transform

def __len__(self):
 return len(self.labels)

def __getitem__(self, index):
  image = self.data[index]
  label = self.labels[index]

  if  self.transform:
    image =  self.transform(image)

  return image, label # torch cross entropy loss requires label id (0...C-1)

# CNN Model
class CNN(nn.Module):
  """Three-stage ConvNet for two-class classification of multi-channel images.

  Each stage is conv-ReLU-conv-ReLU-maxpool (32, 64, 32 filters), followed by
  a 32-unit ReLU hidden layer and a 2-unit output layer that returns raw
  logits (nn.CrossEntropyLoss applies the softmax during training).
  """

  def __init__(self, num_channels=3, net='small'):
    """Build the layers.

    Args:
      num_channels: number of input channels of the first convolution.
      net: accepted for interface compatibility; not used by this class.
    """
    super(CNN, self).__init__()

    self.layer1 = nn.Sequential(
        nn.Conv2d(num_channels, 32, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.MaxPool2d(3, 2))

    self.layer2 = nn.Sequential(
        nn.Conv2d(32, 64, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.MaxPool2d(3, 3))

    self.layer3 = nn.Sequential(
        nn.Conv2d(64, 32, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=3, padding=1, dilation=1),
        nn.ReLU(),
        nn.MaxPool2d(3, 3))

    # The 3x3/padding-1 convolutions keep the spatial size and the unpadded
    # pools shrink a 100x100 input to 49 -> 16 -> 5, hence 5*5*32 features.
    self.fc1 = nn.Linear(5*5*32, 32)
    self.fc2 = nn.Linear(32, 2)

  def forward(self, x):
    """Return the (batch, 2) class logits for a batch of images x."""
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = out.view(out.size(0), -1)
    # Without a nonlinearity here fc1 and fc2 collapse into one linear map.
    out = nn.functional.relu(self.fc1(out))
    out = self.fc2(out)
    return out

  def train_net(self, train_data, train_labels, lrate=0.001, epochs=100,
                batch_size=10, shuffle=True, weight_decay=0.0):
    """Train the network on the GPU with Adam and cross-entropy loss.

    Prints accuracy, summed loss and run time after every epoch.

    Args:
      train_data: indexable collection of training images.
      train_labels: class ids (0..C-1) aligned with train_data.
      lrate: Adam learning rate.
      epochs: number of passes over the training set.
      batch_size: mini-batch size.
      shuffle: whether the loader reshuffles the data every epoch.
      weight_decay: L2 penalty coefficient handed to Adam; the default 0.0
        leaves training unregularized, as before.
    """
    self.cuda()
    self.train()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(self.parameters(), lr=lrate,
                                 weight_decay=weight_decay)

    dataset = DataSet(train_data, train_labels, transform=random_blur_img)
    data_loader = torchdata.DataLoader(dataset, batch_size=batch_size,
                                       shuffle=shuffle, num_workers=1)

    timer = Timer()
    for epoch in range(epochs):
      timer.tic()
      correct = 0         # number of correct predictions
      epoch_loss = 0.0    # total loss for the current epoch
      for batch, labels in data_loader:
        inputs = Variable(batch.float().cuda())
        vlabels = Variable(labels.cuda())

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = self.forward(inputs)
        loss = criterion(outputs, vlabels)
        loss.backward()
        optimizer.step()

        # index of the largest logit is the predicted class
        predicted = outputs.data.max(1)[1]
        correct += predicted.cpu().eq(labels.cpu()).sum()

        # A scalar loss is 0-dim on PyTorch >= 0.4 and must be read with
        # .item(); older releases only support indexing the 1-element tensor.
        epoch_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]

      cls_accuracy = 100*(float(correct)/len(dataset))
      runtime = timer.toc()
      print('Epoch %d  accuracy : %.2f  loss : %f \t time : %f'%(epoch, cls_accuracy, epoch_loss, runtime))

  def test_net(self, test_data, test_labels, batch_size=10):
    """Predict a class id for every sample of test_data on the GPU.

    Args:
      test_data: indexable collection of test images.
      test_labels: labels aligned with test_data; only used to build the
        dataset, not to score the predictions.
      batch_size: mini-batch size used for inference.

    Returns:
      List of predicted class ids, in dataset order.
    """
    self.cuda()
    self.eval()

    dataset = DataSet(test_data, test_labels)
    data_loader = torchdata.DataLoader(dataset, batch_size=batch_size,
                                       num_workers=1)

    pre_labels = []
    for batch, _ in data_loader:
      inputs = Variable(batch.float().cuda())
      outputs = self.forward(inputs)

      # index of the largest logit is the predicted class
      predicted = outputs.data.max(1)[1]
      pre_labels += list(predicted.cpu().numpy().flatten())

    return pre_labels

def apply_3d_cnn(train_data, train_labels, test_data, test_labels, w=100, h=100, NS=6):
  """Train the PyTorch CNN, predict the test set and report class performance.

  Prints the training and test run times and passes the true and predicted
  test labels to report_class_performance().

  Args:
    train_data: training images.
    train_labels: class ids for train_data.
    test_data: test images.
    test_labels: class ids for test_data.
    w: image width; not used by this function.
    h: image height; not used by this function.
    NS: number of slices (channels) per image stack; sets the number of
      input channels of the network.
  """
  # Build the network from NS instead of a hard-coded 6 so that callers
  # passing a different stack depth get a matching first convolution.
  network = CNN(num_channels=NS)

  timer = Timer()
  timer.tic()

  network.train_net(train_data, train_labels, lrate=0.0001, epochs=100, batch_size=10, shuffle=True)

  runtime = timer.toc()
  print('Training time: %f'%(runtime))

  timer.tic()
  pre_labels = network.test_net(test_data, test_labels, batch_size=10)
  runtime = timer.toc()
  print('Test time: %f'%(runtime))

  report_class_performance(test_labels, pre_labels)

Notes:

  • It was hard to indent the code here; you can assume the indentation is correct.
  • PyTorch network converges very quickly, 100 % accuracy on training set in 30 epochs, whereas TF reaches ~97 % in 30 epochs and never reaches exactly 100 %.
  • To check whether the problem lies in the evaluation code, I ran the trained PyTorch network on the training set and got 100 % accuracy.
  • I am aware of the different input formats (WxHxC in TF, CxWxH in PyTorch) and prepare the input data accordingly.

Is there a difference between the two network implementations and training procedure that may cause the problem?

It seems the difference is the L2 regularization, which prevents overfitting in the TF implementation.
What is an easy way to add L2 regularization to a PyTorch layer? Or should I try other regularization techniques, such as dropout?

L2 regularization in PyTorch is available in all the optimizers, through the weight_decay parameter.
I modified the code accordingly.
optimizer = torch.optim.Adam(self.parameters(), lr=lrate, weight_decay=0.001)

I tried different values of weight_decay (0.0001, 0.001, 0.01). There is little improvement, but test accuracy is still far worse than TF.
The default weight_decay in tflearn is 0.001. http://tflearn.org/layers/conv/

I ran into the same problem.