[Caffe2] CNN Training on CPU is faster than GPU

Hi all,

while training a CNN called DPNet, I noticed that training was faster on the CPU (i7) than on the GPU (NVIDIA GTX 1050). As far as I understand, the GPU should be faster than the CPU for training CNNs.

Any ideas/reasons for this behavior?

Training configuration:
Dataset: 40,000 samples
Batch size: 64
Learning rate: 0.01
Learning rate decay: 10% every 8,000 iterations
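
For reference, I read that decay schedule as Caffe2's step LR policy, roughly like this (a sketch only; gamma=0.9 and stepsize=8000 are my interpretation of the numbers above, and model is the ModelHelper from the script shared below):

    # Hypothetical mapping of the configuration above onto optimizer.build_sgd
    optimizer.build_sgd(
        model,
        base_learning_rate=0.01,  # learning rate from the list above
        policy='step',
        stepsize=8000,            # decay every 8,000 iterations
        gamma=0.9,                # multiply the LR by 0.9, i.e. a 10% decay
        momentum=0.9,             # default used in the training script below
    )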

Hello, can you share your training script? I think it would provide more information. Did you use pin_memory=True in the DataLoader? What about the num_workers parameter?
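
For reference, in PyTorch they are passed to the DataLoader roughly like this (just a sketch; my_dataset and the values are placeholders):

    from torch.utils.data import DataLoader

    loader = DataLoader(
        my_dataset,        # placeholder for whatever Dataset you use
        batch_size=64,
        shuffle=True,
        num_workers=4,     # background worker processes that prepare batches
        pin_memory=True,   # page-locked host memory for faster host-to-GPU copies
    )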

Hello @roaffix,

are those parameters also available in Caffe2? I didn't use them in my training script.

Below is the relevant part of the script used for training:

    def get_total_num_iter(self, num_epoch, batch_size, dataset_size):
        # Force floating point calculation
        batch_size_float = float(batch_size)
        dataset_size_float = float(dataset_size)

        iterations_float = math.ceil(num_epoch * (dataset_size_float / batch_size_float))
        iterations_int = int(iterations_float)

        return iterations_int


    def add_input(self, model, batch_size, db, db_type, device_opts):
        with core.DeviceScope(device_opts):
            # load the data
            data_uint8, label = brew.db_input(
                model,
                blobs_out=["data_uint8", "label"],
                batch_size=batch_size,
                db=db,
                db_type=db_type,
            )

            # cast the data to float
            data = model.Cast(data_uint8, "data", to=core.DataType.FLOAT)

            # scale data from [0,255] down to [0,1]
            data = model.Scale(data, data, scale=float(1./256))

            # don't need the gradient for the backward pass
            data = model.StopGradient(data, data)

            dataset_size = int(lmdb.open(db).stat()['entries'])

            return data, label, dataset_size

    def create_model(self, model, data, label, device_opts, is_test):
        with core.DeviceScope(device_opts):
            conv1_ = brew.conv(model, data, 'conv1_', dim_in=3, dim_out=96, kernel=11, stride=4, pad=1)
            relu1_ = brew.relu(model, conv1_, conv1_)
            pool1_ = brew.max_pool(model, relu1_, 'pool1_', kernel=3, stride=2, pad=1)
            conv2_ = brew.conv(model, pool1_, 'conv2_', dim_in=96, dim_out=256, kernel=5, stride=4, pad=1)
            relu2_ = brew.relu(model, conv2_, conv2_)
            pool2_ = brew.max_pool(model, relu2_, 'pool2_', kernel=3, stride=2, pad=1)
            conv3_ = brew.conv(model, pool2_, 'conv3_', dim_in=256, dim_out=384, kernel=3, stride=1, pad=1)
            relu3_ = brew.relu(model, conv3_, conv3_)
            conv4_ = brew.conv(model, relu3_, 'conv4_', dim_in=384, dim_out=384, kernel=3, stride=1, pad=1)
            relu4_ = brew.relu(model, conv4_, conv4_)
            conv5_ = brew.conv(model, relu4_, 'conv5_', dim_in=384, dim_out=256, kernel=3, stride=1, pad=1)
            relu5_ = brew.relu(model, conv5_, conv5_)
            pool5_ = brew.max_pool(model, relu5_, 'pool5_', kernel=3, stride=2, pad=1)
            fc5_ = brew.fc(model, pool5_, 'fc5_', dim_in=256 * 2 * 2, dim_out=4096)
            relu6_ = brew.relu(model, fc5_, fc5_)
            # pass is_test through so dropout is disabled in the deploy net
            dropout6_ = brew.dropout(model, relu6_, 'dropout6_', ratio=0.5, is_test=is_test)
            fc6_ = brew.fc(model, dropout6_, 'fc6_', dim_in=4096, dim_out=4096)
            relu7_ = brew.relu(model, fc6_, fc6_)
            dropout7_ = brew.dropout(model, relu7_, 'dropout7_', ratio=0.5, is_test=is_test)
            fc7_ = brew.fc(model, dropout7_, 'fc7_', dim_in=4096, dim_out=256)
            relu8_ = brew.relu(model, fc7_, fc7_)
            dropout8_ = brew.dropout(model, relu8_, 'dropout8_', ratio=0.5, is_test=is_test)
            fc8_ = brew.fc(model, dropout8_, 'fc8_', dim_in=256, dim_out=14)

            # Sigmoid + L2 loss
            predictions = model.net.Sigmoid(fc8_, 'predictions')
            dist = model.net.SquaredL2Distance([label, predictions], 'dist')
            loss = dist.AveragedLoss([], ['loss'])

            return predictions, loss

			
    def add_training_operators(self, model, output, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum):
        with core.DeviceScope(device_opts):
            model.AddGradientOperators([loss])
            opt = optimizer.build_sgd(model, base_learning_rate=base_learning_rate, policy=policy, stepsize=stepsize, gamma=gamma, momentum=momentum)
            print("sgd optimizer selected")

    def add_accuracy(self, model, output, label, device_opts, eval_metric):
        with core.DeviceScope(device_opts):
            if eval_metric == 'accuracy':
                accuracy = brew.accuracy(model, [output, label], "accuracy")
            elif eval_metric == 'top_k_accuracy':
                accuracy = brew.accuracy(model, [output, label], "accuracy", top_k=3)
            return accuracy

    def train(self, num_epoch=23, batch_size=64, context='gpu', eval_metric='accuracy', opt_type='sgd', base_learning_rate=0.001, weight_decay=0.001, policy='step', stepsize=1, epsilon=1E-8, beta1=0.9, beta2=0.999, gamma=0.999, momentum=0.9):
        if context == 'cpu':
            device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
        elif context == 'gpu':
            device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)

        workspace.ResetWorkspace(self._model_dir_)

        arg_scope = {"order": "NHWC"}
        # == Training model ==
        train_model = model_helper.ModelHelper(name="train_net", arg_scope=arg_scope)
        data, label, train_dataset_size = self.add_input(train_model, batch_size=batch_size, db=os.path.join(self._data_dir_, 'torcs-train-nchw-lmdb'), db_type='lmdb', device_opts=device_opts)
        predictions, loss = self.create_model(train_model, data, label, device_opts=device_opts, is_test=False)
        self.add_training_operators(train_model, predictions, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum)
        with core.DeviceScope(device_opts):
            brew.add_weight_decay(train_model, weight_decay)

        # Initialize and create the training network
        workspace.RunNetOnce(train_model.param_init_net)
        workspace.CreateNet(train_model.net, overwrite=True)

        # Main training loop
        iterations = self.get_total_num_iter(num_epoch, batch_size, train_dataset_size)
        print("** Starting Training for " + str(num_epoch) + " epochs = " + str(iterations) + " iterations **")
        start_date = datetime.datetime.now()
        for i in range(iterations):
            workspace.RunNet(train_model.net)

            if i % 50 == 0:
                print('Iter ' + str(i) + ': ' + 'Loss ' + str(workspace.FetchBlob("loss")))
                print(str(i) + "/" + str(iterations))
                current_time = datetime.datetime.now()
                elapsed_time = current_time - start_date
                print("\t Current time spent: " + str(elapsed_time))

        print(str(iterations) + "/" + str(iterations) + " Training done")
        current_time = datetime.datetime.now()
        elapsed_time = current_time - start_date
        print("\t Total time spent: " + str(elapsed_time))

        deploy_model = model_helper.ModelHelper(name="deploy_net", arg_scope=arg_scope, init_params=False)
        self.create_model(deploy_model, "data", label, device_opts, is_test=True)

        self.save_net(self.INIT_NET, self.PREDICT_NET, deploy_model)

I get the following messages when I train on the GPU. Are they supposed to appear? Is there a cuDNN version that has implementations for those operators?

WARNING: Logging before InitGoogleLogging() is written to STDERR
I1209 20:24:53.741214 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741343 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741375 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741422 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741497 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741523 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741605 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741652 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741660 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741727 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741755 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741835 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741845 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741935 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741966 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742022 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.742033 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742090 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.742100 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742224 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742260 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742282 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742296 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742318 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742332 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742353 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742390 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742403 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742449 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742461 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742498 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742508 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742579 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742638 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742650 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742686 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742720 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742732 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.

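For what it is worth, this is how I would check from Python whether the build sees the GPU at all (just a sketch; has_gpu_support and NumCudaDevices are the only workspace helpers I know of for this):

    from caffe2.python import workspace

    # True only if Caffe2 was built with GPU support
    print(workspace.has_gpu_support)
    # Number of CUDA devices visible to Caffe2
    print(workspace.NumCudaDevices())
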
Thanks for the help.

I'm not sure whether these parameters are available in Caffe2, but in PyTorch they can slow down GPU training if set poorly.
What versions of CUDA and cuDNN do you use?
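
If you want to see where the time goes, Caffe2 can also report per-operator timings once the net has been created; something like this against the train_model from your script (a sketch only):

    # Runs the already-created net and prints per-operator timing statistics
    workspace.BenchmarkNet(train_model.net.Proto().name, 1, 100, True)

Comparing that output between context='cpu' and context='gpu' should show which operators are slow on the GPU.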

This is my setup:

  • PyTorch Version (e.g., 1.0): Caffe2 tag v0.4.0
  • OS (e.g., Linux): Ubuntu 16.04
  • How you installed PyTorch (conda, pip, source): Build from source (tag v0.4.0)
  • Build command you used (if compiling from source):
  • Python version: Python 2.7
  • CUDA/cuDNN version: 8.0/7.0.5
  • GPU models and configuration: GTX 1050