[Caffe2] CNN Training on CPU is faster than GPU

Hi all,

while training a CNN called DPNet, I noticed that training was faster on the CPU (i7) than on the GPU (NVIDIA GTX 1050). As far as I understand, the GPU should be faster than the CPU for training CNNs.

Any ideas/reasons for this behavior?

Training configuration:
Dataset: 40,000 samples
Batch size: 64
Learning rate: 0.01
Learning rate decay: 10% every 8,000 iterations
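
For reference, I read that decay schedule as Caffe2's step LR policy, roughly like this (a sketch only; gamma=0.9 and stepsize=8000 are my interpretation of the numbers above, and model is the ModelHelper from the script shared below):

    # Hypothetical mapping of the configuration above onto optimizer.build_sgd
    optimizer.build_sgd(
        model,
        base_learning_rate=0.01,  # learning rate from the list above
        policy='step',
        stepsize=8000,            # decay every 8,000 iterations
        gamma=0.9,                # multiply the LR by 0.9, i.e. a 10% decay
        momentum=0.9,             # default used in the training script below
    )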

Hello, can you share your training script? I think it would provide more information. Did you use pin_memory=True in the DataLoader? What about the num_workers parameter?
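
For reference, in PyTorch they are passed to the DataLoader roughly like this (just a sketch; my_dataset and the values are placeholders):

    from torch.utils.data import DataLoader

    loader = DataLoader(
        my_dataset,        # placeholder for whatever Dataset you use
        batch_size=64,
        shuffle=True,
        num_workers=4,     # background worker processes that prepare batches
        pin_memory=True,   # page-locked host memory for faster host-to-GPU copies
    )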

Hello @roaffix,

are those parameters also available in Caffe2? I didn't use them in my training script.

Below is the relevant part of the script used for training:

    def get_total_num_iter(self, num_epoch, batch_size, dataset_size):
        # Force floating point calculation
        batch_size_float = float(batch_size)
        dataset_size_float = float(dataset_size)

        iterations_float = math.ceil(num_epoch * (dataset_size_float / batch_size_float))
        iterations_int = int(iterations_float)

        return iterations_int


    def add_input(self, model, batch_size, db, db_type, device_opts):
        with core.DeviceScope(device_opts):
            # load the data
            data_uint8, label = brew.db_input(
                model,
                blobs_out=["data_uint8", "label"],
                batch_size=batch_size,
                db=db,
                db_type=db_type,
            )

            # cast the data to float
            data = model.Cast(data_uint8, "data", to=core.DataType.FLOAT)

            # scale data from [0,255] down to [0,1]
            data = model.Scale(data, data, scale=float(1./256))

            # don't need the gradient for the backward pass
            data = model.StopGradient(data, data)

            dataset_size = int(lmdb.open(db).stat()['entries'])

            return data, label, dataset_size

    def create_model(self, model, data, label, device_opts, is_test):
        with core.DeviceScope(device_opts):
            conv1_ = brew.conv(model, data, 'conv1_', dim_in=3, dim_out=96, kernel=11, stride=4, pad=1)
            relu1_ = brew.relu(model, conv1_, conv1_)
            pool1_ = brew.max_pool(model, relu1_, 'pool1_', kernel=3, stride=2, pad=1)
            conv2_ = brew.conv(model, pool1_, 'conv2_', dim_in=96, dim_out=256, kernel=5, stride=4, pad=1)
            relu2_ = brew.relu(model, conv2_, conv2_)
            pool2_ = brew.max_pool(model, relu2_, 'pool2_', kernel=3, stride=2, pad=1)
            conv3_ = brew.conv(model, pool2_, 'conv3_', dim_in=256, dim_out=384, kernel=3, stride=1, pad=1)
            relu3_ = brew.relu(model, conv3_, conv3_)
            conv4_ = brew.conv(model, relu3_, 'conv4_', dim_in=384, dim_out=384, kernel=3, stride=1, pad=1)
            relu4_ = brew.relu(model, conv4_, conv4_)
            conv5_ = brew.conv(model, relu4_, 'conv5_', dim_in=384, dim_out=256, kernel=3, stride=1, pad=1)
            relu5_ = brew.relu(model, conv5_, conv5_)
            pool5_ = brew.max_pool(model, relu5_, 'pool5_', kernel=3, stride=2, pad=1)
            fc5_ = brew.fc(model, pool5_, 'fc5_', dim_in=256 * 2 * 2, dim_out=4096)
            relu6_ = brew.relu(model, fc5_, fc5_)
            # pass is_test through so dropout is disabled in the deploy net
            dropout6_ = brew.dropout(model, relu6_, 'dropout6_', ratio=0.5, is_test=is_test)
            fc6_ = brew.fc(model, dropout6_, 'fc6_', dim_in=4096, dim_out=4096)
            relu7_ = brew.relu(model, fc6_, fc6_)
            dropout7_ = brew.dropout(model, relu7_, 'dropout7_', ratio=0.5, is_test=is_test)
            fc7_ = brew.fc(model, dropout7_, 'fc7_', dim_in=4096, dim_out=256)
            relu8_ = brew.relu(model, fc7_, fc7_)
            dropout8_ = brew.dropout(model, relu8_, 'dropout8_', ratio=0.5, is_test=is_test)
            fc8_ = brew.fc(model, dropout8_, 'fc8_', dim_in=256, dim_out=14)

            # Sigmoid + L2 loss
            predictions = model.net.Sigmoid(fc8_, 'predictions')
            dist = model.net.SquaredL2Distance([label, predictions], 'dist')
            loss = dist.AveragedLoss([], ['loss'])

            return predictions, loss

			
    def add_training_operators(self, model, output, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum):
        with core.DeviceScope(device_opts):
            model.AddGradientOperators([loss])
            opt = optimizer.build_sgd(model, base_learning_rate=base_learning_rate, policy=policy, stepsize=stepsize, gamma=gamma, momentum=momentum)
            print("sgd optimizer selected")

    def add_accuracy(self, model, output, label, device_opts, eval_metric):
        with core.DeviceScope(device_opts):
            if eval_metric == 'accuracy':
                accuracy = brew.accuracy(model, [output, label], "accuracy")
            elif eval_metric == 'top_k_accuracy':
                accuracy = brew.accuracy(model, [output, label], "accuracy", top_k=3)
            return accuracy

    def train(self, num_epoch=23, batch_size=64, context='gpu', eval_metric='accuracy', opt_type='sgd', base_learning_rate=0.001, weight_decay=0.001, policy='step', stepsize=1, epsilon=1E-8, beta1=0.9, beta2=0.999, gamma=0.999, momentum=0.9):
        if context == 'cpu':
            device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
        elif context == 'gpu':
            device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)

        workspace.ResetWorkspace(self._model_dir_)

        arg_scope = {"order": "NHWC"}
        # == Training model ==
        train_model = model_helper.ModelHelper(name="train_net", arg_scope=arg_scope)
        data, label, train_dataset_size = self.add_input(train_model, batch_size=batch_size, db=os.path.join(self._data_dir_, 'torcs-train-nchw-lmdb'), db_type='lmdb', device_opts=device_opts)
        predictions, loss = self.create_model(train_model, data, label, device_opts=device_opts, is_test=False)
        self.add_training_operators(train_model, predictions, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum)
        with core.DeviceScope(device_opts):
            brew.add_weight_decay(train_model, weight_decay)

        # Initialize and create the training network
        workspace.RunNetOnce(train_model.param_init_net)
        workspace.CreateNet(train_model.net, overwrite=True)

        # Main training loop
        iterations = self.get_total_num_iter(num_epoch, batch_size, train_dataset_size)
        print("** Starting Training for " + str(num_epoch) + " epochs = " + str(iterations) + " iterations **")
        start_date = datetime.datetime.now()
        for i in range(iterations):
            workspace.RunNet(train_model.net)

            if i % 50 == 0:
                print('Iter ' + str(i) + ': ' + 'Loss ' + str(workspace.FetchBlob("loss")))
                print(str(i) + "/" + str(iterations))
                current_time = datetime.datetime.now()
                elapsed_time = current_time - start_date
                print("\t Current time spent: " + str(elapsed_time))

        print(str(iterations) + "/" + str(iterations) + " Training done")
        current_time = datetime.datetime.now()
        elapsed_time = current_time - start_date
        print("\t Total time spent: " + str(elapsed_time))

        deploy_model = model_helper.ModelHelper(name="deploy_net", arg_scope=arg_scope, init_params=False)
        self.create_model(deploy_model, "data", label, device_opts, is_test=True)

        self.save_net(self.INIT_NET, self.PREDICT_NET, deploy_model)

I get the following messages when I train on the GPU. Are they supposed to appear? Is there a cuDNN version that has implementations for those operators?

WARNING: Logging before InitGoogleLogging() is written to STDERR
I1209 20:24:53.741214 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741343 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741375 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741422 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741497 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741523 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741605 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741652 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741660 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741727 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741755 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741835 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741845 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741935 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741966 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742022 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.742033 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742090 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.742100 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742224 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742260 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742282 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742296 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742318 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742332 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742353 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742390 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742403 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742449 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742461 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742498 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742508 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742579 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742638 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742650 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
I1209 20:24:53.742686 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742720 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742732 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.

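For what it is worth, this is how I would check from Python whether the build sees the GPU at all (just a sketch; has_gpu_support and NumCudaDevices are the only workspace helpers I know of for this):

    from caffe2.python import workspace

    # True only if Caffe2 was built with GPU support
    print(workspace.has_gpu_support)
    # Number of CUDA devices visible to Caffe2
    print(workspace.NumCudaDevices())
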
Thanks for the help.

I'm not sure whether these parameters are available in Caffe2, but in PyTorch they can slow down GPU training if set poorly.
What versions of CUDA and cuDNN do you use?
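
If you want to see where the time goes, Caffe2 can also report per-operator timings once the net has been created; something like this against the train_model from your script (a sketch only):

    # Runs the already-created net and prints per-operator timing statistics
    workspace.BenchmarkNet(train_model.net.Proto().name, 1, 100, True)

Comparing that output between context='cpu' and context='gpu' should show which operators are slow on the GPU.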

This is my setup:

  • PyTorch Version (e.g., 1.0): Caffe2 tag v0.4.0
  • OS (e.g., Linux): Ubuntu 16.04
  • How you installed PyTorch (conda, pip, source): Build from source (tag v0.4.0)
  • Build command you used (if compiling from source):
  • Python version: Python 2.7
  • CUDA/cuDNN version: 8.0/7.0.5
  • GPU models and configuration: GTX 1050