Hello @roaffix,
Are those parameters also available in Caffe2? I didn't use them in my training script.
Below is the relevant part of the script used for training:
import datetime
import lmdb
import math
import os
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, model_helper, optimizer, workspace

def get_total_num_iter(self, num_epoch, batch_size, dataset_size):
    # Force floating-point division so a partial final batch still counts as an iteration
    batch_size_float = float(batch_size)
    dataset_size_float = float(dataset_size)
    iterations_float = math.ceil(num_epoch * (dataset_size_float / batch_size_float))
    iterations_int = int(iterations_float)
    return iterations_int
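# For example, with the defaults used in train() below (num_epoch=23,
# batch_size=64) and a hypothetical dataset of 10,000 samples:
#   ceil(23 * 10000 / 64.0) = ceil(3593.75) = 3594 iterations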
def add_input(self, model, batch_size, db, db_type, device_opts):
    with core.DeviceScope(device_opts):
        # load the data
        data_uint8, label = brew.db_input(
            model,
            blobs_out=["data_uint8", "label"],
            batch_size=batch_size,
            db=db,
            db_type=db_type,
        )
        # cast the data to float
        data = model.Cast(data_uint8, "data", to=core.DataType.FLOAT)
        # scale data from [0, 255] down to [0, 1]
        data = model.Scale(data, data, scale=float(1. / 256))
        # the input data doesn't need a gradient in the backward pass
        data = model.StopGradient(data, data)
        dataset_size = int(lmdb.open(db).stat()['entries'])
        return data, label, dataset_size
def create_model(self, model, data, label, device_opts, is_test):
    with core.DeviceScope(device_opts):
        conv1_ = brew.conv(model, data, 'conv1_', dim_in=3, dim_out=96, kernel=11, stride=4, pad=1)
        relu1_ = brew.relu(model, conv1_, conv1_)
        pool1_ = brew.max_pool(model, relu1_, 'pool1_', kernel=3, stride=2, pad=1)
        conv2_ = brew.conv(model, pool1_, 'conv2_', dim_in=96, dim_out=256, kernel=5, stride=4, pad=1)
        relu2_ = brew.relu(model, conv2_, conv2_)
        pool2_ = brew.max_pool(model, relu2_, 'pool2_', kernel=3, stride=2, pad=1)
        conv3_ = brew.conv(model, pool2_, 'conv3_', dim_in=256, dim_out=384, kernel=3, stride=1, pad=1)
        relu3_ = brew.relu(model, conv3_, conv3_)
        conv4_ = brew.conv(model, relu3_, 'conv4_', dim_in=384, dim_out=384, kernel=3, stride=1, pad=1)
        relu4_ = brew.relu(model, conv4_, conv4_)
        conv5_ = brew.conv(model, relu4_, 'conv5_', dim_in=384, dim_out=256, kernel=3, stride=1, pad=1)
        relu5_ = brew.relu(model, conv5_, conv5_)
        pool5_ = brew.max_pool(model, relu5_, 'pool5_', kernel=3, stride=2, pad=1)
        fc5_ = brew.fc(model, pool5_, 'fc5_', dim_in=256 * 2 * 2, dim_out=4096)
        relu6_ = brew.relu(model, fc5_, fc5_)
        # pass is_test through so dropout is disabled in the deploy net
        dropout6_ = brew.dropout(model, relu6_, 'dropout6_', ratio=0.5, is_test=is_test)
        fc6_ = brew.fc(model, dropout6_, 'fc6_', dim_in=4096, dim_out=4096)
        relu7_ = brew.relu(model, fc6_, fc6_)
        dropout7_ = brew.dropout(model, relu7_, 'dropout7_', ratio=0.5, is_test=is_test)
        fc7_ = brew.fc(model, dropout7_, 'fc7_', dim_in=4096, dim_out=256)
        relu8_ = brew.relu(model, fc7_, fc7_)
        dropout8_ = brew.dropout(model, relu8_, 'dropout8_', ratio=0.5, is_test=is_test)
        fc8_ = brew.fc(model, dropout8_, 'fc8_', dim_in=256, dim_out=14)
        # Sigmoid + L2 loss
        predictions = model.net.Sigmoid(fc8_, 'predictions')
        dist = model.net.SquaredL2Distance([label, predictions], 'dist')
        loss = dist.AveragedLoss([], ['loss'])
        return predictions, loss
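# As far as I can tell from the Caffe2 operator docs, SquaredL2Distance yields
# 0.5 * sum((label - predictions)^2) per sample, and AveragedLoss then averages
# that scalar over the batch; in NumPy terms (a sketch, not part of the script):
#   loss = np.mean(0.5 * np.sum((label - predictions) ** 2, axis=1))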
def add_training_operators(self, model, output, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum):
    with core.DeviceScope(device_opts):
        model.AddGradientOperators([loss])
        opt = optimizer.build_sgd(model, base_learning_rate=base_learning_rate, policy=policy, stepsize=stepsize, gamma=gamma, momentum=momentum)
        print("sgd optimizer selected")
def add_accuracy(self, model, output, label, device_opts, eval_metric):
    with core.DeviceScope(device_opts):
        if eval_metric == 'accuracy':
            accuracy = brew.accuracy(model, [output, label], "accuracy")
        elif eval_metric == 'top_k_accuracy':
            accuracy = brew.accuracy(model, [output, label], "accuracy", top_k=3)
        return accuracy
def train(self, num_epoch=23, batch_size=64, context='gpu', eval_metric='accuracy', opt_type='sgd', base_learning_rate=0.001, weight_decay=0.001, policy='step', stepsize=1, epsilon=1E-8, beta1=0.9, beta2=0.999, gamma=0.999, momentum=0.9):
    if context == 'cpu':
        device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
    elif context == 'gpu':
        device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)
    workspace.ResetWorkspace(self._model_dir_)
    arg_scope = {"order": "NHWC"}
    # == Training model ==
    train_model = model_helper.ModelHelper(name="train_net", arg_scope=arg_scope)
    data, label, train_dataset_size = self.add_input(train_model, batch_size=batch_size, db=os.path.join(self._data_dir_, 'torcs-train-nchw-lmdb'), db_type='lmdb', device_opts=device_opts)
    predictions, loss = self.create_model(train_model, data, label, device_opts=device_opts, is_test=False)
    self.add_training_operators(train_model, predictions, label, loss, device_opts, opt_type, base_learning_rate, policy, stepsize, epsilon, beta1, beta2, gamma, momentum)
    with core.DeviceScope(device_opts):
        brew.add_weight_decay(train_model, weight_decay)
    # Initialize and create the training network
    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net, overwrite=True)
    # Main training loop
    iterations = self.get_total_num_iter(num_epoch, batch_size, train_dataset_size)
    print("** Starting training for " + str(num_epoch) + " epochs = " + str(iterations) + " iterations **")
    start_date = datetime.datetime.now()
    for i in range(iterations):
        workspace.RunNet(train_model.net)
        if i % 50 == 0:
            print('Iter ' + str(i) + ': Loss ' + str(workspace.FetchBlob("loss")))
            print(str(i) + "/" + str(iterations))
            current_time = datetime.datetime.now()
            elapsed_time = current_time - start_date
            print("\t Current time spent: " + str(elapsed_time))
    print(str(iterations) + "/" + str(iterations) + " Training done")
    current_time = datetime.datetime.now()
    elapsed_time = current_time - start_date
    print("\t Total time spent: " + str(elapsed_time))
    # == Deploy model ==
    deploy_model = model_helper.ModelHelper(name="deploy_net", arg_scope=arg_scope, init_params=False)
    self.create_model(deploy_model, "data", label, device_opts, is_test=True)
    self.save_net(self.INIT_NET, self.PREDICT_NET, deploy_model)
I get the following messages when I train on the GPU. Are they supposed to appear? Is there any cuDNN version that has implementations for those operators?
WARNING: Logging before InitGoogleLogging() is written to STDERR
I1209 20:24:53.741214 16045 operator.cc:167] Engine CUDNN is not available for operator Conv.
I1209 20:24:53.741343 16045 operator.cc:167] Engine CUDNN is not available for operator Relu.
I1209 20:24:53.741375 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPool.
I1209 20:24:53.741966 16045 operator.cc:167] Engine CUDNN is not available for operator Dropout.
I1209 20:24:53.742224 16045 operator.cc:167] Engine CUDNN is not available for operator DropoutGrad.
I1209 20:24:53.742260 16045 operator.cc:167] Engine CUDNN is not available for operator ReluGradient.
I1209 20:24:53.742353 16045 operator.cc:167] Engine CUDNN is not available for operator MaxPoolGradient.
I1209 20:24:53.742403 16045 operator.cc:167] Engine CUDNN is not available for operator ConvGradient.
(the same message repeats for every Conv, Relu, MaxPool and Dropout operator in the net, and for their gradient operators)
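From reading the brew/model_helper sources, my understanding (a sketch, not verified on my setup) is that the cuDNN engine is requested through the arg_scope, and that these messages show up when the Caffe2 build itself was compiled without cuDNN:

# Sketch: explicitly request cuDNN engines for the brew operators. This only
# takes effect if the Caffe2 build was compiled with cuDNN support.
arg_scope = {
    "order": "NHWC",
    "use_cudnn": True,                # ask the brew helpers for engine='CUDNN'
    "cudnn_exhaustive_search": True,  # optional: benchmark conv algorithms
}
train_model = model_helper.ModelHelper(name="train_net", arg_scope=arg_scope)

If the build lacks cuDNN, I assume the messages would appear regardless and the operators would fall back to the plain CUDA engines.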
Thanks for the help!