I am running the following code on 8 GPUs, but GPU utilization stays very low, around 10%. Can someone help me understand what is wrong with the code?
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable
from torchvision import datasets, transforms


def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out
class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)
        self.relu = nn.ReLU()
        # Flat list of all 16 residual blocks (3 + 4 + 6 + 3), iterated in forward.
        layer_list = [self.layer1[0], self.layer1[1], self.layer1[2],
                      self.layer2[0], self.layer2[1], self.layer2[2], self.layer2[3],
                      self.layer3[0], self.layer3[1], self.layer3[2], self.layer3[3],
                      self.layer3[4], self.layer3[5],
                      self.layer4[0], self.layer4[1], self.layer4[2]]
        self.layer_list = nn.ModuleList(layer_list)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)
    def forward(self, x, new_weights):
        dtype = torch.cuda.FloatTensor
        n_cells = 2
        n_samples = int(x.shape[0])
        batch_size = n_samples
        rnn = nn.RNNCell(10, n_cells).cuda()
        linear = nn.Linear(n_cells, 1).cuda()
        sigmoid = nn.Sigmoid().cuda()
        # Offsets of the controller tensors inside the flat new_weights vector.
        v1 = 10 * n_cells        # rnn.weight_ih  (n_cells x 10)
        v2 = v1 + n_cells ** 2   # rnn.weight_hh  (n_cells x n_cells)
        v3 = v2 + n_cells        # rnn.bias_ih
        v4 = v3 + n_cells        # rnn.bias_hh
        v5 = v4 + 3 * 3          # 3x3 conv filter
        v6 = v5 + n_cells        # linear.weight
        v7 = v6 + 1              # linear.bias
        new_weights = new_weights.data
        value = new_weights[:v1].cuda()
        rnn.weight_ih = torch.nn.Parameter(value.view(n_cells, 10))
        value = new_weights[v1:v2].cuda()
        rnn.weight_hh = torch.nn.Parameter(value.view(n_cells, n_cells))
        value = new_weights[v2:v3].cuda()
        rnn.bias_ih = torch.nn.Parameter(value.view(n_cells))
        value = new_weights[v3:v4].cuda()
        rnn.bias_hh = torch.nn.Parameter(value.view(n_cells))
        value = new_weights[v4:v5].cuda()
        conv = value.view(1, 1, 3, 3)  # used directly as the F.conv2d weight below
        value = new_weights[v5:v6].cuda()
        linear.weight = torch.nn.Parameter(value.view(1, n_cells))
        value = new_weights[v6:v7].cuda()
        linear.bias = torch.nn.Parameter(value.view(1))
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        hidden = Variable(torch.zeros(n_samples, n_cells).type(dtype))
        for layer in self.layer_list:
            path_1 = layer(out)
            path_2 = self.relu(layer.shortcut(out))
            ht, wd = out.shape[2], out.shape[3]
            n_samples = out.shape[0]
            batch_size = out.shape[0]
            # Summarize the current feature map into a 10-dim vector for the RNN.
            input_to_rnn = F.conv2d(out.sum(1).view(n_samples, 1, ht, wd), conv)
            input_to_rnn = self.relu(input_to_rnn)
            if (input_to_rnn.shape[2] >= 4) and (input_to_rnn.shape[3] >= 4):
                input_to_rnn = F.avg_pool2d(input_to_rnn, 4)
            input_to_rnn = input_to_rnn.view(n_samples, 1, -1)
            if input_to_rnn.shape[2] < 10:
                # Zero-pad so the RNN input is always at least 10-dimensional.
                repl = Variable(torch.zeros(batch_size, 1, 10 - int(input_to_rnn.shape[2])).cuda())
                input_to_rnn = torch.cat((input_to_rnn, repl), 2)
            one_d = math.floor(float(input_to_rnn.shape[2]) / 10)
            input_to_rnn = F.avg_pool1d(input_to_rnn, one_d)
            input_to_rnn = input_to_rnn[:, :, :10].contiguous()
            input_to_rnn = input_to_rnn.view(n_samples, 10)
            hidden = rnn(input_to_rnn, hidden)
            logistic = linear(hidden)
            prob = sigmoid(logistic)
            shape = path_1.shape
            num_features = int(shape[1] * shape[2] * shape[3])
            # Per-sample Bernoulli gate: 1 keeps the block output, 0 takes the shortcut.
            mask = torch.bernoulli(prob)
            s = 1 - torch.mean(mask)  # fraction routed through the shortcut (unused here)
            mask = mask.repeat(1, num_features)
            mask = mask.view(batch_size, int(shape[1]), int(shape[2]), int(shape[3]))
            unmask = 1 - mask
            out = mask * path_1 + self.relu(unmask * path_2)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out
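# Note: every call to ResNet.forward above rebuilds rnn, linear, and sigmoid on
# the GPU and re-wraps all of their weights as new Parameters; under
# nn.DataParallel this host-side setup is repeated by all 8 replicas on every
# batch before any GPU kernels are launched.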
def main():
    device_ids = [0, 1, 2, 3, 4, 5, 6, 7]
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
    folder_train = 'folder_train'
    folder_test = 'folder_train'
    trainset = datasets.CIFAR10(folder_train, train=True, download=True, transform=transform_train)
    bsz = 256
    train_set = torch.utils.data.DataLoader(
        trainset, batch_size=bsz, shuffle=True, num_workers=2, pin_memory=True)
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
    testset = torchvision.datasets.CIFAR10(folder_test, train=False, download=True,
                                           transform=transform_test)
    bsz_test = 64
    test_set = torch.utils.data.DataLoader(
        testset, batch_size=bsz_test, shuffle=False, num_workers=2, pin_memory=True)
    model = ResNet(BasicBlock, [3, 4, 6, 3])  # block counts for ResNet-34
    model = nn.DataParallel(model, device_ids=device_ids)
    model = model.cuda()
    print('gpu count', torch.cuda.device_count())
    dtype_1 = torch.cuda.LongTensor
    dtype = torch.cuda.FloatTensor
    for epoch in range(100):
        # 40 controller weights per replica; DataParallel splits dim 0 across the 8 GPUs.
        weights = Variable(torch.zeros(40 * 8).cuda())
        for data, target in train_set:
            data, target = Variable(data.type(dtype)), Variable(target.type(dtype_1))
            output = model(data, weights)
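Would refactoring toward something like the following help, i.e. building the controller once in __init__ and copying the flat weight vector into it in place on each batch? This is only a sketch of the idea; FlatController and load_flat are placeholder names of mine, not code from the script above.

class FlatController(nn.Module):
    # Built once; the per-batch weight refresh is an in-place copy,
    # not fresh module construction on the GPU.
    def __init__(self, n_cells=2):
        super(FlatController, self).__init__()
        self.n_cells = n_cells
        self.rnn = nn.RNNCell(10, n_cells)
        self.linear = nn.Linear(n_cells, 1)
        self.conv = nn.Parameter(torch.zeros(1, 1, 3, 3))

    def load_flat(self, flat):
        # Slice the flat vector in the same order as the offsets v1..v7 above
        # and copy each slice into the existing tensor without reallocating.
        targets = [self.rnn.weight_ih, self.rnn.weight_hh, self.rnn.bias_ih,
                   self.rnn.bias_hh, self.conv, self.linear.weight, self.linear.bias]
        with torch.no_grad():
            offset = 0
            for t in targets:
                t.copy_(flat[offset:offset + t.numel()].view_as(t))
                offset += t.numel()

    def forward(self, features, hidden):
        hidden = self.rnn(features, hidden)
        return torch.sigmoid(self.linear(hidden)), hidden

With this, ResNet.forward would call self.controller.load_flat(new_weights) once per batch and reuse the same modules across iterations.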