Hello,
I am encountering very bad performance with a CUDA-enabled PyTorch build.
I ported a small CNN to PyTorch and it takes an enormous amount of time to train,
which wasn’t the case with the framework I used previously.
For the same dataset and the same batch size, my PyTorch code takes almost 40 seconds per epoch (with high CPU load and almost no GPU load), whereas the other framework took about 1 second per epoch. Such low performance isn’t normal, and I would like to know where the problem comes from.
I’m using PyTorch 1.3.1 inside a Docker container (see the Dockerfile used to build the Docker image). The Docker version is 19.03. The GPU is a GeForce RTX 2060 SUPER and the CPU is a Ryzen 7 3700X.
I pasted below my code if that could help:
Thanks for your help.
Here is the CNN:
class Net(nn.Module):
    """Small CNN that maps a 3-channel 192x192 image to 3 class scores.

    Layer stack (shapes are per sample, as noted by the original author and
    assuming ``same_padding`` returns a padding that preserves the spatial
    size -- it is defined outside this block, TODO confirm):

        conv1 3->32, k=3  + ReLU   -> 32 x 192 x 192
        max-pool 8x8               -> 32 x 24 x 24
        conv2 32->16, k=5 + ReLU   -> 16 x 24 x 24
        avg-pool 4x4               -> 16 x 6 x 6
        fc1 576->32       + ReLU   -> 32
        fc2 32->3                  -> 3 (raw scores, no softmax)
    """

    def __init__(self):
        # The original called super(CNNet, self), but the class is named Net,
        # so construction raised NameError. The zero-argument form cannot go
        # out of sync with the class name.
        super().__init__()
        # in: 192x192x3
        pad1 = same_padding(192, 3, 1)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3,
                               stride=1, padding=pad1)
        # out: 192x192x32
        self.max1 = nn.MaxPool2d(kernel_size=8, stride=8, padding=0)
        # out: 24x24x32
        pad2 = same_padding(24, 5, 1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=16, kernel_size=5,
                               stride=1, padding=pad2)
        # out: 24x24x16
        self.avg = nn.AvgPool2d(4, 4)
        # out: 6x6x16
        self.fc1 = nn.Linear(6 * 6 * 16, 32)  # FC layer
        self.fc2 = nn.Linear(32, 3)  # prediction layer

    def forward(self, x):
        """Run the network on a batch ``x`` and return the fc2 output.

        The result has one row per input sample and 3 columns.
        """
        x = F.relu(self.conv1(x))
        x = self.max1(x)
        x = F.relu(self.conv2(x))
        x = self.avg(x)
        # Flatten everything except the batch dimension -> [batch, 6*6*16].
        # Anchoring on x.size(0) (instead of view(-1, 576)) means a wrongly
        # sized input fails loudly in fc1 rather than being silently
        # re-batched into a different number of rows.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
Training:
# Pick the device used for the model and the batches. `gpu` is always
# defined: on a host without CUDA it falls back to the CPU instead of leaving
# the name unbound (which made every later `.to(gpu)` fail). The printed line
# also makes it obvious which device the run is actually using.
if torch.cuda.is_available():
    print('using GPU: {}'.format(torch.cuda.get_device_name(0)))
    gpu = torch.device('cuda:0')
else:
    print('CUDA is not available, using CPU')
    gpu = torch.device('cpu')
Dataset and dataloader
# Build the training dataset. MyDataset takes (root_dir, csv_file); the
# original line opened two parentheses and closed one, which is a SyntaxError.
train_set = MyDataset("dataset/", "dataset/train_set.csv")
# Batches of 5 samples, reshuffled every epoch, loaded by 4 worker processes
# into pinned host memory.
# NOTE(review): MyDataset.__getitem__ reads and resizes every image on the
# CPU, so data loading is a likely source of the high CPU / low GPU load --
# confirm by timing an epoch that only iterates `dataloader`.
dataloader = DataLoader(train_set, batch_size=5,
                        shuffle=True, pin_memory=True, num_workers=4)
# Model, loss and optimizer; the model parameters are moved to `gpu`.
net = Net()
net.to(device=gpu)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
# Training loop: runs MAX_EPOCHS passes over `dataloader` and prints the mean
# batch loss and the wall-clock time of each epoch.
stop = time.time()  # start of the interval currently being timed
MAX_EPOCHS = 2
BATCH_SIZE = 5  # not used below; the DataLoader sets its own batch_size
MAX_BATCH = len(dataloader) - 1  # index of the last batch of an epoch
for epoch in range(MAX_EPOCHS):  # loop over the dataset multiple times
    running_loss = 0.0
    n_batches = 0
    for batch_data in dataloader:
        # Move the batch to the training device. non_blocking=True lets the
        # copy from pinned host memory overlap with other work when the
        # DataLoader pins its batches; on the CPU it has no effect.
        inputs = batch_data['image'].to(gpu, non_blocking=True)
        labels = batch_data['label'].to(gpu, non_blocking=True)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    # Report once per epoch. The mean is taken over the number of batches
    # actually processed; the original divided by len(dataloader) - 1, which
    # overstated the loss and raised ZeroDivisionError with a single batch.
    if n_batches:
        now = time.time()
        elapsed = now - stop
        print('Epoch # %d, loss: %.3f Elapsed time %.4f s' %
              (epoch + 1, running_loss / n_batches, elapsed))
        stop = now
The dataset class:
class MyDataset(Dataset):
    """Image dataset described by a CSV file.

    The CSV is loaded with pandas and must contain an "image" column (file
    name, joined onto ``root_dir``) and a "label" column. Each item is a dict
    ``{"image": float32 tensor, "label": value from the CSV}``.
    """

    def __init__(self, root_dir, csv_file, input_shape=192):
        """Store the image root, load the CSV and remember the target size.

        Args:
            root_dir: directory the "image" entries are relative to.
            csv_file: path of the CSV listing the samples.
            input_shape: side length, in pixels, images are resized to.
        """
        self.root = root_dir
        self.data = pd.read_csv(csv_file)
        self.input_shape = input_shape

    def __getitem__(self, index):
        """Load, resize and convert the sample at position ``index``."""
        # Local import keeps this block self-contained.
        import os

        # os.path.join works whether or not root_dir ends with a separator;
        # plain string concatenation only did when it ended with "/".
        # .iloc is positional, matching the integer indices a sampler emits.
        img_path = os.path.join(self.root, self.data["image"].iloc[index])
        label = self.data["label"].iloc[index]
        # NOTE(review): the file read and the anti-aliased resize run on the
        # CPU for every sample on every epoch; this is a likely source of
        # high CPU load -- consider resizing the images once, offline.
        img = io.imread(img_path)
        img = transform.resize(img, (self.input_shape, self.input_shape),
                               anti_aliasing=True)
        img = self.to_tensor(img)
        return {"image": img, "label": label}

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def to_tensor(self, array):
        """Convert a 3-D array to a float32 tensor with its last axis first.

        For an image stored as (height, width, channels) this yields
        (channels, height, width).
        """
        array = array.transpose((2, 0, 1))
        array = array.astype(np.float32)
        return torch.from_numpy(array)
The container is launched using:
# Start an interactive container named pytorch_lab from the custom PyTorch
# image and run JupyterLab in it on port 8082 (published to the host).
# All GPUs are exposed, the host IPC namespace is shared, and ./workspace
# is bind-mounted at /pytorch/workspace.
# The -v argument is quoted so a working directory containing spaces does
# not get split into several arguments.
docker run \
  --gpus all \
  -it \
  --ipc=host \
  -p 8082:8082 \
  --name pytorch_lab \
  -w /pytorch \
  -v "$(pwd)/workspace:/pytorch/workspace" \
  custom/pytorch:1.3-gpu-py3-jupyter \
  bash -c "jupyter lab --ip=0.0.0.0 --port=8082 --allow-root --no-browser"