Hello,
I have defined a densenet architecture in PyTorch to use it on training data consisting of 15000 samples of 128x128 images. Here is the code:
class Dense_Block(nn.Module):
def __init__(self, in_channels):
super(Dense_Block, self).__init__()
self.relu = nn.ReLU(inplace = True)
self.bn = nn.BatchNorm2d(num_features = in_channels)
self.conv1 = nn.Conv2d(in_channels = in_channels, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
self.conv3 = nn.Conv2d(in_channels = 64, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
self.conv4 = nn.Conv2d(in_channels = 96, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
self.conv5 = nn.Conv2d(in_channels = 128, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
def forward(self, x):
bn = self.bn(x)
conv1 = self.relu(self.conv1(bn))
conv2 = self.relu(self.conv2(conv1))
c2_dense = self.relu(torch.cat([conv1, conv2], 1))
conv3 = self.relu(self.conv3(c2_dense))
c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))
conv4 = self.relu(self.conv4(c3_dense))
c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))
conv5 = self.relu(self.conv5(c4_dense))
c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))
return c5_dense
class Transition_Layer(nn.Module):
def __init__(self, in_channels, out_channels):
super(Transition_Layer, self).__init__()
self.relu = nn.ReLU(inplace = True)
self.bn = nn.BatchNorm2d(num_features = out_channels)
self.conv = nn.Conv2d(in_channels = in_channels, out_channels = out_channels, kernel_size = 1, bias = False)
self.avg_pool = nn.AvgPool2d(kernel_size = 2, stride = 2, padding = 0)
def forward(self, x):
bn = self.bn(self.relu(self.conv(x)))
out = self.avg_pool(bn)
return out
class DenseNet(nn.Module):
def __init__(self, nr_classes):
super(DenseNet, self).__init__()
self.lowconv = nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = 7, padding = 3, bias = False)
self.relu = nn.ReLU()
# Make Dense Blocks
self.denseblock1 = self._make_dense_block(Dense_Block, 64)
self.denseblock2 = self._make_dense_block(Dense_Block, 128)
self.denseblock3 = self._make_dense_block(Dense_Block, 128)
# Make transition Layers
self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 64)
# Classifier
self.bn = nn.BatchNorm2d(num_features = 64)
self.pre_classifier = nn.Linear(64*16*16, 512)
self.classifier = nn.Linear(512, nr_classes)
def _make_dense_block(self, block, in_channels):
layers = []
layers.append(block(in_channels))
return nn.Sequential(*layers)
def _make_transition_layer(self, layer, in_channels, out_channels):
modules = []
modules.append(layer(in_channels, out_channels))
return nn.Sequential(*modules)
def forward(self, x):
out = self.relu(self.lowconv(x))
out = self.denseblock1(out)
out = self.transitionLayer1(out)
out = self.denseblock2(out)
out = self.transitionLayer2(out)
out = self.denseblock3(out)
out = self.transitionLayer3(out)
out = self.bn(out)
# print(out.shape)
out = out.reshape(-1, 64*16*16)
out = self.pre_classifier(out)
out = self.classifier(out)
return out
Then I define my DataSet class:
class MyDataset(Dataset):
def __init__(self, images, n, labels=None, transforms=None):
self.X = images
self.y = labels
self.n = n
self.transforms = transforms
def __len__(self):
return (len(self.X))
def __getitem__(self, i):
data = self.X.iloc[i, :]
data = np.asarray(data).astype(np.float).reshape(1,n,n)
if self.transforms:
data = self.transforms(data).reshape(1,n,n)
if self.y is not None:
y = self.y.iloc[i,:]
y = np.asarray(y).astype(np.float).reshape(128,) # for 128-vector of labels
return (data, y)
else:
return data
Then I create the instances of the train, dev, and test data:
train_data = MyDataset(train_images, n, train_labels, None)
dev_data = MyDataset(dev_images, n, dev_labels, None)
test_data = MyDataset(test_images, n, test_labels, None)
The shapes of train_images
, dev_images
and test_images
are respectively (15000, 16384)
, (4000, 16384)
and (1000, 16384)
. So there are in total 20000 samples of 128x128 (=16384) images.
The shapes of train_labels
, dev_labels
and test_labels
are respectively (15000, 128)
, (4000, 128)
and (1000, 128)
. So there are in total 20000 samples of 128 vectors.
I define also a custom loss function:
# Huber
def Huber(yHat,y,delta=1.):
n_samples = yHat.size()[0]
n_points = yHat.size()[1]
preds = yHat
labels = y
size = yHat.size()[0]*yHat.size()[1]
diff = yHat - y
return torch.sum(torch.where(torch.abs(diff) < delta,.5*diff**2 , delta*(torch.abs(diff)-.5*delta**2))) / size
Then I create an instance of the model:
densenet = DenseNet(nr_classes=128).float().to('cuda:0')
Then I initialize parameters, create train- and dev-set dataloaders, and train the model using Adam optimizer and Huber loss-function:
def main():
nn.init.kaiming_uniform_(list(densenet.parameters())[0], nonlinearity = 'relu')
loader = DataLoader(train_data,batch_size=200,shuffle=False,num_workers=0)
loader_dev = DataLoader(dev_data,batch_size=10,shuffle=None,num_workers=0)
N_epochs = 10
for epoch in range(N_epochs):
optimizer = optim.Adam(densenet.parameters(), lr=.001, betas=(0.9, 0.999), eps=1e-08)
for batch in loader:
images = batch[0].float().to('cuda:0')
labels = batch[1].float().to('cuda:0')
preds = densenet(images)
loss = Loss(preds,labels).Huber()
with torch.no_grad():
loss_dev = 0
for batch_dev in loader_dev:
images_dev = batch_dev[0].float().to('cuda:0')
labels_dev = batch_dev[1].float().to('cuda:0')
preds_dev = densenet(images_dev)
loss_ = Loss(preds_dev,labels_dev).Huber()
loss_dev += loss_
optimizer.zero_grad()
loss.backward()
optimizer.step()
if __name__ == '__main__':
multiprocessing.freeze_support()
main()
I have two GPUs identified as cuda:0
(~ 24 Gb memory) and cuda:1
(~ 6 Gb memory)
With cuda:1
device I get the error message:
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1127, in <module>
main()
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1074, in main
preds = network(images) # Pass Batch
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 752, in forward
out = self.denseblock1(out)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\container.py", line 117, in forward
input = module(input)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 691, in forward
c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))
RuntimeError: CUDA out of memory. Tried to allocate 1.17 GiB (GPU 1; 6.00 GiB total capacity; 4.34 GiB already allocated; 16.62 MiB free; 4.34 GiB reserved in total by PyTorch)
Then I tried to switch to cuda:0
device which has much more memory capacity, and the error in this case reads as:
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1127, in <module>
main()
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1074, in main
preds = network(images) # Pass Batch
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 752, in forward
out = self.denseblock1(out)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\container.py", line 117, in forward
input = module(input)
File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 691, in forward
c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))
RuntimeError: CUDA out of memory. Tried to allocate 1.17 GiB (GPU 0; 24.00 GiB total capacity; 21.59 GiB already allocated; 372.94 MiB free; 21.69 GiB reserved in total by PyTorch)
Why does PyTorch allocate almost all available memory?
However, when I use train-set of 6 images and dev-set of 3 images (test-set of 1 image), training with cuda
-devices works fine.