I am building my custom unet model which takes numpy matrix as input and has mask associated with it(0,1) for each pixel. I am converting this to a torch image by expanding dimensions using numpy.expand(npMat, 0)
converting it to torch image of the form C x H x W
. Here is the model for your reference:
class UNET(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.conv1 = self.contract_block(in_channels, 32, 7, 3)
self.conv2 = self.contract_block(32, 64, 3, 1)
self.conv3 = self.contract_block(64, 128, 3, 1)
self.upconv3 = self.expand_block(128, 64, 3, 1)
self.upconv2 = self.expand_block(64*2, 32, 3, 1)
self.upconv1 = self.expand_block(32*2, out_channels, 3, 1)
def __call__(self, x):
# downsampling part
conv1 = self.conv1(x)
conv2 = self.conv2(conv1)
conv3 = self.conv3(conv2)
# upsampling part
upconv3 = self.upconv3(conv3)
upconv2 = self.upconv2(torch.cat([upconv3, conv2], 1))
upconv1 = self.upconv1(torch.cat([upconv2, conv1], 1))
return upconv1
def contract_block(self, in_channels, out_channels, kernel_size, padding):
contract = nn.Sequential(
torch.nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding),
torch.nn.BatchNorm2d(out_channels),
torch.nn.ReLU(),
torch.nn.Conv2d(out_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding),
torch.nn.BatchNorm2d(out_channels),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
return contract
def expand_block(self, in_channels, out_channels, kernel_size, padding):
expand = nn.Sequential(torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=padding),
torch.nn.BatchNorm2d(out_channels),
torch.nn.ReLU(),
torch.nn.Conv2d(out_channels, out_channels, kernel_size, stride=1, padding=padding),
torch.nn.BatchNorm2d(out_channels),
torch.nn.ReLU(),
torch.nn.ConvTranspose2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
)
return expand
Here is the training part:
import time
from IPython.display import clear_output
def train(model, train_dl, valid_dl, loss_fn, optimizer, acc_fn, epochs=1):
print("Inside train..")
start = time.time()
dev = torch.device("gpu")
train_loss, valid_loss = [], []
best_acc = 0.0
print("Starting Epochs........")
for epoch in range(epochs):
print('Epoch {}/{}'.format(epoch, epochs - 1))
print('-' * 10)
for phase in ['train', 'valid']:
if phase == 'train':
model.train(True) # Set trainind mode = true
dataloader = train_dl
else:
model.train(False) # Set model to evaluate mode
dataloader = valid_dl
running_loss = 0.0
running_acc = 0.0
step = 0
# iterate over data
for x, y in dataloader:
x = x.cuda()
y = y.cuda()
step += 1
# forward pass
if phase == 'train':
optimizer.zero_grad()
outputs = model(x)
loss = loss_fn(torch.sigmoid(outputs), y)
loss.backward()
optimizer.step()
# scheduler.step()
else:
with torch.no_grad():
outputs = model(x)
loss = loss_fn(outputs, y)
acc = acc_fn(outputs, y)
running_acc += acc*dataloader.batch_size
running_loss += loss*dataloader.batch_size
if step % 100 == 0:
# clear_output(wait=True)
print('Current step: {} Loss: {} Acc: {} AllocMem (Mb): {}'.format(step, loss, acc, torch.cuda.memory_allocated()/1024/1024))
#print(torch.cpu.memory_summary())
epoch_loss = running_loss / len(dataloader.dataset)
epoch_acc = running_acc / len(dataloader.dataset)
clear_output(wait=True)
print('Epoch {}/{}'.format(epoch, epochs - 1))
print('-' * 10)
print('{} Loss: {:.4f} Acc: {}'.format(phase, epoch_loss, epoch_acc))
print('-' * 10)
train_loss.append(epoch_loss) if phase=='train' else valid_loss.append(epoch_loss)
time_elapsed = time.time() - start
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
return train_loss, valid_loss
def acc_metric(predb, yb):
return (predb.argmax(dim=1) == yb.cuda()).float().mean()
Complete error is here:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-40-effb400478a6> in <module>()
2 #output = loss_fn(1, target)
3 opt = torch.optim.Adam(unet.parameters(), lr=0.01)
----> 4 train_loss, valid_loss = train(unet, train_dl, valid_dl, loss_fn, opt, acc_metric, epochs=50)
<ipython-input-37-5335f67c61f3> in train(model, train_dl, valid_dl, loss_fn, optimizer, acc_fn, epochs)
43 if phase == 'train':
44 optimizer.zero_grad()
---> 45 outputs = model(x)
46 loss = loss_fn(torch.sigmoid(outputs), y)
47
<ipython-input-36-6dd22bc4cdce> in __call__(self, x)
14
15 # downsampling part
---> 16 conv1 = self.conv1(x)
17 conv2 = self.conv2(conv1)
18 conv3 = self.conv3(conv2)
/gpfs/share/apps/anaconda3/gpu/5.2.0/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/gpfs/share/apps/anaconda3/gpu/5.2.0/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
90 def forward(self, input):
91 for module in self._modules.values():
---> 92 input = module(input)
93 return input
94
/gpfs/share/apps/anaconda3/gpu/5.2.0/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/gpfs/share/apps/anaconda3/gpu/5.2.0/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input)
336 _pair(0), self.dilation, self.groups)
337 return F.conv2d(input, self.weight, self.bias, self.stride,
--> 338 self.padding, self.dilation, self.groups)
339
340
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
This gives me RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
error and if I switch to CPU it takes forever to run and then the kernel timesout. Not sure how to solve it. Any help is appreciated, thanks for your help!