I’m dealing with a strange issue where the gradients after backward pass have different shapes depending on whether CUDA or CPU is used. The model used is relatively simple:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool1 = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.pool2 = nn.MaxPool2d(2, 2)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
self.relu1 = nn.ReLU()
self.relu2 = nn.ReLU()
self.relu3 = nn.ReLU()
self.relu4 = nn.ReLU()
def forward(self, x):
x = self.pool1(self.relu1(self.conv1(x)))
x = self.pool2(self.relu2(self.conv2(x)))
x = x.view(-1, 16 * 5 * 5)
x = self.relu3(self.fc1(x))
x = self.relu4(self.fc2(x))
x = self.fc3(x)
return x
The input tensor has shape (1, 3, 32, 32), and the relevant section of code is as follows, with the method generate_gradients
being of particular importance:
class VanillaBackprop():
"""
Produces gradients generated with vanilla back propagation from the image
"""
def __init__(self, model):
self.model = model
self.gradients = None
# Put model in evaluation mode
self.model.eval()
# Hook the first layer to get the gradient
self.hook_layers()
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
def hook_layers(self):
def hook_function(module, grad_in, grad_out):
self.gradients = grad_in[0]
# Register hook to the first layer
try:
first_layer = list(self.model.features._modules.items())[0][1]
except:
first_layer = list(self.model._modules.items())[0][1]
first_layer.register_backward_hook(hook_function)
def generate_gradients(self, input_image, target_class):
# Forward
model_output = self.model(input_image.to(self.device))
# Zero grads
self.model.zero_grad()
# Target for backprop
one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_()
one_hot_output[0][target_class] = 1
# Backward pass
model_output.backward(gradient=one_hot_output.to(self.device))
# Convert Pytorch variable to numpy array
gradients_as_arr = self.gradients.data.cpu().numpy()[0]
return gradients_as_arr
When on CPU, self.gradients
has shape (1, 3, 32, 32), while on CUDA it has shape (1, 6, 28, 28). How is that possible, and how do I fix this? Any help is much appreciated.