Hi everyone,
I have a class:
import torch
import torch.nn as nn
import torch.nn.functional as F
def create_model():
return Net()
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout2d(0.25)
self.fc1 = nn.Linear(9216, 128)
self.dropout2 = nn.Dropout2d(0.25)
self.fc2 = nn.Linear(128, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2)
x = torch.flatten(self.dropout1(x), 1)
x = F.relu(self.fc1(x))
x = self.dropout2(x)
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output
def log_weights(self, step, writer):
writer.add_histogram('weights/conv1/weight', self.conv1.weight.data, step)
writer.add_histogram('weights/conv1/bias', self.conv1.bias.data, step)
writer.add_histogram('weights/conv2/weight', self.conv2.weight.data, step)
writer.add_histogram('weights/conv2/bias', self.conv2.bias.data, step)
writer.add_histogram('weights/fc1/weight', self.fc1.weight.data, step)
writer.add_histogram('weights/fc1/bias', self.fc1.bias.data, step)
writer.add_histogram('weights/fc2/weight', self.fc2.weight.data, step)
writer.add_histogram('weights/fc2/bias', self.fc2.bias.data, step)
With the training function:
def train(use_cuda, model, epoch, optimizer, log_interval, train_loader, writer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
if use_cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % log_interval == 0:
print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}'
f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
step = epoch * len(train_loader) + batch_idx
log_scalar('train_loss', loss.data.item(), step, writer)
model.log_weights(step, writer)
And define the model like this:
# Define model, device and optimizer
model = create_model()
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model.to(device)
optimizer = optim.Adam(model.parameters())
optimizer.step()
However, I keep running into:
ModuleAttributeError: 'DataParallel' object has no attribute 'log_weights'
NOTE
This only happens when MULTIPLE GPUs are used.
It does NOT happen for the CPU or a single GPU.
I expect the attribute to be available, especially since the wrapper in Pytorch ensures that all attributes of the wrapped model are accessible.
Environment
- PyTorch Version (e.g., 1.0): 1.6
- OS (e.g., Linux): Ubuntu 18
- How you installed PyTorch (
conda
,pip
, source): conda - Python version: 3.7
- CUDA/cuDNN version: 10.1
- GPU models and configuration: 2 x V100
- Any other relevant information:
Again, only happens with multiple GPUs.
I am happy to share the full code. However, it is a mlflow project and you need docker with the nvidia-container thingy to run it.
Just tell me if desired.