Dear all,
I am using PyTorch 1.13.1 to run this simple code:
import torch


class DataGen(torch.utils.data.Dataset):
    def __init__(self, n, L):
        super(DataGen, self).__init__()
        self.n = n
        self.L = L

    def __getitem__(self, item):
        return {'data': torch.randn(self.n)}

    def __len__(self):
        return self.L


class Network(torch.nn.Module):
    def __init__(self, n):
        super(Network, self).__init__()
        self.layer = torch.nn.Linear(in_features=n, out_features=1)

    def forward(self, x):
        print(f'\tData processed:{x}')
        return self.layer(x['data'])


# Parameters
n = 2
L = 12
batch_size = 4

# Data
dataset = DataGen(n, L)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

# Model
model = Network(n).to(device)
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

# Train
model.train()
for i, batch in enumerate(dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    print(f'\nBatch {i}:\n\tData loaded:{batch}')
    pred = model(batch)
It prints this:
Batch 0:
    Data loaded:{'data': tensor([[ 0.0313,  0.7014],
        [-0.1613, -1.0289],
        [ 0.4327,  0.4148],
        [ 1.2195, -0.8426]], device='cuda:0')}
    Data processed:{'data': tensor([[0., 0.],
        [0., 0.]], device='cuda:1')}
    Data processed:{'data': tensor([[ 0.0313,  0.7014],
        [-0.1613, -1.0289]], device='cuda:0')}

Batch 1:
    Data loaded:{'data': tensor([[-0.3293,  2.3024],
        [-0.4908, -1.0065],
        [-0.4675, -0.1143],
        [ 0.0790,  0.0789]], device='cuda:0')}
    Data processed:{'data': tensor([[0., 0.],
        [0., 0.]], device='cuda:1')}
    Data processed:{'data': tensor([[-0.3293,  2.3024],
        [-0.4908, -1.0065]], device='cuda:0')}

Batch 2:
    Data loaded:{'data': tensor([[ 0.2400, -0.3636],
        [ 1.8705, -1.0880],
        [-1.5622, -1.8931],
        [-0.5770,  0.0298]], device='cuda:0')}
    Data processed:{'data': tensor([[0., 0.],
        [0., 0.]], device='cuda:1')}
    Data processed:{'data': tensor([[ 0.2400, -0.3636],
        [ 1.8705, -1.0880]], device='cuda:0')}
Note that when DataParallel splits the original batch, the values of the chunk assigned to cuda:1 become all zeros, while the chunk that stays on cuda:0 is intact. Why is this happening?
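For reference, here is a minimal variant I could run to check whether the dict wrapping itself is involved. This is only a sketch: `TensorNetwork` is a hypothetical copy of `Network` whose `forward` takes the tensor directly (so DataParallel scatters a plain tensor instead of a dict), and it reuses `n`, `device`, and `dataloader` from the script above.

```python
import torch

# Hypothetical variant of Network: forward takes the tensor directly,
# so DataParallel scatters a plain tensor rather than a dict.
class TensorNetwork(torch.nn.Module):
    def __init__(self, n):
        super(TensorNetwork, self).__init__()
        self.layer = torch.nn.Linear(in_features=n, out_features=1)

    def forward(self, x):
        print(f'\tData processed:{x}')
        return self.layer(x)

model = TensorNetwork(n).to(device)  # n, device, dataloader as defined above
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

model.train()
for i, batch in enumerate(dataloader):
    x = batch['data'].to(device)  # move the tensor itself, not the dict
    print(f'\nBatch {i}:\n\tData loaded:{x}')
    pred = model(x)
```

If the chunks printed on cuda:1 are non-zero here, that would suggest the problem is specific to how the dict input is scattered.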
Thanks in advance