Hi @aplassard, thanks for the help! Below is the cleaned-up code to reproduce the error.
from __future__ import print_function
import argparse
from collections import OrderedDict
import linecache
import os
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.utils.data.distributed
# Load matrix from file
class LazyDataset(data.Dataset):
    def __init__(self, filename):
        self._filename = filename
        self._total_data = 0
        with open(filename, 'r') as f:
            self._total_data = len(f.readlines()) - 1

    def __getitem__(self, idx):
        line = linecache.getline(self._filename, idx + 1)
        return idx, line

    def __len__(self):
        return self._total_data
if __name__ == "__main__":
    # Input args processing
    parser = argparse.ArgumentParser()
    parser.add_argument('-datadir', '--datadir', help='Data directory where the training dataset is located', required=False, default=None)
    args = vars(parser.parse_args())

    # The training dataset may be split into multiple files
    training_filename = args['datadir'] + '/matrixTest'

    # Load the dataset
    dataset = LazyDataset(training_filename)
    dataLoader = data.DataLoader(dataset)

    # Initialize the model
    model = nn.Sequential(
        nn.Linear(20, 10),
        nn.Linear(10, 20)
    )

    # Initialize the distributed process group (shared-file init, two processes)
    torch.distributed.init_process_group(world_size=2,
                                         init_method='file:///' + os.path.join(os.environ['HOME'], 'distributedFile'),
                                         backend='gloo')
    model.cuda()
    model = nn.parallel.DistributedDataParallel(model)

    # Reload the dataset with a distributed sampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    dataLoader = data.DataLoader(dataset, num_workers=2, sampler=train_sampler)

    # Put the model on the GPU if one is used
    model.cuda()

    for epoch in range(5):
        total_loss = 0
        train_sampler.set_epoch(epoch)
        for idx, _ in dataLoader:
            in_val = torch.zeros(20)
            in_val[idx] = 1.0
            output = model(in_val)
The matrixTest file can look like the following, but its contents are not actually used:
1 1 2 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 8 9
2 3 2 2 2 1 2 2 1 3 4 5 1 2 3 4 1 1 1 1
The error from one process (both processes report the same error):
Traceback (most recent call last):
File "test.py", line 67, in <module>
output = model(in_val)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 216, in forward
outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 223, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 65, in parallel_apply
raise output
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 41, in _worker
output = module(*input, **kwargs)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/modules/container.py", line 91, in forward
input = module(input)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 55, in forward
return F.linear(input, self.weight, self.bias)
File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/nn/functional.py", line 994, in linear
output = input.matmul(weight.t())
RuntimeError: size mismatch, m1: [1 x 10], m2: [20 x 10] at /opt/conda/conda-bld/pytorch_1524586445097/work/aten/src/THC/generic/THCTensorMathBlas.cu:249
terminate called after throwing an instance of 'gloo::EnforceNotMet'
what(): [enforce fail at /opt/conda/conda-bld/pytorch_1524586445097/work/third_party/gloo/gloo/cuda.cu:249] error == cudaSuccess. 29 vs 0. Error at: /opt/conda/conda-bld/pytorch_1524586445097/work/third_party/gloo/gloo/cuda.cu:249: driver shutting down
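For comparison, a minimal single-process sketch (no process group, just the bare model) shows that the same 20 -> 10 -> 20 model accepts a size-20 input without any size mismatch; the failure only shows up once DistributedDataParallel is involved:

import torch
import torch.nn as nn

# Same tiny model as in the repro above, but without DistributedDataParallel.
model = nn.Sequential(
    nn.Linear(20, 10),
    nn.Linear(10, 20)
).cuda()

# One-hot style input of size 20, matching the loop in the repro.
in_val = torch.zeros(20).cuda()
in_val[0] = 1.0

output = model(in_val)
print(output.size())  # torch.Size([20]) -- no size mismatch here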