Hi,
I am trying to wrap the Transformer model with DistributedDataParallel(), but I am running into the error below:
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torchtext.data.example.Example'>
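From what I can tell, this TypeError is raised by the DataLoader's default collate function, which can only batch tensors, numpy arrays, numbers, dicts, and lists; any other object in the dataset, such as a torchtext Example, triggers it. A minimal sketch that reproduces the message (the Example stand-in class is hypothetical, just to mimic the unsupported type):

import torch

class Example:  # stand-in for torchtext.data.example.Example
    pass

# default_collate meets an object it cannot batch and raises:
# TypeError: default_collate: batch must contain tensors, numpy arrays, ...
loader = torch.utils.data.DataLoader([Example(), Example()], batch_size=2)
batch = next(iter(loader))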
Code:
import math
import os
import time

import torch
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer

def train_1(args):
    # init_process_group
    # rank = args.nr * args.gpus + gpu
    rank = int(os.environ['LOCAL_RANK'])
    gpu = torch.device(f'cuda:{rank}')
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(gpu)

    TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                init_token='<sos>',
                                eos_token='<eos>',
                                lower=True)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)
    batch_size = 20
    eval_batch_size = 10

    sampler = torch.utils.data.distributed.DistributedSampler(train_txt)
    loader = torch.utils.data.DataLoader(train_txt, shuffle=(sampler is None),
                                         sampler=sampler)

    bptt = 35
    ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary
    emsize = 200   # embedding dimension
    nhid = 200     # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 2    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2      # the number of heads in the multi-head attention models
    dropout = 0.2  # the dropout value
    # TransformerModel is the model class from the nn.Transformer tutorial
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(gpu)

    # DDP
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(TEXT.vocab.stoi)
        for batch, (data, targets) in enumerate(loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 200
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                # `epoch` comes from the outer training loop (not shown)
                print('| epoch {:3d} | {:5d}/{:5d} batches | '
                      'lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f} | device{:3d}'.format(
                          epoch, batch, len(loader),
                          scheduler.get_lr()[0],
                          elapsed * 1000 / log_interval,
                          cur_loss, math.exp(cur_loss),
                          torch.cuda.current_device()))
                total_loss = 0
                start_time = time.time()
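For reference, the nn.Transformer tutorial this code appears to follow never passes the WikiText2 splits to a DataLoader: WikiText2 is a LanguageModelingDataset whose single Example holds the entire corpus, so default_collate has nothing sensible to batch. The tutorial instead flattens the corpus into one tensor with batchify() and slices bptt-sized chunks with get_batch(). A sketch under that assumption, reusing TEXT, gpu, bptt, and batch_size from above:

def batchify(data, bsz):
    # Numericalize the single WikiText2 example and trim the tail so the
    # token stream divides evenly into bsz columns.
    data = TEXT.numericalize([data.examples[0].text])
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    # Shape (nbatch, bsz): each column is an independent token stream.
    return data.view(bsz, -1).t().contiguous().to(gpu)

def get_batch(source, i):
    # Inputs are positions i..i+seq_len; targets are the same positions
    # shifted one token to the right.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target

train_data = batchify(train_txt, batch_size)

The training loop would then iterate over range(0, train_data.size(0) - 1, bptt) instead of a DataLoader, which sidesteps default_collate entirely; under DDP each rank would additionally need its own shard of the token stream.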