RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one

ZSheikhb · January 29, 2023, 8:29pm

I have a class which inherits from MyModel (parent class) which is a relatively large model :

class DynamicModel(MyModel):
    def __init__(self, num_inputs, num_outputs, h_dim=96, z_dim=48, n_layers=2, n_mixtures=10, device= torch.device('cuda' if torch.cuda.is_available() else 'cpu'), normalizer_input=None, normalizer_output=None,
                 *args, **kwargs):
        super(DynamicModel, self).__init__(u_dim=num_inputs,y_dim= num_outputs, h_dim=h_dim, z_dim=z_dim, n_layers=n_layers, n_mixtures=n_mixtures, device=device)
        # Save parameters
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.args = args
        self.kwargs = kwargs
        self.normalizer_input = normalizer_input
        self.normalizer_output = normalizer_output
        
        self.to(device)

    @property
    def num_model_inputs(self):
        return self.num_inputs + self.num_outputs if self.ar else self.num_inputs

    def forward(self, u, y=None):
        if self.normalizer_input is not None:
            u = self.normalizer_input.normalize(u)
        if y is not None and self.normalizer_output is not None:
            y = self.normalizer_output.normalize(y)

        loss =super(DynamicModel, self).forward(u, y)
        return loss

    def generate(self, u, y=None):
        if self.normalizer_input is not None:
            u = self.normalizer_input.normalize(u)

        y_sample, y_sample_mu, y_sample_sigma = super(DynamicModel, self).generate(u)

        if self.normalizer_output is not None:
            y_sample = self.normalizer_output.unnormalize(y_sample)
        if self.normalizer_output is not None:
            y_sample_mu = self.normalizer_output.unnormalize_mean(y_sample_mu)
        if self.normalizer_output is not None:
            y_sample_sigma = self.normalizer_output.unnormalize_sigma(y_sample_sigma)

        return y_sample, y_sample_mu, y_sample_sigma

class ModelState:
    def __init__(self,
                 seed,
                 nu,
                 ny,
                 h_dim=56,
                 z_dim=48,
                 n_layers=2,
                 n_mixtures=8,
                 device= torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                 optimizer_type= "AdamW",
                 **kwargs):

        torch.manual_seed(seed)

        self.h_dim=h_dim
        self.z_dim=z_dim
        self.n_layers=n_layers
        self.n_mixtures= n_mixtures

        self.model = DynamicModel( num_inputs=nu, num_outputs=ny, h_dim=h_dim, z_dim=z_dim, n_layers=n_layers, n_mixtures=n_mixtures, device=device, **kwargs)
        if optimizer_type == "AdaBelief":
            self.optimizer = torch_optimizer.AdaBelief(self.model.parameters(),
                                                       lr= 1e-4,
                                                      betas=(0.9, 0.999),
                                                      eps=1e-6,
                                                      weight_decay=0
                                                      )
        elif optimizer_type=="AdamW":
            self.optimizer = torch.optim.AdamW(self.model.parameters(),
                                               lr= 1e-4,
                                               betas=(0.9, 0.999)
                                              )
    
        else:
            # Optimization parameters
            yogi = torch_optimizer.Yogi(self.model.parameters(), lr= 0.5e-4, betas=(0.95, 0.999), eps=1e-3, initial_accumulator=1e-6, weight_decay=0,)

            self.optimizer = torch_optimizer.Lookahead(yogi, k=5, alpha=0.5)


    def load_model(self, path, name='my_model.pt', map_location=None):
        file = path if os.path.isfile(path) else os.path.join(path, name)
        try:
            if map_location is None:
               ckpt = torch.load(file, map_location=lambda storage, loc: storage)
            else:
               ckpt = torch.load(file, map_location=map_location)
        except NotADirectoryError:
            raise Exception("Could not find model: " + file)
        self.model.load_state_dict(ckpt["model"])
        self.optimizer.load_state_dict(ckpt["optimizer"])
        epoch = ckpt['epoch']
        return epoch

    def save_model(self, epoch, vloss, elapsed_time,  path, name='my_model.pt'):
        if not os.path.exists(path):
            os.makedirs(path)
        torch.save({
                'epoch': epoch,
                'model': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'vloss': vloss,
                'elapsed_time': elapsed_time,
            },
            os.path.join(path, name))

Next I tried to use DistributedDataParallel to train my model on multiple GPUs.


    batch_size = args.batch_size
    train_dataset = TensorDataset(train_x, train_y)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=args.world_size,
                                                                    rank=rank,
                                                                    shuffle=True,
                                                                    )

    train_loader = DataLoader(train_dataset, 
                              batch_size=batch_size , 
                              shuffle=False, 
                              num_workers=args.num_workers, 
                              sampler=train_sampler,
                              worker_init_fn=seed_worker,
                              generator=g,)

    normalizer_input, normalizer_output = compute_normalizer(train_loader)

    # Define model
    modelstate = ModelState(seed=seed,
                            nu=u_dim,
                            ny=y_dim,
                            #normalizer_input=normalizer_input,
                            #normalizer_output=normalizer_output
                            )
    modelstate.model.cuda()
    modelstate.model = torch.nn.parallel.DistributedDataParallel(modelstate.model, device_ids=[current_device])
    print('passed distributed data parallel call')

When I trained the model with the distributed data on a cluster before finishing the first epoch, this error message interrupted the training process

    loss_ = modelstate.model(u, y)
  File "/home/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1026, in forward
    if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by 
making sure all `forward` function outputs participate in calculating loss. 
If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
Parameter indices which did not receive grad for rank 9: 12 13 26 27 79 80 81 82 83 84 85 86 87 88
 In addition, you can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print out information about which particular parameters did not receive gradient on this rank as part of this error
Traceback (most recent call last):

I would truely appreciate if someone could provide a solution.

XWu · January 31, 2023, 7:10am

Does “passing the keyword argument find_unused_parameters=True to torch.nn.parallel.DistributedDataParallel” help by any chance?

adv010 · October 20, 2023, 12:18pm

Hi, I ran into this bug recently. I have tried the solution recommended by @XWu , and it is starting to train. However, there are other tricks too like multiplying the unused_params by 0 and others, check this Github issues thread for more:

github.com/pytorch/pytorch

RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one.

opened 08:52AM - 19 Aug 20 UTC

QiuSYang

oncall: distributed triaged module: data parallel

## 🐛 Bug ## To Reproduce Epoch: 1, iter 0: loss = 10.099 0%| … | 1/144967 [00:02<116:54:31, 2.90s/it] Traceback (most recent call last): File "train.py", line 99, in <module> solver.train() File "/home/yckj2453/nlp_space/jd_multimodal_dialogue/multi-modal-dialogue-transformer_bart/utils/time_track.py", line 18, in timed result = method(*args, **kwargs) File "/home/yckj2453/nlp_space/jd_multimodal_dialogue/multi-modal-dialogue-transformer_bart/solver.py", line 284, in train decoder_input_ids=decoder_input_ids) File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 473, in forward self.reducer.prepare_for_backward(list(_find_tensors(output))) RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by (1) passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`; (2) making sure all `forward` function outputs participate in calculating loss. If you already have done the above two steps, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable). Traceback (most recent call last): File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/runpy.py", line 193, in _run_module_as_main "__main__", mod_spec) File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/runpy.py", line 85, in _run_code exec(code, run_globals) File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/site-packages/torch/distributed/launch.py", line 263, in <module> main() File "/root/anaconda3/envs/jddc_mddr/lib/python3.7/site-packages/torch/distributed/launch.py", line 259, in main cmd=cmd) subprocess.CalledProcessError: Command '['/root/anaconda3/envs/jddc_mddr/bin/python', '-u', 'train.py', '--local_rank=0']' returned non-zero exit status 1. Steps to reproduce the behavior: 1. 1. 1. ## Expected behavior ## Environment Please copy and paste the output from our [environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) (or fill out the checklist below manually). You can get the script and run it with: ``` wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` - PyTorch Version (1.5.1): - OS (e.g., Linux): - How you installed PyTorch (`conda`, `pip`, source): - Build command you used (if compiling from source): - Python version: 3.7.5 - CUDA/cuDNN version: 10.1/7.6 - GPU models and configuration: - Any other relevant information: pip transformers==2.11.0 pip numpy==1.19.0 ## Additional context ## here is my code: ` def train(self): epoch_loss_history = [] best_eval_loss = float('inf') # 记录最佳损失 # 设置并行计算 if self.config.n_gpu > 1: print("use torch.nn.DataParallel for the parallel operations.") self.model = nn.DataParallel(self.model) if self.config.local_rank != -1: print("use torch.nn.parallel.DistributedDataParallel for the parallel operations.") self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[self.config.local_rank], output_device=self.config.local_rank, find_unused_parameters=True) for epoch_i in range(self.epoch_i, self.config.n_epoch): # self.epoch_i = epoch_i batch_loss_history = [] loss_history = [] num_batch = 0 self.model.train() n_total_words = 0 # 每个batch开始之前, 先进行梯度清空 # self.optimizer.zero_grad() self.model.zero_grad() # 更加安全的清理梯度 # epoch_iterator = tqdm(self.train_data_loader, desc="Iteration", # disable=self.config.local_rank not in [-1, 0]) for batch_i, (input_ids, label_ids, images, img_char_positions) in enumerate(tqdm(self.train_data_loader, ncols=80)): # input_ids: [batch, sentence_length] num_batch = batch_i # flatten input and target conversations # 去除PAD_ID列表长度, -1为了去除起始字符的长度 label_origin_length = [len(single_label) - single_label.count(PAD_ID) - 1 for single_label in label_ids] if self.config.is_images_embedding: # 将一个batch内的所有image都压缩到一个列表之中 input_images = [image for sentence_images in images for image in sentence_images] # image在sentence中对应的索引号 input_image_indexes = [i for sentence_images_index in img_char_positions for i in sentence_images_index] # 计算每个句子包含多少张图片 input_images_length = [len(sentence_images_index) for sentence_images_index in img_char_positions] # 确保输入图片数量与图像索引号数量相同 assert len(input_images) == sum(input_images_length) assert len(input_image_indexes) == sum(input_images_length) input_sentences = to_var(torch.LongTensor(input_ids)) target_sentences = to_var(torch.LongTensor(label_ids)) target_sentence_length = to_var(torch.LongTensor(label_origin_length)) if self.config.is_images_embedding: input_images = to_var(torch.stack(input_images)) input_images_length = to_var(torch.LongTensor(input_images_length)) input_image_indexes = to_var(torch.LongTensor(input_image_indexes)) else: input_images = None input_images_length = None input_image_indexes = None # if self.config.gradient_accumulation_step == 1: # # reset gradient # self.optimizer.zero_grad() # self.model.zero_grad() attention_mask = input_sentences.ne(0).long() # decoder_input_ids = target_sentences[:, :-1] # GPT解码输入, 去除末尾的结束字符 decoder_input_ids = self.shift_tokens_right_custom(target_sentences, PAD_ID) # 删除EOS_ID outputs = self.model(input_ids=input_sentences, input_images=input_images, input_images_length=input_images_length, input_image_indexes=input_image_indexes, attention_mask=attention_mask, # input_sentences.eq(0) # lm_labels=target_sentences, decoder_input_ids=decoder_input_ids) # sentence_logits = self.model( # input_sentences, # input_sentence_length, # input_conversation_length, # target_sentences, # input_images, # input_images_length=input_images_length, # input_image_indexes=input_image_indexes) decoder_target_label_ids = target_sentences[:, 1:] # GPT解码Label, 去除首部的起始字符 sentence_logits = outputs[0] # 获取Bart的logits batch_loss, n_words = masked_cross_entropy( sentence_logits, decoder_target_label_ids, target_sentence_length) if self.config.n_gpu > 1: # mean() to average on multi-gpu parallel (not distributed) training batch_loss = batch_loss.mean() n_words = n_words.mean() if self.config.gradient_accumulation_step > 1: batch_loss = batch_loss / self.config.gradient_accumulation_step n_words = n_words / self.config.gradient_accumulation_step # assert not isnan(batch_loss.item()) batch_loss_history.append(batch_loss.item()) n_total_words += n_words.item() # loss_history.append(loss) if batch_i % self.config.print_every == 0: # tqdm.write( # f'Epoch: {epoch_i+1}, iter {batch_i}: loss = {loss.item():.3f}') tqdm.write( f'Epoch: {epoch_i+1}, iter {batch_i}: loss = {batch_loss.item()/ n_words.item():.3f}') # Back-propagation # loss.backward() batch_loss.backward() # Gradient cliping torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.clip) # 进行梯度累积 if (batch_i + 1) % self.config.gradient_accumulation_step == 0: # Run optimizer & scheduler self.optimizer.step() self.scheduler.step() # self.optimizer.zero_grad() # 清空梯度 self.model.zero_grad() torch.cuda.empty_cache() gc.collect() # epoch_loss = np.sum(loss_history) / (num_batch + 1) epoch_loss = np.sum(batch_loss_history) / n_total_words epoch_loss_history.append(epoch_loss) self.epoch_loss = epoch_loss print_str = f'Epoch {epoch_i+1} loss average: {epoch_loss:.3f}' print(print_str) if epoch_i % self.config.save_every_epoch == 0: self.save_model(epoch_i + 1) # Only evaluate when single GPU otherwise metrics may not average well if self.config.local_rank == -1: # print('\n<BLEU score>...') # self.calculate_bleu() print('\n<Validation>...') self.validation_loss = self.evaluate() # 保存最佳validation los model if self.validation_loss < best_eval_loss: self.save_model(epoch_i, best='best_model') # 更新最佳验证损失 best_eval_loss = self.validation_loss # # if epoch_i % self.config.plot_every_epoch == 0: # self.write_summary(epoch_i) self.save_model(self.config.n_epoch) return epoch_loss_history` cc @pietern @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @xush6528 @osalpekar @jiayisuse @agolynski