Why doesn't my model train with DistributedDataParallel?

I converted my training code from DataParallel to DistributedDataParallel. It does not raise any errors during training, but it also does not print any logs or appear to make progress.
Could you show me what is wrong with my code?

This is my code.

import torch
import argparse
import os, time
import json, tqdm
from utils import find_highest_score_answer, load_feature_from_file, create_logger
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer

def inference(model, data, tokenizer):

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.eval()

    results = []

    for _, batch in tqdm.tqdm(enumerate(data)):

        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        bbox           = batch["bbox"].to(device)
        image          = batch["image"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                        bbox=bbox, image=image)
        start_logits = outputs.start_logits.detach().cpu().numpy()
        end_logits   = outputs.end_logits.detach().cpu().numpy()
        start_indices, end_indices = find_highest_score_answer(
            start_scores=start_logits, end_scores=end_logits)

        input_ids    = input_ids.cpu().numpy()
        question_ids = batch["question_id"].detach().cpu().numpy().tolist()

        for question_id, input_id, s, e in zip(question_ids, input_ids, start_indices, end_indices):
            predicted_answer = tokenizer.decode(input_id[s:e+1])
            decoding_string  = tokenizer.decode(input_id)
            question         = decoding_string[decoding_string.find('[CLS]')+5:decoding_string.find('[SEP]')]
            results.append({
                "questionId": question_id,
                "question": question,
                "answer": predicted_answer,
            })

    return results

def main(args):

    output_dir = os.path.join(args['output_dir'], args['weights'].split("/")[-2])
    if not os.path.exists(output_dir):
        try:
            print("Creating {} directory".format(output_dir))
            os.mkdir(output_dir)
        except OSError:
            print("INVALID OUTPUT DIRECTORY")
            exit(0)

    logger = create_logger(file_path=os.path.join(output_dir, 'inference.log'))

    # Load dataset
    logger.info("Loading dataset from {} ...".format(args['input_dir']))
    eval_data = load_feature_from_file(path=args['input_dir'], batch_size=2)
    logger.info("The number of samples is {} ...".format(len(eval_data.dataset)))

    # Load model
    logger.info("Loading model from {} ...".format(args['weights']))
    model = AutoModelForQuestionAnswering.from_pretrained(args['model']).cuda()
    # model.load_state_dict(torch.load(args['weights'], map_location='cuda'))

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args['model'])

    # Inference
    logger.info("Start inference ...")
    start_time  = time.time()
    results     = inference(model=model, data=eval_data, tokenizer=tokenizer)
    end_time    = time.time()
    logger.info("Total inference time {} seconds".format(end_time - start_time))

    # Save inference results to disk
    output_file = os.path.join(output_dir, args['input_dir'].split("/")[-2] + '.json')
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=4)
    logger.info("DONE! Check the inference results at {}".format(output_file))


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Inference on DocVQA dataset.')

    parser.add_argument('--input_dir', required=True,
        help='The input features extracted from the DocVQA dataset. It can be the train/val/test subset.',
    )
    parser.add_argument('--model', default='microsoft/layoutlmv2-base-uncased',
        help='The model architecture.'
    )
    parser.add_argument('--weights', required=True,
        help='The path to model weights.',
    )
    parser.add_argument('--output_dir', required=True,
        help='The output directory.'
    )

    args = vars(parser.parse_args())

    main(args)
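
For reference, a typical DistributedDataParallel setup looks roughly like the sketch below: one process per GPU, init_process_group, a DistributedSampler, the model wrapped in DDP, and printing guarded on rank 0. This is only a generic sketch with a placeholder model and dataset, not the actual train.py used above.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

def run(rank, world_size):
    # Each spawned process joins the same process group.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # Placeholder model and dataset; the real code would build
    # LayoutLMv2ForQuestionAnswering and the extracted features here.
    model = torch.nn.Linear(10, 2).cuda(rank)
    model = DDP(model, device_ids=[rank])
    dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))

    # DistributedSampler gives each rank a disjoint shard of the data.
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(2):
        sampler.set_epoch(epoch)  # reshuffle differently every epoch
        for x, y in loader:
            x, y = x.cuda(rank), y.cuda(rank)
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
        if rank == 0:  # log from a single rank only
            print("Epoch {}/2 - loss {:.4f}".format(epoch + 1, loss.item()))

    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)

With CUDA_VISIBLE_DEVICES=1,2 this spawns two processes, one per visible GPU.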

The output in the terminal looks like this:

(transformer_env) root@ae94a4e6c92d:/mlcv/WorkingSpace/NCKH/tiennv/vqa_thesis/docvqa/libs/layoutlmv2# CUDA_VISIBLE_DEVICES=1,2 python train.py --work_dir ./runs/train/test_multi-gpus --train_config default_config
2021-09-26 10:11:49,801 - INFO - Loading training configuration ...
2021-09-26 10:11:49,802 - INFO - Configuration: {'optimizer': <class 'torch.optim.adam.Adam'>, 'lr': 0.0001, 'epochs': 2, 'batch_size': 2, 'momentum': 0.9, 'eval_freq': 1, 'save_freq': 1, 'num_workers': 4}
2021-09-26 10:11:49,803 - INFO - Loading training dataset from /mlcv/Databases/DocVQA_2020-21/task_1/extracted_features/layoutlmv2/train ...
2021-09-26 10:11:49,953 - INFO - Loading validation dataset from /mlcv/Databases/DocVQA_2020-21/task_1/extracted_features/layoutlmv2/val ...
2021-09-26 10:11:49,977 - INFO - Training size: 39456 - Validation size: 5344
2021-09-26 10:11:49,978 - INFO - Loading pre-training model from microsoft/layoutlmv2-base-uncased checkpoint
Some weights of the model checkpoint at microsoft/layoutlmv2-base-uncased were not used when initializing LayoutLMv2ForQuestionAnswering: ['layoutlmv2.visual.backbone.bottom_up.res4.0.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.7.conv1.norm.num_batches_tracked', ..., 'layoutlmv2.visual.backbone.bottom_up.res4.15.conv3.norm.num_batches_tracked'] (the full list is every num_batches_tracked buffer of the visual backbone)

- This IS expected if you are initializing LayoutLMv2ForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMv2ForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMv2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias', 'layoutlmv2.visual_segment_embedding']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Running DDP with model parallel example on cuda:0 device
Running DDP with model parallel example on cuda:1 device
GPUs usages for model: 6721 Mb
Epoch 1/2
GPUs usages for model: 6721 Mb
Epoch 1/2
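
Note that "GPUs usages for model" and "Epoch 1/2" each appear once per process in the output above: with DDP every rank executes the whole script, so any unguarded print or logger call shows up once per GPU. A minimal sketch of rank-guarded logging, assuming torch.distributed has already been initialized:

import torch.distributed as dist

def is_main_process():
    # Treat non-distributed runs as the main process as well.
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0

# Only rank 0 prints, so each message appears once instead of once per GPU.
if is_main_process():
    print("Epoch 1/2")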

I guess this is very similar to the question "Error when training AutoModelForQuesionAnswering with Distribute Data Parallel?".

Please check my reply there.