After adding torch.nn.parallel.DistributedDataParallel to my single-GPU training code, I'm running into a problem: the loss is the same on every GPU, but the gradients differ across GPUs. Compared with the original single-GPU code, I'm confident the loss itself is correct. However, after loss.backward(), when I inspect the gradients of the layers' weights and biases, the layers before the all_gather have different gradients on different GPUs, while the layers between the all_gather and the loss computation have identical gradients on all GPUs.
This is contrastive-learning code, so I all_gather the features from all GPUs to compute a common final loss.
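For reference, this is roughly how I check the gradients on each GPU after loss.backward() (just a sketch of my debugging helper, not part of the model code; the function name is my own):

def print_grad_summary(model, rank):
    # Print a per-parameter gradient norm on every rank so the logs can be diffed.
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"rank {rank} | {name} | grad norm {param.grad.norm().item():.6f}")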
Here is part of the model code:
import torch.nn as nn
import torch
from config.base_config import Config
from modules.transformer import Transformer
from modules.stochastic_module import StochasticText
from modules.basic_utils import AllGather
allgather = AllGather.apply
from modules.tokenization_clip import SimpleTokenizer
class CLIPStochastic(nn.Module):
    def __init__(self, config: Config):
        super(CLIPStochastic, self).__init__()
        self.config = config

        from transformers import CLIPModel
        if config.clip_arch == 'ViT-B/32':
            self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        elif config.clip_arch == 'ViT-B/16':
            self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
        else:
            raise ValueError

        self.task_config = config
        config.pooling_type = 'transformer'
        self.pool_frames = Transformer(config)
        self.stochastic = StochasticText(config)

    def forward(self, data, return_all_frames=False, is_train=True):
        batch_size = data['video'].shape[0]
        text_data = data['text']    # text_data["input_ids"].shape = torch.Size([16, 17])
        video_data = data['video']  # [16, 12, 3, 224, 224]
        video_data = video_data.reshape(-1, 3, self.config.input_res, self.config.input_res)  # [192, 3, 224, 224]

        if is_train:
            text_features = self.clip.get_text_features(**text_data)
            video_features = self.clip.get_image_features(video_data)
            video_features = video_features.reshape(batch_size, self.config.num_frames, -1)  # [bs, #F, 512]

            text_features = allgather(text_features, self.task_config)
            video_features = allgather(video_features, self.task_config)
            torch.distributed.barrier()

            video_features_pooled = self.pool_frames(text_features, video_features)

            # @WJM: perform stochastic text
            text_features_stochstic, text_mean, log_var = self.stochastic(text_features, video_features)

            if return_all_frames:
                return text_features, video_features, video_features_pooled, text_features_stochstic, text_mean, log_var

            return text_features, video_features_pooled, text_features_stochstic, text_mean, log_var

        else:
            text_features = self.clip.get_text_features(**text_data)
            video_features = self.clip.get_image_features(video_data)
            video_features = video_features.reshape(batch_size, self.config.num_frames, -1)

            video_features_pooled = self.pool_frames(text_features, video_features)

            # @WJM: re-parameterization for text (independent of the text-cond pooling)
            text_features_stochstic, _, _ = self.stochastic(text_features, video_features)

            if return_all_frames:
                return text_features, video_features, video_features_pooled, text_features_stochstic

            return text_features, video_features_pooled, text_features_stochstic
and the allgather function is like this:
class AllGather(torch.autograd.Function):
    """An autograd function that performs allgather on a tensor."""

    @staticmethod
    def forward(ctx, tensor, args):
        output = [torch.empty_like(tensor) for _ in range(args.world_size)]
        torch.distributed.all_gather(output, tensor)
        ctx.rank = local_rank
        ctx.batch_size = tensor.shape[0]
        return torch.cat(output, dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        local_grad = grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)]
        return local_grad, None
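For context, the common loss is then computed on the gathered (global-batch) features, roughly like this (a simplified sketch of a symmetric contrastive loss, not my exact loss code; the temperature value is a placeholder):

import torch
import torch.nn.functional as F

def contrastive_loss(text_features, video_features_pooled, temperature=0.05):
    # Both inputs are already gathered across GPUs, so every rank sees the
    # same similarity matrix and computes the same loss value.
    text_features = F.normalize(text_features, dim=-1)
    video_features = F.normalize(video_features_pooled, dim=-1)
    sims = text_features @ video_features.t() / temperature
    labels = torch.arange(sims.shape[0], device=sims.device)
    return 0.5 * (F.cross_entropy(sims, labels) + F.cross_entropy(sims.t(), labels))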
I tried adding an explicit all_reduce of the gradients, but it doesn't seem to help, maybe because DDP already synchronizes gradients during backward?
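What I tried was roughly this, placed right after loss.backward() (a sketch of the attempt, not my exact training-loop code):

# Manually average the gradients over all ranks after backward().
for param in model.parameters():
    if param.grad is not None:
        torch.distributed.all_reduce(param.grad, op=torch.distributed.ReduceOp.SUM)
        param.grad /= torch.distributed.get_world_size()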
I have been troubled by this problem for a long time. Thank you so much!