Related thread

I have this loss:

```
class Tacotron2Loss(nn.Module):
    """Multi-term Tacotron2 training loss with learnable per-term weights.

    The terms are, in order: mel reconstruction (decoder + postnet), gate
    (stop token), emotion classification and, when ``hparams.use_mmi`` is
    truthy, a length-normalised CTC term. Each term ``L_i`` is combined as
    ``L_i * exp(-eta_i) + eta_i`` where ``eta`` is an ``nn.Parameter``
    initialised to ones, so it must be handed to the optimizer together
    with the model parameters.
    """

    def __init__(self, hparams):
        """Create the sub-losses and the learnable weight vector ``eta``.

        Args:
            hparams: object exposing a ``use_mmi`` attribute; when truthy a
                CTC term is added and ``eta`` gets a fourth entry.
        """
        super().__init__()
        self.gate_loss_fn = nn.BCEWithLogitsLoss()
        # Emotion labels equal to -1 do not contribute to the loss.
        self.emotion_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
        self.use_mmi = hparams.use_mmi
        if self.use_mmi:
            # Blank index is placed right after the last CTC symbol;
            # reduction is deferred so forward() can normalise per sample.
            self.ctc_loss_fn = torch.nn.CTCLoss(
                blank=len(ctc_symbols), reduction='none')
        # One weight per loss term: mel, gate, emotion (+ CTC with MMI).
        n_terms = 4 if self.use_mmi else 3
        self.eta = nn.Parameter(torch.ones(n_terms, dtype=torch.float32))

    @staticmethod
    def masked_l2_loss(out, target, lengths):
        """Return the summed squared error divided by the valid-element count.

        The denominator is ``lengths.sum() * out.size(1)``.
        NOTE(review): this assumes dim 1 of ``out`` is the per-frame feature
        axis and that padded positions were already zeroed upstream; no
        masking is applied here -- confirm against the model.
        """
        squared_error_sum = F.mse_loss(out, target, reduction="sum")
        valid_elements = lengths.sum() * out.size(1)
        return squared_error_sum / valid_elements

    def forward(self, y_pred, y, output_lengths):
        """Compute the weighted total loss.

        Args:
            y_pred: 7-tuple; entries 1, 2, 3, 5 and 6 are read as mel output,
                postnet mel output, gate output, CTC log-probs and emotion
                logits (entries 0 and 4 are ignored).
            y: 5-tuple ``(mel_target, gate_target, ctc_text,
                ctc_text_lengths, emotion_label)``.
            output_lengths: per-sample output lengths, used to normalise the
                mel loss and as the CTC input lengths.

        Returns:
            ``(total, losses, eta)``: the scalar sum of the weighted terms,
            the list of unweighted loss tensors, and the ``eta`` parameter.
        """
        mel_target, gate_target, ctc_text, ctc_text_lengths, emotion_label = y
        (_, mel_out, mel_out_postnet, gate_out,
         _, log_probs, emotion_weights) = y_pred

        decoder_mel = self.masked_l2_loss(mel_out, mel_target, output_lengths)
        postnet_mel = self.masked_l2_loss(
            mel_out_postnet, mel_target, output_lengths)
        # Gate logits and targets are flattened to a single column.
        gate_loss = self.gate_loss_fn(
            gate_out.view(-1, 1), gate_target.view(-1, 1))
        emotion_loss = self.emotion_loss_fn(emotion_weights, emotion_label)

        losses = [decoder_mel + postnet_mel, gate_loss, emotion_loss]
        if self.use_mmi:
            per_sample_ctc = self.ctc_loss_fn(
                log_probs, ctc_text, output_lengths, ctc_text_lengths)
            # Normalise each sample by its output length before averaging.
            losses.append((per_sample_ctc / output_lengths.float()).mean())

        weighted = torch.stack(losses) * torch.exp(-self.eta) + self.eta
        return weighted.sum(), losses, self.eta
```

Then I put it in the optimizer like this:

```
# The criterion owns a learnable parameter (eta), so its parameters are
# optimized together with the model's.
# NOTE(review): AdamW's default weight decay will also apply to eta --
# confirm that is intended.
optimizer = torch.optim.AdamW(
    [*model.parameters(), *criterion.parameters()],
    lr=hparams.learning_rate,
)
```

So, what is the right way to use it in a DDP setup?

Should I put the criterion in the main model’s forward function as a submodule, wrap the criterion itself in DDP, or do something else?