One of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [8, 632, 512]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead

I have been trying to debug the following error, but I can't seem to find where it originates. I used torch.autograd.set_detect_anomaly(True), which shows the error coming from an nn.ReLU activation function. I went through a couple of websites and most of them suggested cloning the tensor, but that did not work.
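From what I understand, the message itself can be reproduced with a tiny standalone snippet (the shapes below are only chosen to mirror my error): ReLU saves its output for the backward pass, and any in-place modification of that saved tensor bumps its version counter, so backward() raises exactly this ReluBackward0 complaint.

import torch

torch.autograd.set_detect_anomaly(True)   # makes the traceback also show the forward-pass op

a = torch.randn(8, 632, 512, requires_grad=True)
b = torch.relu(a)        # autograd saves b, the ReLU output, for the backward pass
b += 1.0                 # in-place update bumps b's version counter from 0 to 1
b.sum().backward()       # RuntimeError: ... output 0 of ReluBackward0, is at version 1; expected version 0

So I assume something in my model is playing the role of b += 1.0 above, but I cannot see what.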
This is my model:

from typing import List, Optional

import torch
import torch.nn as nn

# Prenet, Postnet, hp, PositionalEmbedding, FairseqDropout, utils,
# lengths_to_padding_mask, S2TTransformerEncoder and TransformerDecoderLayer
# are defined elsewhere in my project and omitted here.

class TexttoSpeech(nn.Module):
  def __init__(
      self, 
      in_channels:int=1,
      patch_size:int=16,
      num_transformer_layers:int=6,
      embedding_dim:int=512,
      mlp_size:int=512,
      num_heads:int=2,
      attn_dropout:float=0.1,
      mlp_dropout:float=0.1,
      embedding_dropout:float=0.1,
      out_features:int=512
  ):
    super().__init__()
    self.DecoderPrenet = Prenet(input_size=hp.num_mels * hp.outputs_per_step, output_size=embedding_dim, hidden_size=out_features*2)
    self.padding_idx = 1
    self.pe = PositionalEmbedding(
            666, 512, self.padding_idx
        )
    self.dropout_module = FairseqDropout(
            0.1, module_name=self.__class__.__name__
        )
    self._future_mask = torch.empty(0)
    self.mel_linear = nn.Linear(embedding_dim, hp.num_mels * hp.outputs_per_step)
    self.transformer_layer = nn.TransformerDecoderLayer(
        d_model=embedding_dim,
        nhead=num_heads,
        dim_feedforward=mlp_size,
        dropout=mlp_dropout,
        activation='gelu',
        batch_first=True,
        norm_first=True,
    )
    self.transformer_decoder_layer = nn.TransformerDecoder(self.transformer_layer, num_layers=num_transformer_layers)
    self.n_transformer_layers = num_transformer_layers
    self.transformer_layers = nn.ModuleList(
            TransformerDecoderLayer(num_transformer_layers) for _ in range(self.n_transformer_layers)
        )
    self.postnet = Postnet(
            hp.num_mels * hp.outputs_per_step,
            512,
            5,
            5,
            0.5,
        )
    self.encoder = S2TTransformerEncoder()
    self.alignment_layer = num_transformer_layers-1
    self.pos_emb_alpha = nn.Parameter(torch.ones(1))
    self.layer_norm = nn.LayerNorm(embedding_dim)
  def buffered_future_mask(self, tensor):
    dim = tensor.size(0)
    if (
        self._future_mask.size(0) == 0
        or self._future_mask.device != tensor.device
        or self._future_mask.size(0) < dim
    ):
      self._future_mask = torch.triu(
          utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1
      )
    self._future_mask = self._future_mask.to(tensor)
    return self._future_mask[:dim, :dim]

  def forward(self, x, src):
    batch_size = x.shape[0]
    src_lengths = torch.Tensor([src.shape[1]]).long().cuda().clone()
    target_lengths = torch.Tensor([x.shape[1]]).long().cuda().clone()
    encoder_out = self.encoder(src, src_lengths)
    self_attn_padding_mask = lengths_to_padding_mask(target_lengths)
    positions = self.pe(self_attn_padding_mask, incremental_state=None)
    positions = positions[:, -1:]
    x = self.DecoderPrenet(x)
    x += self.pos_emb_alpha * positions
    x = self.dropout_module(x)
    x = x.transpose(0, 1)
    incremental_state = None
    self_attn_padding_mask = self_attn_padding_mask.expand(x.shape[1], -1)
    inner_states: List[Optional[torch.Tensor]] = [x]
    for idx, transformer_layer in enumerate(self.transformer_layers):
      if incremental_state is None:
        self_attn_mask = self.buffered_future_mask(x)
      else:
        self_attn_mask = None
      x, layer_attn, _ = transformer_layer(
          x,
          encoder_out if (encoder_out is not None and len(encoder_out) > 0) else None,
          incremental_state,
          self_attn_mask=self_attn_mask,
          self_attn_padding_mask=self_attn_padding_mask,
          need_attn=bool(idx == self.alignment_layer),
          need_head_weights=bool(idx == self.alignment_layer),
      )
      inner_states.append(x)
      if layer_attn is not None and idx == self.alignment_layer:
        attn = layer_attn.float().to(x)
    x = self.layer_norm(x)
    x = x.transpose(0, 1)
    # x = self.transformer_decoder_layer(x, memory.permute(1, 0, 2))
    mel_out = self.mel_linear(x)
    postconvnet = self.postnet(mel_out)
    out = mel_out + postconvnet
    return out, mel_out
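One line I am suspicious about (though I have not confirmed it is the culprit) is the in-place addition right after the prenet, x += self.pos_emb_alpha * positions, since += modifies its left-hand tensor in place. The out-of-place spelling would be:

    # out-of-place version of the addition after DecoderPrenet: allocates a new tensor
    # instead of overwriting x in place, so nothing autograd saved for backward gets modified
    x = x + self.pos_emb_alpha * positions

As far as I understand, the same would apply to any other in-place op (functions ending in an underscore, augmented assignments) applied between the ReLU and the call to backward().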

According to the anomaly trace, the error originates from this module:

from collections import OrderedDict

import torch.nn as nn


class Prenet(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, p=0.5):

        super(Prenet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.layer = nn.Sequential(OrderedDict([
             ('fc1', nn.Linear(self.input_size, self.hidden_size)),
             ('relu1', nn.ReLU()),
             ('dropout1', nn.Dropout(p)),
             ('fc2', nn.Linear(self.hidden_size, self.output_size)),
             ('relu2', nn.ReLU()),
             ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, input_):

        out = self.layer(input_)

        return out
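A standalone check of just this module (using placeholder sizes instead of the real hp values) would be something like the sketch below; if the backward pass succeeds here, then presumably the ReLU output saved by autograd is being modified later in the surrounding forward(), not inside this Sequential.

import torch

# placeholder sizes; the real model passes hp.num_mels * hp.outputs_per_step, etc.
prenet = Prenet(input_size=80, hidden_size=1024, output_size=512)
dummy = torch.randn(8, 632, 80, requires_grad=True)
out = prenet(dummy)        # out.shape == torch.Size([8, 632, 512])
out.sum().backward()       # completing without the version error means the Prenet alone does not trigger it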

Sorry, I was able to solve the issue by following the link.