Is this a correct use of nn.Transformer?

import torch, torch.nn as nn
class A(nn.Module):
  """Toy seq2seq model: encode a binary source sequence and decode a
  two-token binary target with nn.Transformer.

  forward() returns raw logits permuted to (tgt_len, num_classes, batch),
  which is the (N, C, d1) layout nn.CrossEntropyLoss expects against a
  (N, d1) integer target. Do NOT apply softmax before CrossEntropyLoss —
  it already combines log-softmax and NLL; `self.softmax` is kept only
  for optionally inspecting probabilities.
  """
  def __init__(self):
    super().__init__()
    self.embed_src = nn.Embedding(2, 10)      # vocab {0, 1} -> d_model=10
    self.embed_target = nn.Embedding(2, 10)   # trained normally; no need for requires_grad=False
    self.transformer = nn.Transformer(10, 2)  # d_model=10, nhead=2 (10 % 2 == 0, required)
    self.lin = nn.Linear(10, 2)               # project d_model back to 2-class logits
    self.softmax = nn.Softmax(dim=-1)         # inspection only, not used in the loss path

  def forward(self, inp, tgt):
    # nn.Transformer defaults to batch_first=False, so reshape embeddings
    # to (seq_len, batch=1, d_model).
    embed_src = self.embed_src(inp).view(len(inp), 1, -1)
    embed_target = self.embed_target(tgt).view(len(tgt), 1, -1)
    # Fix: without a causal mask the decoder attends to future target
    # positions and can trivially copy the token it should predict.
    tgt_mask = self.transformer.generate_square_subsequent_mask(len(tgt)).to(embed_target.device)
    output = self.transformer(embed_src, embed_target, tgt_mask=tgt_mask)
    output = self.lin(output)                 # (tgt_len, 1, 2) logits
    # (tgt_len, 1, 2) -> (tgt_len, 2, 1): CE sees N=tgt_len, C=2, d1=batch.
    return output.permute(0, 2, 1)
# Instantiate the model and a plain SGD optimizer over all of its
# parameters (both embeddings, the transformer, and the linear head).
model = A()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Six hand-made training pairs: a length-6 binary source sequence and a
# (2, 1)-shaped binary target sequence.
dataset = [
    [[0, 1, 0, 1, 0, 1], [[0], [1]]],
    [[1, 0, 1, 0, 1, 0], [[1], [0]]],
    [[0, 0, 0, 0, 0, 0], [[0], [0]]],
    [[1, 1, 1, 1, 1, 1], [[1], [1]]],
    [[1, 1, 0, 0, 1, 1], [[0], [0]]],
    [[0, 0, 1, 1, 0, 0], [[1], [1]]],
]
# Convert every (source, target) pair to tensors in one pass.
tensor_dataset = [[torch.tensor(src), torch.tensor(tgt)] for src, tgt in dataset]
tensor_dataset

[
[tensor([0, 1, 0, 1, 0, 1]), tensor([[0], [1]])],
[tensor([1, 0, 1, 0, 1, 0]), tensor([[1], [0]])],
[tensor([0, 0, 0, 0, 0, 0]), tensor([[0], [0]])],
[tensor([1, 1, 1, 1, 1, 1]), tensor([[1], [1]])],
[tensor([1, 1, 0, 0, 1, 1]), tensor([[0], [0]])],
[tensor([0, 0, 1, 1, 0, 0]), tensor([[1], [1]])]
]

# CrossEntropyLoss takes raw logits shaped (N, C, d1) and an integer
# target shaped (N, d1) — which is what model(...) returns here.
criterion = nn.CrossEntropyLoss()
for step in range(1000):
  optimizer.zero_grad()
  src, tgt = tensor_dataset[step % 6]
  print(src, tgt)
  logits = model(src, tgt)
  loss = criterion(logits, tgt)
  print(loss)
  loss.backward()
  optimizer.step()

Do I need to set requires_grad=False on self.embed_target's weights?
Is output.permute(0, 2, 1) the correct way to arrange the logits for nn.CrossEntropyLoss?