Hello community,

I am working on creating a binary classifier by using the Transformer-Encoder architecture.

Most of my code is based on this blog - http://nlp.seas.harvard.edu/2018/04/03/attention.html and

```
class TransformerModel(nn.Module):
"""
"""
def __init__(self, n_token, n_dim_model, n_head, n_hidden, n_blocks, dropout=0.5):
super(TransformerModel, self).__init__()
# #Multi Headed Attention Layer
self_attention = MultiHeadedAttention(n_head, n_dim_model)
# #Feedforward Layer
feed_forward = FeedForwardLayer(n_dim_model, n_hidden, dropout)
# #Positional Encoding
positional_encoding = PositionalEncoding(n_dim_model, dropout)
encoder_layer = EncoderLayer(n_dim_model, copy.deepcopy(self_attention), copy.deepcopy(feed_forward), dropout)
self.encoder = Encoder(encoder_layer, n_blocks)
embedding = Embeddings(n_dim_model, n_token)
self.src_embed = nn.Sequential(embedding, copy.deepcopy(positional_encoding))
# Fully-Connected Layer
self.fc = nn.Linear(n_dim_model, 2)
# Softmax non-linearity
self.softmax = nn.Softmax()
def forward(self, x):
# # x dimension[k, batch_size = 64]
embedded_sents = self.src_embed(x.permute(1,0)) # shape = (batch_size=64, k, d_model=256)
encoded_sents = self.encoder(embedded_sents)
# Convert input to (batch_size, d_model) for linear layer
final_feature_map = encoded_sents[:,-1,:] # #Output of this is shape ([64, 256])
final_out = self.fc(final_feature_map) # #output of tjhis is shape torch.Size([64, 2])
return self.softmax(final_out) # #output of tjhis is shape torch.Size([64, 2])
```

and my model training step is

```
def train(model, iterator, optimizer, criterion):
epoch_loss = 0
model.train()
for batch in iterator:
# #Reset gradients after every batch
optimizer.zero_grad()
predictions = model(batch.X)
predictions = torch.max(predictions.data, 1)[1]
predictions = predictions.float()
loss = criterion(predictions, batch.label)
loss.backward() # #Error Here
optimizer.step()
epoch_loss += loss.item()
return epoch_loss/len(iterator)
```

Error message is

```
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
164 products. Defaults to ``False``.
165 """
--> 166 torch.autograd.backward(self, gradient, retain_graph, create_graph)
167
168 def register_hook(self, hook):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
97 Variable._execution_engine.run_backward(
98 tensors, grad_tensors, retain_graph, create_graph,
---> 99 allow_unreachable=True) # allow_unreachable flag
100
101
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
```

Any suggestions or help around this would be really appreciated!

Thank you