Parameter-matched Transformer is slower than RNN

I shared a script below that measures the inference speed of parameter-matched Transformers and LSTMs, both with the same number of layers. I expected the Transformer to be faster as the sequence length increases, since — unlike the LSTM's sequential recurrence — the Transformer can process all timesteps of the sequence in parallel. However, I find that the LSTM is faster. Is this expected? I tested on both A100 and V100 GPUs.

import torch
import torch.nn as nn
import tqdm

# --- Model configuration -------------------------------------------------
# The two hidden sizes are chosen so that the Transformer and the LSTM end
# up with roughly the same total parameter count ("parameter-matched").
ARCHITECTURE = 'transformer'  # one of: 'transformer', 'lstm'
transformer_dim = 256
lstm_dim = 312
nhead = 4
num_layers = 5

if ARCHITECTURE == 'transformer':
    d_model = transformer_dim
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=transformer_dim,
        nhead=nhead,
        dim_feedforward=transformer_dim * 4,  # conventional 4x FFN expansion
        dropout=0,
        batch_first=True
    )
    model = nn.TransformerEncoder(
        encoder_layer=encoder_layer,
        num_layers=num_layers
    )
elif ARCHITECTURE == 'lstm':
    d_model = lstm_dim
    model = nn.LSTM(
        input_size=lstm_dim,
        hidden_size=lstm_dim,
        num_layers=num_layers,
        batch_first=True,
        dropout=0
    )
else:
    # Fail fast: without this branch a typo'd ARCHITECTURE would leave
    # `d_model` and `model` undefined and crash much later with a
    # confusing NameError.
    raise ValueError(f'Unknown ARCHITECTURE: {ARCHITECTURE!r}')

# Move the model to the GPU when one is available, and switch it to
# inference mode: the script benchmarks inference, and eval() both
# disables train-time behavior (dropout etc.) and is a prerequisite for
# nn.TransformerEncoder's fused fast-path attention kernels.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

# Report the total parameter count so the two architectures can be
# compared on equal footing.
parameters = sum(p.numel() for p in model.parameters())
print(f'{parameters} parameters')

# Benchmark settings.
batch_size = 128
sequence_length = 1024
warmup_steps = 50
steps = 1000
log_frequency = 100

# Inference benchmark: run everything under no_grad so autograd does not
# build a graph on every forward pass — otherwise graph construction and
# activation retention are (incorrectly) included in the measured time.
print('Warming up')
with torch.no_grad():
    # Warm-up passes are untimed so kernel compilation / cuDNN autotuning
    # does not pollute the measurement.
    for _ in tqdm.tqdm(range(warmup_steps)):
        batch = torch.rand((batch_size, sequence_length, d_model), device=device)
        out = model(batch)

    # NOTE(review): the CUDA-event timing below requires a GPU; the script
    # will fail from here on when device == 'cpu' — confirm this is only
    # ever run on CUDA machines.
    torch.cuda.synchronize()
    print('Finished warming up')

    start = torch.cuda.Event(enable_timing=True)
    start.record()
    for i in tqdm.tqdm(range(steps)):
        # Input generation sits inside the timed region, so it is counted
        # as part of the per-step cost for both architectures alike.
        batch = torch.rand((batch_size, sequence_length, d_model), device=device)
        out = model(batch)
        if i % log_frequency == 0 and i > 0:
            # Close the current timing window and report the mean step time.
            end = torch.cuda.Event(enable_timing=True)
            end.record()
            torch.cuda.synchronize()
            elapsed_time_ms = start.elapsed_time(end)
            elapsed_time_per_step_ms = elapsed_time_ms / log_frequency
            print(f'Elapsed time per step: {elapsed_time_per_step_ms}ms')
            # Open a fresh window for the next log_frequency steps.
            start = torch.cuda.Event(enable_timing=True)
            start.record()