I've shared a script below that measures the inference speed of a parameter-matched Transformer and LSTM, both with the same number of layers. I expected the Transformer to be faster as the sequence length grows, since it processes all positions of the input in parallel (the same property that enables teacher forcing during training), while the LSTM has to step through the sequence one token at a time. However, I find that the LSTM is faster. Is this expected? I tested on both A100 and V100 GPUs.
import torch
import torch.nn as nn
import tqdm
ARCHITECTURE = 'transformer'  # set to 'lstm' for the LSTM benchmark
transformer_dim = 256
lstm_dim = 312  # chosen so both models have roughly the same parameter count
nhead = 4
num_layers = 5
if ARCHITECTURE == 'transformer':
    d_model = transformer_dim
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=transformer_dim,
        nhead=nhead,
        dim_feedforward=transformer_dim*4,
        dropout=0,
        batch_first=True
    )
    model = nn.TransformerEncoder(
        encoder_layer=encoder_layer,
        num_layers=num_layers
    )
elif ARCHITECTURE == 'lstm':
    d_model = lstm_dim
    model = nn.LSTM(
        input_size=lstm_dim,
        hidden_size=lstm_dim,
        num_layers=num_layers,
        batch_first=True,
        dropout=0
    )
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
parameters = sum(p.numel() for p in model.parameters())
print(f'{parameters} parameters')
batch_size = 128
sequence_length = 1024
warmup_steps = 50
steps = 1000
log_frequency = 100
print('Warming up')
warmup_steps = tqdm.tqdm(range(warmup_steps))
for i in warmup_steps:
    batch = torch.rand((batch_size, sequence_length, d_model), device=device)
    out = model(batch)
torch.cuda.synchronize()
print('Finished warming up')
start = torch.cuda.Event(enable_timing=True)
start.record()
steps = tqdm.tqdm(range(steps))
for i in steps:
    batch = torch.rand((batch_size, sequence_length, d_model), device=device)
    out = model(batch)
    if i % log_frequency == 0 and i > 0:
        end = torch.cuda.Event(enable_timing=True)
        end.record()
        torch.cuda.synchronize()
        elapsed_time_ms = start.elapsed_time(end)
        elapsed_time_per_step_ms = elapsed_time_ms / log_frequency
        print(f'Elapsed time per step: {elapsed_time_per_step_ms}ms')
        start = torch.cuda.Event(enable_timing=True)
        start.record()
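For reference, here is a stripped-down variant of the same timing loop run under model.eval() and torch.inference_mode(), so autograd bookkeeping is excluded and only the forward pass is measured. It assumes model, d_model, batch_size, sequence_length, and device are defined as above; the 100 timed steps are an arbitrary choice.

# Same timing approach as above, but with autograd disabled.
# The 100-step count is arbitrary; no warmup shown here since the model is already warm.
model.eval()
with torch.inference_mode():
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(100):
        batch = torch.rand((batch_size, sequence_length, d_model), device=device)
        out = model(batch)
    end.record()
    torch.cuda.synchronize()
    print(f'Elapsed time per step (inference_mode): {start.elapsed_time(end) / 100}ms')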