# Parallelize loop over experts

Hello,

I am working on the Switch Transformers model (https://www.jmlr.org/papers/volume23/21-0998/21-0998.pdf), using the codebase from labml.ai (Switch Transformer). It provides the following implementation of a Sparse Mixture-of-Experts layer:

class SwitchFeedForward(Module):
    """
    ## Routing among multiple FFNs

    Switch Transformer Mixture-of-Experts layer: every token is routed to
    exactly one expert FFN — the one with the highest routing probability.
    """

    def __init__(self, *,
                 capacity_factor: float,
                 drop_tokens: bool,
                 is_scale_prob: bool,
                 n_experts: int,
                 expert: FeedForward,
                 d_model: int):
        """
        * `capacity_factor` is the capacity of each expert as a factor relative to ideally balanced load
        * `drop_tokens` specifies whether to drop tokens if more tokens are routed to an expert than the capacity
        * `is_scale_prob` specifies whether to multiply the input to the FFN by the routing probability
        * `n_experts` is the number of experts
        * `expert` is the expert layer, a [FFN module](../feed_forward.html)
        * `d_model` is the number of features in a token embedding
        """
        super().__init__()

        self.capacity_factor = capacity_factor
        self.is_scale_prob = is_scale_prob
        self.n_experts = n_experts
        self.drop_tokens = drop_tokens

        # Make copies of the FFNs, one per expert
        self.experts = clone_module_list(expert, n_experts)
        # Routing layer and softmax
        self.switch = nn.Linear(d_model, n_experts)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x: torch.Tensor):
        """
        * `x` is the input to the switching module with shape `[seq_len, batch_size, d_model]`

        Returns a tuple of:

        * the final output, shape `[seq_len, batch_size, d_model]`
        * number of tokens routed to each expert
        * sum of routing probabilities for each expert
        * number of tokens dropped
        * routing probabilities of the selected experts

        The last four are used for the load balancing loss and logging.
        """
        # Capture the shape to change shapes later
        seq_len, batch_size, d_model = x.shape
        # Flatten the sequence and batch dimensions
        x = x.view(-1, d_model)

        # Get routing probabilities for each of the tokens.
        # $$p_i(x) = \frac{e^{h(x)_i}}{\sum^N_j e^{h(x)_j}}$$
        # where $N$ is the number of experts `n_experts` and
        # $h(\cdot)$ is the linear transformation of token embeddings.
        route_prob = self.softmax(self.switch(x))

        # Get the maximum routing probabilities and the routes.
        # We route each token to the expert with the highest probability.
        route_prob_max, routes = torch.max(route_prob, dim=-1)

        # Get indexes of tokens going to each expert
        indexes_list = [torch.eq(routes, i).nonzero(as_tuple=True)[0] for i in range(self.n_experts)]

        # Initialize an empty tensor to store outputs
        final_output = x.new_zeros(x.shape)

        # Capacity of each expert.
        # $$\mathrm{expert\;capacity} =
        #   \frac{\mathrm{tokens\;per\;batch}}{\mathrm{number\;of\;experts}}
        #   \times \mathrm{capacity\;factor}$$
        capacity = int(self.capacity_factor * len(x) / self.n_experts)
        # Number of tokens routed to each expert
        counts = x.new_tensor([len(indexes_list[i]) for i in range(self.n_experts)])

        # Initialize an empty list of dropped tokens
        dropped = []
        # Only drop tokens if `drop_tokens` is `True`
        if self.drop_tokens:
            # Drop tokens in each of the experts
            for i in range(self.n_experts):
                # Ignore if the expert is not over capacity
                if len(indexes_list[i]) <= capacity:
                    continue
                # Shuffle indexes before dropping so the drop is unbiased
                indexes_list[i] = indexes_list[i][torch.randperm(len(indexes_list[i]))]
                # Collect the tokens over capacity as dropped tokens
                dropped.append(indexes_list[i][capacity:])
                # Keep only the tokens up to the capacity of the expert
                indexes_list[i] = indexes_list[i][:capacity]

        # Get outputs of the expert FFNs
        expert_output = [self.experts[i](x[indexes_list[i], :]) for i in range(self.n_experts)]

        # Scatter expert outputs back to their token positions
        for i in range(self.n_experts):
            final_output[indexes_list[i], :] = expert_output[i]

        # Pass the dropped tokens through unchanged (identity)
        if dropped:
            dropped = torch.cat(dropped)
            final_output[dropped, :] = x[dropped, :]

        if self.is_scale_prob:
            # Multiply the expert outputs by the routing probabilities $y = p_i(x) E_i(x)$
            final_output = final_output * route_prob_max.view(-1, 1)
        else:
            # Don't scale the values but multiply by $\frac{p}{\hat{p}} = 1$ so that the
            # gradients still flow through the router (an experimental variant).
            final_output = final_output * (route_prob_max / route_prob_max.detach()).view(-1, 1)

        # Change the shape of the final output back to `[seq_len, batch_size, d_model]`
        final_output = final_output.view(seq_len, batch_size, d_model)

        # Fix: the pasted version ended with a dangling trailing comma and omitted
        # `route_prob_max`, although the comment above documents five return values.
        return final_output, counts, route_prob.sum(0), len(dropped), route_prob_max


The main drawback of this implementation is the Python-level for loop over the experts, which runs their forward passes sequentially:

 expert_output = [self.experts[i](x[indexes_list[i], :]) for i in range(self.n_experts)]

I wonder if it would be possible to parallelize this operation? How would you approach this problem?