TypeError: 'SLiMPerformer' object is not iterable

I am using the SLiMPerformer module as the head of a multitask prediction model that has two output tracks (one per task). Initially, I had issues like the SLiMPerformer requiring a `forward` method; now the issue is that the SLiMPerformer is not iterable, and I do not understand where the problem is. Any insights or help to solve this problem will be appreciated.

```python
from IPython.core.completerlib import module_list
from numpy.core.fromnumeric import shape
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.container import ModuleList, Sequential
from methods.utils.model_utilities_wale import (DoubleConv, init_layer)
#from methods.utils.model_utilities_orth import (CustomCNN, PositionalEncoding, init_layer)
#from methods.utils.cbam import CBAM
from performer_stuffs.slim_performer_model import SLiMPerformer

#import performer.models.slim_performer.pytorch.numerator_and_denominator as num_and_den
#from performer_stuffs import numerator_and_denominator as num_and_den


class EINV2(nn.Module):
    def __init__(self, cfg, dataset):
        super().__init__()
        self.pe_enable = False  # True | False

        if cfg['data']['audio_feature'] == 'logmel&intensity':
            self.f_bins = cfg['data']['n_mels']
            self.in_channels = 7
            n_layers = 2

        feature_type = 'logmel&intensity'
        self.downsample_ratio = 2 ** 2
        self.sed_conv_block1 = nn.Sequential(
            DoubleConv(in_channels=4, out_channels=64),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.sed_conv_block2 = nn.Sequential(
            DoubleConv(in_channels=64, out_channels=128),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.sed_conv_block3 = nn.Sequential(
            DoubleConv(in_channels=128, out_channels=256),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.sed_conv_block4 = nn.Sequential(
            DoubleConv(in_channels=256, out_channels=512),
            nn.AvgPool2d(kernel_size=(1, 2)),
        )

        self.doa_conv_block1 = nn.Sequential(
            DoubleConv(in_channels=self.in_channels, out_channels=64),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.doa_conv_block2 = nn.Sequential(
            DoubleConv(in_channels=64, out_channels=128),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.doa_conv_block3 = nn.Sequential(
            DoubleConv(in_channels=128, out_channels=256),
            nn.AvgPool2d(kernel_size=(2, 2)),
        )
        self.doa_conv_block4 = nn.Sequential(
            DoubleConv(in_channels=256, out_channels=512),
            nn.AvgPool2d(kernel_size=(1, 2)),
        )

        self.stitch = nn.ParameterList([
            nn.Parameter(torch.FloatTensor(64, 2, 2).uniform_(0.1, 0.9)),
            nn.Parameter(torch.FloatTensor(128, 2, 2).uniform_(0.1, 0.9)),
            nn.Parameter(torch.FloatTensor(256, 2, 2).uniform_(0.1, 0.9)),
        ])

        self.sed_slimperformer_track1 = SLiMPerformer(
            hidden_dim=512, ffn_dim=1024, n_heads=8, n_layers=2,
            compute_type='iter', feature_type='logmel&intensity',
            vocab_size=256, on_gptln=True)
        self.sed_slimperformer_track2 = SLiMPerformer(
            hidden_dim=512, ffn_dim=1024, n_heads=8, n_layers=2,
            compute_type='iter', feature_type='logmel&intensity',
            vocab_size=256, on_gptln=True)
        self.doa_slimperformer_track1 = SLiMPerformer(
            hidden_dim=512, ffn_dim=1024, n_heads=8, n_layers=2,
            compute_type='iter', feature_type='logmel&intensity',
            vocab_size=256, on_gptln=True)
        self.doa_slimperformer_track2 = SLiMPerformer(
            hidden_dim=512, ffn_dim=1024, n_heads=8, n_layers=2,
            compute_type='iter', feature_type='logmel&intensity',
            vocab_size=256, on_gptln=True)

        self.fc_sed_track1 = nn.Linear(512, 14, bias=True)
        self.fc_sed_track2 = nn.Linear(512, 14, bias=True)
        self.fc_doa_track1 = nn.Linear(512, 3, bias=True)
        self.fc_doa_track2 = nn.Linear(512, 3, bias=True)
        self.final_act_sed = nn.Sequential()  # nn.Sigmoid()
        self.final_act_doa = nn.Tanh()

        self.init_weight()

    def init_weight(self):
        init_layer(self.fc_sed_track1)
        init_layer(self.fc_sed_track2)
        init_layer(self.fc_doa_track1)
        init_layer(self.fc_doa_track2)

    def forward(self, x):
        """
        x: waveform, (batch_size, num_channels, data_length)
        """
        x_sed = x[:, :4]
        x_doa = x

        # cnn
        x_sed = self.sed_conv_block1(x_sed)
        x_doa = self.doa_conv_block1(x_doa)
        x_sed = torch.einsum('c, nctf -> nctf', self.stitch[0][:, 0, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[0][:, 0, 1], x_doa)
        x_doa = torch.einsum('c, nctf -> nctf', self.stitch[0][:, 1, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[0][:, 1, 1], x_doa)
        x_sed = self.sed_conv_block2(x_sed)
        x_doa = self.doa_conv_block2(x_doa)
        x_sed = torch.einsum('c, nctf -> nctf', self.stitch[1][:, 0, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[1][:, 0, 1], x_doa)
        x_doa = torch.einsum('c, nctf -> nctf', self.stitch[1][:, 1, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[1][:, 1, 1], x_doa)
        x_sed = self.sed_conv_block3(x_sed)
        x_doa = self.doa_conv_block3(x_doa)
        x_sed = torch.einsum('c, nctf -> nctf', self.stitch[2][:, 0, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[2][:, 0, 1], x_doa)
        x_doa = torch.einsum('c, nctf -> nctf', self.stitch[2][:, 1, 0], x_sed) + \
            torch.einsum('c, nctf -> nctf', self.stitch[2][:, 1, 1], x_doa)

        x_sed = self.sed_conv_block4(x_sed)
        x_doa = self.doa_conv_block4(x_doa)
        x_sed = x_sed.mean(dim=3)  # (N, C, T)
        x_doa = x_doa.mean(dim=3)  # (N, C, T)

        # transformer
        x_sed = x_sed.permute(2, 0, 1)  # (T, N, C)
        x_doa = x_doa.permute(2, 0, 1)
        for module in self.sed_slimperformer_track1:
            x_sed_1 = module(x_sed)
        for module in self.sed_slimperformer_track2:
            x_sed_2 = module(x_sed)
        for module in self.doa_slimperformer_track1:
            x_sed_1 = module(x_doa)
        for module in self.doa_slimperformer_track2:
            x_sed_2 = module(x_doa)

        x_sed = x_sed.permute(2, 0, 1)  # (T, N, C)
        x_doa = x_doa.permute(2, 0, 1)  # (T, N, C)

        x_sed_1 = self.sed_trans_track1(x_sed).transpose(0, 1)  # (N, T, C)
        x_sed_2 = self.sed_trans_track2(x_sed).transpose(0, 1)  # (N, T, C)
        x_doa_1 = self.doa_trans_track1(x_doa).transpose(0, 1)  # (N, T, C)
        x_doa_2 = self.doa_trans_track2(x_doa).transpose(0, 1)  # (N, T, C)

        x_sed_1 = self.final_act_sed(self.fc_sed_track1(x_sed_1))
        x_sed_2 = self.final_act_sed(self.fc_sed_track2(x_sed_2))
        x_sed = torch.stack((x_sed_1, x_sed_2), 2)
        x_doa_1 = self.final_act_doa(self.fc_doa_track1(x_doa_1))
        x_doa_2 = self.final_act_doa(self.fc_doa_track2(x_doa_2))
        x_doa = torch.stack((x_doa_1, x_doa_2), 2)
        output = {
            'sed': x_sed,
            'doa': x_doa,
        }

        return output
```

Could you post the stacktrace pointing to the failing line of code?

PS: you can post code snippets by wrapping them into three backticks ```

```
TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/mnt/antares_raid/home/wale/anaconda3/envs/ein/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/mnt/antares_raid/home/wale/anaconda3/envs/ein/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/raid/ni/WALE_SEdl/EIN-SELD/seld/methods/ein_seld/models/seld.py", line 140, in forward
    for module in self.sed_slimperformer_track1:
TypeError: 'SLiMPerformer' object is not iterable
```

I don't know where SLiMPerformer is defined, so you might need to check its implementation and make sure it's iterable, e.g. an nn.ModuleList.
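To illustrate the difference, here is a minimal sketch (the `Head` class is just a made-up example): container modules such as `nn.ModuleList` or `nn.Sequential` define `__iter__`, so you can loop over their submodules, while a plain `nn.Module` subclass does not and has to be called directly.

```python
import torch
import torch.nn as nn

# Container modules are iterable: each element is itself a module.
blocks = nn.ModuleList([nn.Linear(8, 8) for _ in range(3)])
x = torch.randn(2, 8)
for block in blocks:
    x = block(x)  # fine

# A plain nn.Module subclass is NOT iterable.
class Head(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)

head = Head()
y = head(x)        # fine: call the module directly
# for m in head:   # TypeError: 'Head' object is not iterable
```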

Attached is the SLiMPerformer class that is called in the script.
Thanks

(Attachment SLiMPerform.py is missing)

Here is the implementation of the SLiMPerformer class that is called in my model, for clarity. Any help is appreciated.

```python
"""Main SLiMPerformer model."""
import numpy as np
import torch

from performer_stuffs import numerator_and_denominator as num_and_den
#import performer.models.slim_performer.pytorch.numerator_and_denominator as num_and_den


def valid_feature_type(feature_type):
    bool1 = feature_type in ['relu', 'elu+1', 'logmel&intensity', 'sqr', 'favor+']
    bool2 = feature_type.startswith('favor+') and feature_type.split('_')[1].isdigit()
    return bool1 or bool2


class SLiMPerformer(torch.nn.Module):
    """Full SLiMPerformer Transformer model."""

    def __init__(self, vocab_size, hidden_dim, n_layers, ffn_dim, n_heads,
                 feature_type, compute_type, on_gptln):
        assert valid_feature_type(feature_type)
        assert compute_type in ['ps', 'iter', 'parallel_ps']
        super(SLiMPerformer, self).__init__()

        self._vocab_size = vocab_size
        self._hidden_dim = hidden_dim
        self._feature_type = feature_type

        self.input_map = torch.nn.Embedding(vocab_size, hidden_dim // 2)
        self.output_logit_map = torch.nn.Linear(hidden_dim, vocab_size)

        self.layers = torch.nn.ModuleList([
            SLiMPerformerLayer(hidden_dim, ffn_dim, n_heads, feature_type,
                               compute_type, on_gptln) for _ in range(n_layers)
        ])

    def full_forward(self, x):
        """Naive full forward pass."""

        x = self.input_map(x)
        x = self._concat_pos_embs(x, 0)

        for layer in self.layers:
            x = layer.full_forward(x, layer.attention.sample_rfs(x.device))

        x = self.output_logit_map(x)

        return x

    def full_loss(self, inputs, with_grad=False, nonpad_mask=None,
                  return_acc=False):
        """Naive full loss and grad."""

        if nonpad_mask is None:
            nonpad_mask = torch.ones_like(inputs, dtype=torch.bool)

        float_nonpad_mask = nonpad_mask.float()

        logits = self.full_forward(inputs[:, :-1])
        logits = logits.transpose(1, 2)
        neg_loglikes = torch.nn.functional.cross_entropy(
            logits, inputs[:, 1:], reduction='none')

        loss = (neg_loglikes * float_nonpad_mask[:, 1:]).sum()
        loss = loss / (1e-8 + float_nonpad_mask[:, 1:].sum())

        if with_grad:
            loss.backward()

        if return_acc:
            correct = nonpad_mask[:, 1:] & (inputs[:, 1:] == torch.argmax(logits, 1))
            acc = correct.float().sum() / float_nonpad_mask.sum()
            return loss, acc

        return loss

    def loss_with_grad(self, inputs, step_size, nonpad_mask=None,
                       return_acc=False):
        """step_size: size of a parallel step, 1 corresponds to a fully-sequential mode."""

        if nonpad_mask is None:
            nonpad_mask = torch.ones_like(inputs, dtype=torch.bool)

        loss = 0.0
        sums = None
        x = inputs[:, :-1]

        if return_acc:
            acc = 0.0

        if self._feature_type == 'favor+':
            rfs = [layer.attention.sample_rfs(inputs.device) for layer in self.layers]
        else:
            rfs = [None] * len(self.layers)

        loss_normalization = nonpad_mask[:, 1:].float().sum() + 1e-8

        with torch.no_grad():

            for start_index, end_index, cur_x, cur_sums in self._forward_gen(
                    step_size, x, rfs):

                sums = cur_sums

                cur_gt_preds = inputs[:, start_index + 1:end_index + 1]
                cur_nonpad_mask = nonpad_mask[:, start_index + 1:end_index + 1]
                float_cur_nonpad_mask = cur_nonpad_mask.float()

                c_x = cur_x.transpose(1, 2)

                neg_loglikes = torch.nn.functional.cross_entropy(
                    c_x, cur_gt_preds, reduction='none')
                cur_loss = (neg_loglikes * float_cur_nonpad_mask).sum()

                loss = loss + cur_loss.detach().clone()

                if return_acc:
                    correct = cur_nonpad_mask & (cur_gt_preds == torch.argmax(c_x, 1))
                    acc = acc + correct.float().sum().detach().clone()

                torch.cuda.synchronize()
                torch.cuda.empty_cache()

        loss = loss / loss_normalization

        if return_acc:
            acc = acc / loss_normalization

        ps_gradients = None

        seq_len = int(inputs.shape[1]) - 1
        start_indices = np.arange(0, seq_len, step_size)[::-1]

        for start_index in start_indices:

            end_index = min(seq_len, start_index + step_size)

            cur_x = x[:, start_index:end_index]
            cur_gt_preds = inputs[:, start_index + 1:end_index + 1]
            float_cur_nonpad_mask = nonpad_mask[:, start_index + 1:end_index + 1].float()

            cur_x = self.input_map(cur_x)
            cur_x = self._concat_pos_embs(cur_x, start_index)

            new_sums = []
            cur_sums = []

            for layer, (num_sums, den_sums), layer_rfs in zip(self.layers, sums, rfs):

                cur_x, new_num_sums, new_den_sums, cur_num_sums, cur_den_sums = layer.incr_step(
                    cur_x, num_sums, den_sums, False, layer_rfs, start_index == 0)

                new_sums.append((new_num_sums, new_den_sums))
                cur_sums.append((cur_num_sums, cur_den_sums))

            sums = new_sums

            cur_x = self.output_logit_map(cur_x)

            cur_x = cur_x.transpose(1, 2)

            neg_loglikes = torch.nn.functional.cross_entropy(
                cur_x, cur_gt_preds, reduction='none')
            cur_loss = (neg_loglikes * float_cur_nonpad_mask).sum()

            cur_loss = cur_loss / loss_normalization

            if ps_gradients is not None:
                cur_loss = cur_loss + sum([(z[0] * y[0]).sum() + (z[1] * y[1]).sum()
                                           for z, y in zip(cur_sums, ps_gradients)
                                           if (y[0] is not None) and (y[1] is not None)])

            cur_loss.backward()

            ps_gradients = [[(None if y.grad is None else y.grad.detach().clone())
                             for y in z] for z in sums]

            torch.cuda.synchronize()
            torch.cuda.empty_cache()

        if return_acc:
            return loss, acc

        return loss

    def _forward_gen(self, step_size, x, rfs):

        sums = [layer.attention.init_sums(x.device) for layer in self.layers]

        seq_len = int(x.shape[1])

        for start_index in range(0, seq_len, step_size):

            end_index = min(seq_len, start_index + step_size)

            cur_x = self.input_map(x[:, start_index:end_index])
            cur_x = self._concat_pos_embs(cur_x, start_index)

            new_sums = []

            for layer, (num_sums, den_sums), layer_rfs in zip(self.layers, sums, rfs):
                cur_x, new_num_sums, new_den_sums = layer.incr_step(
                    cur_x, num_sums, den_sums, True, layer_rfs, start_index == 0)
                new_sums.append((new_num_sums, new_den_sums))

            sums = new_sums

            cur_x = self.output_logit_map(cur_x)

            yield start_index, end_index, cur_x, sums

    def _concat_pos_embs(self, x, start_index):

        pos_emb_size = self._hidden_dim // 2

        positions = torch.arange(
            start_index, start_index + x.shape[1], dtype=x.dtype, device=x.device)
        freqs = torch.exp(
            torch.arange(0, pos_emb_size, 2, dtype=x.dtype, device=x.device) *
            (-np.log(10000) / pos_emb_size))
        args = positions[None, :, None] * freqs[None, None, :]
        sin_pos_embs = torch.sin(args) * torch.ones_like(x[:, :1, :1])
        cos_pos_embs = torch.cos(args) * torch.ones_like(x[:, :1, :1])

        return torch.cat([x, sin_pos_embs, cos_pos_embs], 2)


class SLiMPerformerLayer(torch.nn.Module):
    """Single SLiMPerformer layer (MLPs + Attention + LayerNorm)."""

    def __init__(self, hidden_dim, ffn_dim, n_heads, feature_type, compute_type,
                 on_gptln):

        super(SLiMPerformerLayer, self).__init__()

        self.attention = MultiHeadAttention(feature_type, n_heads, hidden_dim,
                                            compute_type)

        self.U_map = torch.nn.Linear(hidden_dim, ffn_dim)
        self.V_map = torch.nn.Linear(ffn_dim, hidden_dim)
        self.layernorm1 = torch.nn.LayerNorm(hidden_dim)
        self.layernorm2 = torch.nn.LayerNorm(hidden_dim)

        self._on_gptln = on_gptln

    def full_forward(self, x, rfs):

        skip = x

        if not self._on_gptln:
            x = self.layernorm1(x)

        x = self.attention.full_forward(x, rfs)

        if self._on_gptln:
            x = self.layernorm1(x)

        x = skip + x

        x = self._ffn(x)

        return x
```

(The rest of the class is omitted.)

Your code is a bit hard to read as it's not properly formatted. However, it seems SLiMPerformer is implemented as a plain nn.Module subclass and is thus not iterable. You might want to call it directly instead.
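For example, based on the implementation you posted, `SLiMPerformer` keeps its sublayers in `self.layers` (an `nn.ModuleList`) and exposes `full_forward` rather than a standard `forward`. A minimal sketch of both options, assuming your inputs are compatible (note that `full_forward` starts with an `nn.Embedding`, so it expects integer token ids rather than the float CNN features your model produces):

```python
# Option 1: call the model's own entry point instead of iterating it.
# The posted class defines full_forward (not forward), so a plain
# self.sed_slimperformer_track1(x_sed) call would fail as well.
x_sed_1 = self.sed_slimperformer_track1.full_forward(x_sed)

# Option 2: iterate over the internal nn.ModuleList, which is iterable.
# Each SLiMPerformerLayer.full_forward also takes random features
# sampled from its attention module.
for layer in self.sed_slimperformer_track1.layers:
    x_sed = layer.full_forward(x_sed, layer.attention.sample_rfs(x_sed.device))
```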