Embedding Error Index out of Range in self

TheOraware · May 21, 2021, 4:20am

Even though I believe my indices within a batch are no larger than the embedding's input size minus 1, I am still getting `IndexError: index out of range in self`. Here are my data and code. In each batch the Age category tensor holds 20 rows and the embedding's input size (`num_embeddings`) is 70, so I don't know why an indexing error is thrown.

# Load the churn dataset and take a quick look at it.
data = pd.read_csv('Churn_Modelling.csv')
print("Shape:", data.shape)
data.head()

# Take an explicit copy of the feature columns: the 'Age' dtype is changed
# further down, and writing into a slice of `data` would otherwise trigger
# pandas' SettingWithCopyWarning.
X_train = data[['Age','Balance']].copy()
y_train = pd.DataFrame(data['Exited'])
X_train

Shape: (10000, 14)

	Age  	Balance
	----	-------
0	 42	        0.00
1	41	    83807.86
2	42	   159660.80
3	39	        0.00
4	43	   125510.82

10000 rows × 2 columns

y_train

	Exited
	-------
0	1
1	0
2	1
3	0
4	0

10000 rows × 1 columns

# Columns that are fed to embedding layers; each one is stored as a pandas
# categorical so that it carries a fixed, ordered set of categories.
features  = ['Age']
for col in features:
    # Assign the whole column instead of going through `.loc[:, col] = ...`:
    # on recent pandas versions the `.loc` form writes into the existing
    # column in place and can keep the old dtype instead of switching the
    # column to 'category'.
    X_train[col] = X_train[col].astype('category')
X_train.dtypes

Age        category
Balance     float64
dtype: object

# Map each embedded column name to the number of distinct categories it holds.
embedded_cols = {
    name: len(X_train[name].cat.categories)
    for name in features
}
embedded_cols

{'Age': 70}


 class ShelterOutcomeDataset(Dataset):
     def __init__(self, X, Y, embedded_col_names):
         X = X.copy()
         self.X1 = X.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns
         self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns
         self.y = Y.copy().values.astype(np.int64)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]

# One (num_embeddings, embedding_dim) pair per embedded column: the embedding
# width is half the category count (rounded up), capped at 50.
embedding_sizes = [
    (count, min(50, (count + 1) // 2))
    for count in embedded_cols.values()
]
embedding_sizes

[(70, 35)]   

# Wrap the training features and labels; 'Age' is the only embedded column.
train_ds = ShelterOutcomeDataset(X=X_train, Y=y_train, embedded_col_names=['Age'])

class testNet(nn.Module):
    """Small tabular classifier combining embeddings with continuous inputs.

    Args:
        emb_dims: iterable of ``(num_categories, embedding_width)`` pairs, one
            per categorical column.
        n_cont: number of continuous input features.
    """

    def __init__(self, emb_dims, n_cont):
        super().__init__()

        # One embedding table per categorical column.
        tables = []
        for categories, size in emb_dims:
            tables.append(nn.Embedding(categories, size))
        self.embeddings = nn.ModuleList(tables)

        # Total width of all embedding outputs placed side by side.
        self.n_emb = sum(table.embedding_dim for table in self.embeddings)
        self.n_cont = n_cont

        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 6)
        self.lin2 = nn.Linear(6, 4)
        self.lin3 = nn.Linear(4, 2)

        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(6)
        self.bn3 = nn.BatchNorm1d(4)

        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, x_cat, x_cont):
        """Return the two-column output of ``lin3`` for one batch.

        ``x_cat`` is indexed as ``x_cat[:, i]`` for the i-th embedding table;
        ``x_cont`` is batch-normalised and concatenated with the embeddings
        along dimension 1.
        """
        # Look up column i of x_cat in the i-th embedding table.
        looked_up = []
        for column, table in enumerate(self.embeddings):
            looked_up.append(table(x_cat[:, column]))
        embedded = self.emb_drop(torch.cat(looked_up, 1))

        # Batch-normalise the continuous features, then join them with the
        # embeddings column-wise (dim 1).
        normed_cont = self.bn1(x_cont)
        hidden = torch.cat([embedded, normed_cont], 1)

        # Two hidden stages: linear -> ReLU -> dropout -> batch norm.
        hidden = self.bn2(self.drops(F.relu(self.lin1(hidden))))
        hidden = self.bn3(self.drops(F.relu(self.lin2(hidden))))

        return self.lin3(hidden)
        
import torch.nn as nn

criterion = nn.CrossEntropyLoss()


def train_model(model, optim, train_dl):
    """Run one training pass over ``train_dl``.

    Args:
        model: module called as ``model(cat, cont)``.
        optim: optimizer stepped once per batch.
        train_dl: iterable of ``(cat, cont, y)`` batches; ``y`` is squeezed
            along dimension 1 before being passed to the loss.

    Returns:
        A tuple ``(mean_loss, pred)``: the per-sample average loss over the
        whole pass and the predicted class indices of the *last* batch only.
    """
    model.train()
    seen = 0
    weighted_loss = 0
    for x_cat, x_cont, labels in train_dl:
        n_rows = labels.shape[0]

        print(x_cat.size())  # size of the features which have to be embedded

        logits = model(x_cat, x_cont)
        pred = torch.max(logits, 1).indices

        # CrossEntropyLoss expects 1-D integer class targets.
        targets = labels.to(torch.float32).squeeze(1).long()
        loss = criterion(logits, targets)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # Weight each batch loss by its size so the result is a per-sample mean.
        seen += n_rows
        weighted_loss += n_rows * loss.item()
    return weighted_loss / seen, pred



def train_loop(model, epochs, lr=0.01, wd=0.0):
    """Train ``model`` for ``epochs`` passes, reporting every fifth epoch.

    Args:
        model: module handed to ``train_model``.
        epochs: number of passes over the module-level ``train_dl``.
        lr: value forwarded to ``get_optimizer`` as ``lr``.
        wd: value forwarded to ``get_optimizer`` as ``wd``.

    Note:
        ``get_optimizer`` is defined outside this excerpt; it is assumed to
        return an optimizer for ``model`` -- TODO confirm.
    """
    optim = get_optimizer(model, lr = lr, wd = wd)
    for epoch in range(epochs):
        loss, pred = train_model(model, optim, train_dl)
        if (epoch + 1) % 5 == 0:
            # Report the predictions returned by train_model; the previous
            # code referenced `output`, which is not defined in this scope
            # and raised NameError on the fifth epoch.
            print(f'epoch : {epoch+1},training loss : {loss}, output : {pred}')

# Mini-batches of 20 rows, reshuffled on every pass.
batch_size = 20
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
#valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)    

# DeviceDataLoader and `device` are defined outside this excerpt; presumably
# the wrapper moves each batch to `device` -- TODO confirm.
train_dl = DeviceDataLoader(train_dl, device)
# valid_dl = DeviceDataLoader(valid_dl, device)

# model = ShelterOutcomeModel(embedding_sizes,0)

# One embedding table per entry of embedding_sizes, plus 1 continuous feature.
model = testNet(embedding_sizes,1)
print(model)

from collections import defaultdict
# NOTE(review): `opt` is never passed to train_loop, which builds its own
# optimizer through get_optimizer, so this Adam instance appears to be unused.
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
# NOTE(review): the model is not moved to `device` here (the call below is
# commented out) while the batches go through DeviceDataLoader -- verify both
# end up on the same device.
# to_device(model, device)

# Train for 100 passes over train_dl; train_loop reports every fifth epoch.
train_loop(model, epochs=100, lr=0.01, wd=0.00001)

testNet(
  (embeddings): ModuleList(
    (0): Embedding(70, 35)
  )
  (lin1): Linear(in_features=36, out_features=6, bias=True)
  (lin2): Linear(in_features=6, out_features=4, bias=True)
  (lin3): Linear(in_features=4, out_features=2, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)
torch.Size([20, 1])

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-3281-888e52d4559c> in <module>
     74 # to_device(model, device)
     75 
---> 76 train_loop(model, epochs=100, lr=0.01, wd=0.00001)

<ipython-input-3281-888e52d4559c> in train_loop(model, epochs, lr, wd)
     46     optim = get_optimizer(model, lr = lr, wd = wd)
     47     for epoch in range(epochs):
---> 48         loss,pred = train_model(model, optim, train_dl)
     49         if (epoch+1) % 5 ==0:
     50             print(f'epoch : {epoch+1},training loss : {loss}, output : {output}')

<ipython-input-3281-888e52d4559c> in train_model(model, optim, train_dl)
     15 
     16 
---> 17         output = model(cat, cont)
     18         _,pred = torch.max(output,1)
     19 

~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

<ipython-input-3280-681fc4d5712d> in forward(self, x_cat, x_cont)
     30 
     31 
---> 32         x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
     33         x = torch.cat(x, 1)
     34 

<ipython-input-3280-681fc4d5712d> in <listcomp>(.0)
     30 
     31 
---> 32         x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
     33         x = torch.cat(x, 1)
     34 

~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    122 
    123     def forward(self, input: Tensor) -> Tensor:
--> 124         return F.embedding(
    125             input, self.weight, self.padding_idx, self.max_norm,
    126             self.norm_type, self.scale_grad_by_freq, self.sparse)

~/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1812         # remove once script supports set_grad_enabled
   1813         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1814     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1815 
   1816 

IndexError: index out of range in self