even my indices is lower than embedding_dim-1 within a batch , i am still getting IndexError: index out of range in self , here is my data and code. In each batch for Age category size is 20 and input embedding size is 70 , dont know why indexing error is throwing
data = pd.read_csv('Churn_Modelling.csv')
print("Shape:", data.shape)
data.head()
X_train = data[['Age','Balance']]
y_train = pd.DataFrame(data['Exited'])
X_train
Shape: (10000, 14)
Age Balance
---- -------
0 42 0.00
1 41 83807.86
2 42 159660.80
3 39 0.00
4 43 125510.82
10000 rows × 2 columns
y_train
Exited
-------
0 1
1 0
2 1
3 0
4 0
10000 rows × 1 columns
features = ['Age']
for col in features:
X_train.loc[:,col] = X_train.loc[:,col].astype('category')
X_train.dtypes
Age category
Balance float64
dtype: object
embedded_cols = {n: len(col.cat.categories) for n,col in X_train[features].items()}
embedded_cols
{'Age': 70}
class ShelterOutcomeDataset(Dataset):
def __init__(self, X, Y, embedded_col_names):
X = X.copy()
self.X1 = X.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns
self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns
self.y = Y.copy().values.astype(np.int64)
def __len__(self):
return len(self.y)
def __getitem__(self, idx):
return self.X1[idx], self.X2[idx], self.y[idx]
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
embedding_sizes
[(70, 35)]
train_ds = ShelterOutcomeDataset(X_train,y_train , ['Age'])
class testNet(nn.Module):
def __init__(self, emb_dims, n_cont):
super().__init__()
self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in emb_dims])
no_of_embs = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
self.n_emb, self.n_cont = no_of_embs, n_cont
self.lin1 = nn.Linear(self.n_emb + self.n_cont,6)
self.lin2 = nn.Linear(6, 4)
self.lin3 = nn.Linear(4, 2)
self.bn1 = nn.BatchNorm1d(self.n_cont)
self.bn2 = nn.BatchNorm1d(6)
self.bn3 = nn.BatchNorm1d(4)
self.emb_drop = nn.Dropout(0.6)
self.drops = nn.Dropout(0.3)
def forward(self, x_cat, x_cont):
x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
x = torch.cat(x, 1)
x = self.emb_drop(x)
# batch normalization over continous features
x2 = self.bn1(x_cont)
# concatenate both embedding and continous feature , here 1 means dim
# the dimension over which the tensors are concatenated we are concatenating columns
x = torch.cat([x, x2], 1)
x = F.relu(self.lin1(x))
x = self.drops(x)
x = self.bn2(x)
x = F.relu(self.lin2(x))
x = self.drops(x)
x = self.bn3(x)
x = self.lin3(x)
return x
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
def train_model(model, optim, train_dl):
model.train()
total = 0
sum_loss = 0
for cat, cont, y in train_dl:
batch = y.shape[0]
print(cat.size()) # <--- size of features whihc has to be embeded
y = y.to(torch.float32)
output = model(cat, cont)
_,pred = torch.max(output,1)
loss = criterion(output, y.squeeze(1).long())
optim.zero_grad()
loss.backward()
optim.step()
total += batch
sum_loss += batch*(loss.item())
return sum_loss/total,pred
def train_loop(model, epochs, lr=0.01, wd=0.0):
optim = get_optimizer(model, lr = lr, wd = wd)
for epoch in range(epochs):
loss,pred = train_model(model, optim, train_dl)
if (epoch+1) % 5 ==0:
print(f'epoch : {epoch+1},training loss : {loss}, output : {output}')
batch_size = 20
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
#valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)
train_dl = DeviceDataLoader(train_dl, device)
# valid_dl = DeviceDataLoader(valid_dl, device)
# model = ShelterOutcomeModel(embedding_sizes,0)
model = testNet(embedding_sizes,1)
print(model)
from collections import defaultdict
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
# to_device(model, device)
train_loop(model, epochs=100, lr=0.01, wd=0.00001)
testNet(
(embeddings): ModuleList(
(0): Embedding(70, 35)
)
(lin1): Linear(in_features=36, out_features=6, bias=True)
(lin2): Linear(in_features=6, out_features=4, bias=True)
(lin3): Linear(in_features=4, out_features=2, bias=True)
(bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(emb_drop): Dropout(p=0.6, inplace=False)
(drops): Dropout(p=0.3, inplace=False)
)
torch.Size([20, 1])
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-3281-888e52d4559c> in <module>
74 # to_device(model, device)
75
---> 76 train_loop(model, epochs=100, lr=0.01, wd=0.00001)
<ipython-input-3281-888e52d4559c> in train_loop(model, epochs, lr, wd)
46 optim = get_optimizer(model, lr = lr, wd = wd)
47 for epoch in range(epochs):
---> 48 loss,pred = train_model(model, optim, train_dl)
49 if (epoch+1) % 5 ==0:
50 print(f'epoch : {epoch+1},training loss : {loss}, output : {output}')
<ipython-input-3281-888e52d4559c> in train_model(model, optim, train_dl)
15
16
---> 17 output = model(cat, cont)
18 _,pred = torch.max(output,1)
19
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
<ipython-input-3280-681fc4d5712d> in forward(self, x_cat, x_cont)
30
31
---> 32 x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
33 x = torch.cat(x, 1)
34
<ipython-input-3280-681fc4d5712d> in <listcomp>(.0)
30
31
---> 32 x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
33 x = torch.cat(x, 1)
34
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/sparse.py in forward(self, input)
122
123 def forward(self, input: Tensor) -> Tensor:
--> 124 return F.embedding(
125 input, self.weight, self.padding_idx, self.max_norm,
126 self.norm_type, self.scale_grad_by_freq, self.sparse)
~/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1812 # remove once script supports set_grad_enabled
1813 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1814 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1815
1816
IndexError: index out of range in self