After training model when i test model in batch using shuffle=False give me good score , when i use the same model and same test data using shuffle=True give me bad score , i am confused why it is so?
dataset = pd.read_csv('Churn_Modelling.csv')
I shuffle the data before splitting data into train/test
from sklearn.utils import shuffle
data = shuffle(data)
data.reset_index(inplace=True, drop=True)
X = data[['Age','Tenure','Geography','Balance','EstimatedSalary','Gender','NumOfProducts','CreditScore','HasCrCard','IsActiveMember']]
Y = data['Exited']
I am embedding following categorical variables
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
for col in categorical_columns:
X.loc[:,col] = X.loc[:,col].astype('category')
X['Geography'] = LabelEncoder().fit_transform(X['Geography'])
X['Gender'] = LabelEncoder().fit_transform(X['Gender'])
X['HasCrCard'] = LabelEncoder().fit_transform(X['HasCrCard'])
X['IsActiveMember'] = LabelEncoder().fit_transform(X['IsActiveMember'])
After encoding label encoder above , these columns converted into integer - hence re converting them to category
for col in categorical_columns:
X.loc[:,col] = X.loc[:,col].astype('category')
X.dtypes
Get embedding categorical columns
embedded_cols = {n: len(col.cat.categories) for n,col in X[categorical_columns].items()}
embedded_cols
{'Geography': 3, 'Gender': 2, 'HasCrCard': 2, 'IsActiveMember': 2}
Splitting train/test data
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=0)
Following function will return categorical , numerical columns separately , reason for this i want to embed categorical column separately and then combined with numerical features while training
class ShelterOutcomeDataset(Dataset):
def __init__(self, X, Y, embedded_col_names):
Xdata = X.copy()
self.X1 = Xdata.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns
self.X2 = Xdata.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns
self.y = Y.copy().values.astype(np.int64)
def __len__(self):
return len(self.y)
def __getitem__(self, idx):
return self.X1[idx], self.X2[idx], self.y[idx]
Size of embedding columns
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
embedding_sizes
[(3, 2), (2, 1), (2, 1), (2, 1)]
train_ds = ShelterOutcomeDataset(X_train,y_train ,categorical_columns)
embedded_col_names = embedded_cols.keys()
len(X.columns) - len(embedded_cols) #number of numerical columns
6
Model
class testNet(nn.Module):
def __init__(self, emb_dims, n_cont):
super().__init__()
self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in emb_dims])
no_of_embs = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
self.n_emb, self.n_cont = no_of_embs, n_cont
self.lin1 = nn.Linear(self.n_emb + self.n_cont,200)
self.lin2 = nn.Linear(200, 100)
self.lin3 = nn.Linear(100, 50)
self.lin4 = nn.Linear(50, 2)
self.bn1 = nn.BatchNorm1d(self.n_cont)
self.bn2 = nn.BatchNorm1d(200)
self.bn3 = nn.BatchNorm1d(100)
self.bn4 = nn.BatchNorm1d(50)
self.emb_drop = nn.Dropout(0.4)
self.drops = nn.Dropout()
def forward(self, x_cat, x_cont):
x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
x = torch.cat(x, 1)
x = self.emb_drop(x)
x2 = self.bn1(x_cont)
x = torch.cat([x, x2], 1)
x = F.relu(self.lin1(x))
x = self.drops(x)
x = self.bn2(x)
x = F.relu(self.lin2(x))
x = self.drops(x)
x = self.bn3(x)
x = F.relu(self.lin3(x))
x = self.drops(x)
x = self.bn4(x)
x = F.relu(self.lin4(x))
return x
model = testNet(embedding_sizes,6)
print(model)
testNet(
(embeddings): ModuleList(
(0): Embedding(3, 2)
(1): Embedding(2, 1)
(2): Embedding(2, 1)
(3): Embedding(2, 1)
)
(lin1): Linear(in_features=9, out_features=200, bias=True)
(lin2): Linear(in_features=200, out_features=100, bias=True)
(lin3): Linear(in_features=100, out_features=50, bias=True)
(lin4): Linear(in_features=50, out_features=2, bias=True)
(bn1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(emb_drop): Dropout(p=0.4, inplace=False)
(drops): Dropout(p=0.5, inplace=False)
)
Training
def get_optimizer(model, lr = 0.001, wd = 0.0):
parameters = filter(lambda p: p.requires_grad, model.parameters())
optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
return optim
def init_weights(m):
if type(m) == nn.Linear:
nn.init.xavier_normal_(m.weight)
criterion = nn.CrossEntropyLoss()
def train_model(model, optim, train_dl):
model.train()
total = 0
sum_loss = 0
output = 0
for cat, cont, y in train_dl:
batch = y.shape[0]
output = model(cat, cont)
loss = criterion(output, y)
optim.zero_grad()
loss.backward()
optim.step()
total += batch
sum_loss += batch*(loss.item())
return sum_loss/total,pred
def train_loop(model, epochs, lr=0.01, wd=0.0):
optim = get_optimizer(model, lr = lr, wd = wd)
for epoch in range(epochs):
loss,pred = train_model(model, optim, train_dl)
if (epoch+1) % 50 ==0:
print(f'epoch : {epoch+1},training loss : {loss}')
sampler = class_imbalance_sampler(y_train)
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
model = testNet(embedding_sizes,6)
model.apply(init_weights)
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
train_loop(model, epochs=200, lr=0.001, wd=0.00001)
Validation When Shuffle=False- Sores are below
valid_ds = ShelterOutcomeDataset(X_val,y_val , categorical_columns)
batch_size = 100
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=False)<-------
valid_dl = DeviceDataLoader(valid_dl, device)
preds = []
with torch.no_grad():
for cat, cont,y in valid_dl:
model.eval()<-----------------------------------------------------
output = model(cat, cont)
_,pred = torch.max(output,1)
preds.append(pred.cpu().detach().numpy())
final_preds = [item for sublist in preds for item in sublist]
print(classification_report(y_val, np.array(final_preds)))
precision recall f1-score support
0 0.86 0.95 0.90 1610
1 0.63 0.37 0.47 390
accuracy 0.83 2000
macro avg 0.74 0.66 0.69 2000
weighted avg 0.82 0.83 0.82 2000
valid_ds = ShelterOutcomeDataset(X_val,y_val , categorical_columns)
batch_size = 100
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)<--------
valid_dl = DeviceDataLoader(valid_dl, device)
preds = []
with torch.no_grad():
for cat, cont,y in valid_dl:
model.eval()<----------------------------------------------------
output = model(cat, cont)
_,pred = torch.max(output,1)
preds.append(pred.cpu().detach().numpy())
final_preds = [item for sublist in preds for item in sublist]
print(classification_report(y_val, np.array(final_preds)))
precision recall f1-score support
0 0.79 0.87 0.83 1576
1 0.23 0.14 0.17 424
accuracy 0.72 2000
macro avg 0.51 0.51 0.50 2000
weighted avg 0.67 0.72 0.69 2000