model.eval() gives different results with shuffle=True and shuffle=False

After training, when I evaluate the model in batches with shuffle=False I get a good score, but when I evaluate the same model on the same test data with shuffle=True I get a bad score. I am confused about why this happens.

# Load the churn data. The frame is bound to `data` because every later step
# (shuffling, feature selection, target selection) reads `data`.
data = pd.read_csv('Churn_Modelling.csv')

I shuffle the data before splitting data into train/test

from sklearn.utils import shuffle

# Shuffle the rows, then reset the index in place; drop=True discards the old
# index instead of keeping it as a column.
data = shuffle(data)
data.reset_index(inplace=True, drop=True)

# X holds the feature columns (categorical and numeric together); Y is the target.
X = data[['Age','Tenure','Geography','Balance','EstimatedSalary','Gender','NumOfProducts','CreditScore','HasCrCard','IsActiveMember']]
Y = data['Exited']

I am embedding following categorical variables

# Columns that are fed to embedding layers instead of being used as numeric inputs.
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
# NOTE(review): this cast looks redundant — every one of these columns is
# overwritten with LabelEncoder output just below and cast to 'category' again
# after that.
for col in categorical_columns:
    X.loc[:,col] = X.loc[:,col].astype('category')

# Replace each categorical column with the output of a freshly fitted LabelEncoder.
X['Geography'] = LabelEncoder().fit_transform(X['Geography'])
X['Gender']    = LabelEncoder().fit_transform(X['Gender'])
X['HasCrCard'] = LabelEncoder().fit_transform(X['HasCrCard'])
X['IsActiveMember'] = LabelEncoder().fit_transform(X['IsActiveMember'])

After label encoding, these columns are integers, so I convert them back to the category dtype:

# Cast the label-encoded columns back to the 'category' dtype; the category
# counts are read from `.cat.categories` in the next step.
for col in categorical_columns:
    X.loc[:,col] = X.loc[:,col].astype('category')
# Notebook display of the resulting column dtypes.
X.dtypes

Get embedding categorical columns

# Map each categorical column name to its number of distinct categories.
embedded_cols = {n: len(col.cat.categories) for n,col in X[categorical_columns].items()}
# Notebook display of the mapping.
embedded_cols
{'Geography': 3, 'Gender': 2, 'HasCrCard': 2, 'IsActiveMember': 2}

Splitting train/test data

# Hold out 20% of the rows for validation; random_state=0 makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=0)

The following dataset class returns the categorical and numerical columns separately, because I want to embed the categorical columns on their own and then combine them with the numerical features during training.

class ShelterOutcomeDataset(Dataset):
    """Tabular dataset that serves categorical and numeric features separately.

    Attributes:
        X1: int64 array holding the columns named in ``embedded_col_names``.
        X2: float32 array holding every other column of ``X``.
        y: int64 array holding the targets.
    """

    def __init__(self, X, Y, embedded_col_names):
        """Split ``X`` into categorical and numeric arrays and store the targets.

        Args:
            X: DataFrame containing all feature columns.
            Y: Targets, one per row of ``X``.
            embedded_col_names: Names of the columns in ``X`` to treat as
                categorical; all remaining columns are treated as numeric.
        """
        features = X.copy()
        categorical = features.loc[:, embedded_col_names]
        numerical = features.drop(columns=embedded_col_names)
        # astype() allocates new arrays, so the dataset never aliases the inputs.
        self.X1 = categorical.values.astype(np.int64)
        self.X2 = numerical.values.astype(np.float32)
        self.y = Y.values.astype(np.int64)

    def __len__(self):
        """Return the number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return ``(categorical, numeric, target)`` for the sample(s) at ``idx``."""
        sample = (self.X1[idx], self.X2[idx], self.y[idx])
        return sample

Size of embedding columns

# One (num_categories, embedding_dim) pair per categorical column, where
# embedding_dim = min(50, (num_categories + 1) // 2).
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
# Notebook display of the pairs.
embedding_sizes
[(3, 2), (2, 1), (2, 1), (2, 1)]
# Training dataset; the categorical columns are split out from the remaining ones.
train_ds = ShelterOutcomeDataset(X_train,y_train ,categorical_columns)

# Names of the embedded (categorical) columns, taken from the keys of embedded_cols.
embedded_col_names = embedded_cols.keys()
len(X.columns) - len(embedded_cols) #number of numerical columns
6

Model

class testNet(nn.Module):
    """Feed-forward classifier over embedded categorical and numeric features.

    Every categorical column gets its own ``nn.Embedding``. The embeddings are
    concatenated, passed through dropout, joined with the batch-normalised
    numeric features and fed through four linear layers. The network returns
    two raw (unnormalised) class scores per sample.

    Args:
        emb_dims: Iterable of ``(num_categories, embedding_dim)`` pairs, one per
            categorical column, in the column order of ``x_cat``.
        n_cont: Number of numeric input features.
    """

    def __init__(self, emb_dims, n_cont):
        super().__init__()

        self.embeddings = nn.ModuleList(
            [nn.Embedding(categories, size) for categories, size in emb_dims]
        )
        # Width of all embeddings once concatenated.
        no_of_embs = sum(e.embedding_dim for e in self.embeddings)

        self.n_emb, self.n_cont = no_of_embs, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 100)
        self.lin3 = nn.Linear(100, 50)
        self.lin4 = nn.Linear(50, 2)

        # bn1 normalises the numeric inputs; bn2..bn4 normalise hidden activations.
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(100)
        self.bn4 = nn.BatchNorm1d(50)

        self.emb_drop = nn.Dropout(0.4)
        self.drops = nn.Dropout()

    def forward(self, x_cat, x_cont):
        """Compute class logits for a batch.

        Args:
            x_cat: Integer tensor of category indices; column ``i`` is looked up
                in ``self.embeddings[i]``.
            x_cont: Float tensor with ``n_cont`` numeric features per row.

        Returns:
            Tensor with one row per sample and two columns of raw logits.
        """
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded, 1)

        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = F.relu(self.lin3(x))
        x = self.drops(x)
        x = self.bn4(x)
        # No activation on the output layer: nn.CrossEntropyLoss expects raw
        # logits, and a ReLU here would clamp every negative logit to zero.
        return self.lin4(x)
# NOTE(review): with n_cont=6 and embedding widths 2+1+1+1, lin1 takes 11 inputs
# and bn1 normalises 6 features. The repr pasted below (lin1 in_features=9,
# bn1 BatchNorm1d(4)) corresponds to n_cont=4, so it appears to come from an
# earlier run.
model = testNet(embedding_sizes,6)
print(model)
testNet(
  (embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
  )
  (lin1): Linear(in_features=9, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=50, bias=True)
  (lin4): Linear(in_features=50, out_features=2, bias=True)
  (bn1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.4, inplace=False)
  (drops): Dropout(p=0.5, inplace=False)
)

Training

def get_optimizer(model, lr = 0.001, wd = 0.0):
    """Build an Adam optimizer over the trainable parameters of ``model``.

    Args:
        model: Module whose parameters with ``requires_grad`` set are optimized.
        lr: Learning rate passed to Adam.
        wd: Weight decay passed to Adam.

    Returns:
        The configured ``torch_optim.Adam`` instance.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch_optim.Adam(trainable, lr=lr, weight_decay=wd)

def init_weights(m):
    """Re-initialise the weight of an ``nn.Linear`` module with Xavier-normal values.

    Modules whose exact type is not ``nn.Linear`` are left untouched, and the
    bias is never modified.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_normal_(m.weight)

# Loss used by train_model; it consumes raw class logits and integer targets.
criterion = nn.CrossEntropyLoss()


def train_model(model, optim, train_dl):
    """Run one training epoch over ``train_dl``.

    Args:
        model: Module called as ``model(cat, cont)``; put into training mode here.
        optim: Optimizer that updates the parameters of ``model``.
        train_dl: Iterable yielding ``(cat, cont, y)`` batches.

    Returns:
        A ``(mean_loss, output)`` tuple: the per-sample mean loss over the epoch
        and the model output for the last batch.

    Raises:
        ZeroDivisionError: If ``train_dl`` yields no samples.
    """
    model.train()
    total = 0
    sum_loss = 0
    output = 0

    for cat, cont, y in train_dl:
        batch = y.shape[0]
        output = model(cat, cont)
        loss = criterion(output, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        # Weight by batch size so a smaller final batch does not skew the mean.
        sum_loss += batch * loss.item()
    # The original returned the undefined name `pred`, which raised NameError.
    return sum_loss / total, output

def train_loop(model, epochs, lr=0.01, wd=0.0):
    """Train ``model`` for ``epochs`` epochs, printing the loss every 50th epoch.

    A new optimizer is created through ``get_optimizer`` and each epoch is run
    by ``train_model`` over the module-level ``train_dl`` loader.

    Args:
        model: The module to train.
        epochs: Number of passes over ``train_dl``.
        lr: Learning rate forwarded to ``get_optimizer``.
        wd: Weight decay forwarded to ``get_optimizer``.
    """
    optimizer = get_optimizer(model, lr=lr, wd=wd)
    for epoch in range(epochs):
        loss, _ = train_model(model, optimizer, train_dl)
        # Only report on epochs 50, 100, 150, ...
        if (epoch + 1) % 50:
            continue
        print(f'epoch : {epoch+1},training loss : {loss}')
            
# NOTE(review): `sampler` is not passed to the DataLoader below, so it has no
# effect on how the training batches are drawn.
sampler = class_imbalance_sampler(y_train)

batch_size = 1000
# Training loader over train_ds with shuffle=True.
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)

# Fresh model; model.apply passes the submodules to init_weights.
model = testNet(embedding_sizes,6)
model.apply(init_weights)

# NOTE(review): `opt` is never used — train_loop builds its own Adam optimizer
# through get_optimizer, so the settings in effect are lr=0.001 and wd=0.00001.
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
train_loop(model, epochs=200, lr=0.001, wd=0.00001)

Validation when shuffle=False — scores are below

# The validation loader is built with shuffle=False, so the concatenated
# predictions stay in the same row order as y_val.
valid_ds = ShelterOutcomeDataset(X_val,y_val , categorical_columns)
batch_size = 100
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=False)<-------
valid_dl = DeviceDataLoader(valid_dl, device)

preds = []
with torch.no_grad():
    for cat, cont,y in valid_dl:
        # eval() is re-applied on every batch; calling it once before the loop
        # would be enough.
        model.eval()<-----------------------------------------------------
        output = model(cat, cont)
        # Predicted class = index of the largest output along dim 1.
        _,pred = torch.max(output,1)
        preds.append(pred.cpu().detach().numpy())
# Flatten the per-batch arrays into one flat list of predictions.
final_preds = [item for sublist in preds for item in sublist] 
print(classification_report(y_val, np.array(final_preds)))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      1610
           1       0.63      0.37      0.47       390

    accuracy                           0.83      2000
   macro avg       0.74      0.66      0.69      2000
weighted avg       0.82      0.83      0.82      2000
# NOTE(review): this is the source of the bad score. With shuffle=True the loader
# yields the rows in a random order, but the report below still compares the
# predictions against y_val in its original order. The `y` batches yielded by
# the loader are the targets that match `pred`; they are not collected here.
valid_ds = ShelterOutcomeDataset(X_val,y_val , categorical_columns)
batch_size = 100
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)<--------
valid_dl = DeviceDataLoader(valid_dl, device)

preds = []
with torch.no_grad():
    
    for cat, cont,y in valid_dl:
        model.eval()<----------------------------------------------------
        output = model(cat, cont)
        # Predicted class = index of the largest output along dim 1.
        _,pred = torch.max(output,1)
        preds.append(pred.cpu().detach().numpy())
# Flatten the per-batch arrays into one flat list of predictions.
final_preds = [item for sublist in preds for item in sublist]  

print(classification_report(y_val, np.array(final_preds)))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1576
           1       0.23      0.14      0.17       424

    accuracy                           0.72      2000
   macro avg       0.51      0.51      0.50      2000
weighted avg       0.67      0.72      0.69      2000

I cannot reproduce the issue after calling model.eval(): the running stats are not updated (as mentioned here), nor does the output show any difference when shuffling the inputs, using this code snippet:

# Reproduction attempt: in eval mode, check whether the BatchNorm running stats
# change and whether shuffling the rows of a batch changes the per-row outputs.
embedding_sizes = [(3, 2), (2, 1), (2, 1), (2, 1)]
model = testNet(embedding_sizes,6)
print(model)
model.eval()

# Print the running stats of every module whose name contains 'bn', before any
# forward pass.
for name, module in model.named_modules():
    if 'bn' in name:
        print(module.running_mean)
        print(module.running_var)

# first pass: reference output for the batch in its original order
x_cat = torch.randint(0, 2, (16, 4))
x_cont = torch.randn(16, 6)
out_ref = model(x_cat, x_cont)

# print the running stats again to compare with the values printed before the
# forward pass
for name, module in model.named_modules():
    if 'bn' in name:
        print(module.running_mean)
        print(module.running_var)

# shuffle data: same rows, random order
idx = torch.randperm(x_cat.size(0))
out = model(x_cat[idx], x_cont[idx])

# compare to reference output, reordered with the same permutation so that
# each row is compared with itself
print((out - out_ref[idx]).abs().max())
> tensor(0., grad_fn=<MaxBackward1>)

# check running stats
# running stats are not updated
for name, module in model.named_modules():
    if 'bn' in name:
        print(module.running_mean)
        print(module.running_var)
1 Like

@ptrblck in my case the running stats are not being updated either; please see the code below. I am not sure why shuffle=True and shuffle=False give different scores.

@ptrblck thanks for pointing me in the right direction. Using your code, there were no changes in the running stats in my code either, and everything was fine except for shuffling valid_dl: because I shuffled valid_dl, the targets were shuffled as well, but I then compared the resulting predictions with the non-shuffled y_val from train_test_split.

I am still not clear what the running stats are for. What do they mean in layman's terms?

The running stats of batchnorm layers are updated during training using the training batch stats and the formula mentioned in the docs. They are then used during evaluation, which makes inference independent of the batch size. The BatchNorm paper explains this in more detail.

1 Like