Scaler.update() - AssertionError: No inf checks were recorded prior to update

Hi ,
I am new to PyTorch and am trying to implement a ViT on spectrograms of raw audio. My training input consists of almost 1M tensors of shape [1,80,128], and I am exploring AMP to speed up my training on a V100 (16GB).

My training loop is as below

# AMP training loop: the forward pass and loss run under autocast; the backward
# pass and the optimiser step go through the GradScaler.
optimiser = optim.Adam(model.parameters(), lr=config_pytorch.lr)
scaler = torch.cuda.amp.GradScaler(enabled=True)
for e in range(config_pytorch.epochs):
    for idx, train_bat in enumerate(train_dl):
        # NOTE(review): x and y are not defined in this excerpt; presumably they
        # are unpacked from train_bat -- confirm against the full training code.
        with autocast(enabled=True):
            y_pred = model(x).float()
            loss = criterion(y_pred, y.float())
        # Backward is called outside the autocast context: only the forward
        # pass and the loss computation need to run under autocast.
        scaler.scale(loss).backward()
        train_loss += loss.detach().item()
        # scaler.step() must run before scaler.update(); update() asserts that
        # inf checks were recorded by a preceding step().
        scaler.step(optimiser)
        scaler.update()
        optimiser.zero_grad()

I print the loss at each step to check its value: the losses are very small (~1e-5), and after a few steps the loss becomes 0.
The code then fails with the following error: AssertionError: No inf checks were recorded prior to update.

The entire stack trace is as below.

AssertionError                            Traceback (most recent call last)
/tmp/ipykernel_972350/3829185638.py in <module>
----> 1 model = train_model_ast(train_dl , val_dl )

/tmp/ipykernel_972350/3546603516.py in train_model_ast(train_dl, val_dl, model)
    130             bat_duration = bat_finish_time - start_time
    131             print("&&&& BATCH TRAIN DURATION = " + str(bat_duration/60))
--> 132             scaler.update()
    133             #removing all instances of 999
    134 

/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py in update(self, new_scale)
    384                           for found_inf in state["found_inf_per_device"].values()]
    385 
--> 386             assert len(found_infs) > 0, "No inf checks were recorded prior to update."
    387 
    388             found_inf_combined = found_infs[0]

AssertionError: No inf checks were recorded prior to update.

The code, however, runs without any issues if I don’t use AMP. I would appreciate it if anyone could provide any pointers.

This error is usually raised if scaler.step(optimizer) was skipped, as seen here:

# Minimal reproduction: a scaled backward pass followed directly by
# scaler.update(), with no scaler.step(optimizer) call in between.
model = nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()
loss_fn = nn.CrossEntropyLoss()
target = torch.randint(0, 10, (1,)).cuda()

optimizer.zero_grad()
# Forward pass and loss computation run under autocast.
with torch.cuda.amp.autocast():
    output = model(torch.randn(1, 10).cuda())
    loss = loss_fn(output, target)

scaler.scale(loss).backward()
# The optimizer step is deliberately left commented out to trigger the error.
#scaler.step(optimizer)
scaler.update()
# > AssertionError: No inf checks were recorded prior to update.

Could you post a minimal executable code snippet reproducing your issue?

Hi @ptrblck ,
Many thanks for taking a look. Here is my dummy code .

  1. Create a dataframe with file names and labels
import pandas as pd
# Toy metadata: one [filename, label] row per audio file, where each label is
# stored as a 0-dim tensor.
data = [
    [name, torch.tensor(flag)]
    for name, flag in (('file1', 0), ('file2', 1), ('file3', 0))
]

# Assemble the pandas DataFrame that the dataset class reads from.
df = pd.DataFrame(data, columns=['filename', 'label'])
  • Create a dataset class
class SoundDS(Dataset):
    """Dataset that yields ``(filename, label)`` pairs from a DataFrame.

    The frame must provide ``filename`` and ``label`` columns. Items are
    addressed by row position, so the frame's index labels do not matter.
    """

    # NOTE(review): __init__ defaults ``sr`` to config_DK_AST.rate while
    # __getitem__ defaults it to config_DK_AST.sr -- confirm which is intended.
    def __init__(self, df, max_dim=None, sr=config_DK_AST.rate):
        """Store the frame and sample rate, and print the number of rows.

        Args:
            df: DataFrame with ``filename`` and ``label`` columns.
            max_dim: Accepted for interface compatibility; not used here.
            sr: Sample rate stored on the instance as ``self.sr``.
        """
        self.df = df
        self.sr = sr
        self.channel = 1
        print("len = " + str(len(self.df)))

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx, win_size=config_DK_AST.win_size,
                    step_size=config_DK_AST.step_size,
                    min_duration=config_DK_AST.min_duration,
                    sr=config_DK_AST.sr):
        """Return the ``(filename, label)`` pair stored at row position ``idx``.

        Only ``idx`` is used; the remaining parameters are accepted for
        interface compatibility and are not read by this method.
        """
        # Look rows up by position rather than by index label: ``idx`` is a
        # position in range(len(self)), which matches the index labels only
        # when the frame has a default RangeIndex.
        filename = self.df['filename'].iloc[idx]
        label = self.df['label'].iloc[idx]
        return (filename, label)
    
  • Create dataloader
# Wrap the frame in the dataset, then batch it with a shuffling, multi-worker
# DataLoader that pins host memory.
train_obj = SoundDS(df)
train_dl = torch.utils.data.DataLoader(
    train_obj,
    batch_size=128,
    shuffle=True,
    pin_memory=True,
    num_workers=8,
)
  • Define a dummy pre-processing function that converts audio into a variable-length list of tensors (the number of items returned depends on the audio duration)
def pre_process(wav_file, label, max_chunk=20):
    """Return a random-length list of dummy ``(features, label)`` tuples.

    Stand-in for real audio pre-processing: ``wav_file`` is accepted but never
    read. Each entry pairs ``list(torch.randn(1, 80, 128))`` -- a one-element
    list holding an ``(80, 128)`` tensor -- with ``label`` unchanged.

    Args:
        wav_file: Identifier of the audio file (unused by this dummy).
        label: Label attached to every generated chunk.
        max_chunk: Exclusive upper bound on the number of chunks. It must be
            greater than 1, because the count is drawn from ``[1, max_chunk)``.

    Returns:
        A list of between 1 and ``max_chunk - 1`` ``(features, label)`` tuples.
    """
    # Draw the chunk count as a plain int. Iterating over the randint tensor
    # itself would loop over its single row and always produce one chunk.
    num_chunks = int(torch.randint(1, max_chunk, size=(1,)).item())
    return [(list(torch.randn(1, 80, 128)), label) for _ in range(num_chunks)]
def train_model_dummy(train_dl, val_dl=None, model=None):
    """Train ``model`` for 10 epochs on dummy spectrograms using AMP.

    Each batch from ``train_dl`` is read as ``(filenames, labels)``. Every file
    is expanded by ``pre_process`` into a list of ``(features, label)`` chunks,
    and one optimiser step is taken per chunk.

    Args:
        train_dl: Iterable of batches whose first two elements are the
            filenames and the labels.
        val_dl: Accepted for interface compatibility; not used.
        model: Model to train. A fresh ``ASTModel()`` is created when omitted,
            so separate calls never share one default instance.

    Returns:
        The model, moved to the selected device.
    """
    if model is None:
        model = ASTModel()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Mixed precision and gradient scaling are only enabled on CUDA; on the
    # CPU fallback both are switched off explicitly.
    use_amp = device.type == 'cuda'

    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimiser = optim.Adam(model.parameters(), lr=config_pytorch.lr)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for _epoch in range(10):
        model.train()
        for train_bat in train_dl:
            file_bat, label_bat = train_bat[0], train_bat[1]
            for filename, label in zip(file_bat, label_bat):
                spec_list = pre_process(filename, label)
                # The target is the same for every chunk of this file, so it is
                # built once. as_tensor reuses ``label`` when it is already a
                # tensor instead of copy-constructing it.
                y = torch.as_tensor(label).reshape(-1, 1).to(device)
                for x_temp, _ in spec_list:
                    # Take the first feature tensor of the chunk and add a
                    # leading dimension before feeding the model.
                    x = x_temp[0].unsqueeze(dim=0).to(device)
                    with autocast(enabled=use_amp):
                        y_pred = model(x)
                        loss = criterion(y_pred, y.float())
                    scaler.scale(loss).backward()
                    # step() must precede update(): update() asserts that inf
                    # checks were recorded by a preceding step().
                    scaler.step(optimiser)
                    scaler.update()
                    optimiser.zero_grad()

    return model

  • call the model
# Run the dummy AMP training loop on the toy dataloader and keep the returned model.
tr_model = train_model_dummy(train_dl)

Interestingly, though, the error did not come up this time. I wonder what is going on when I train with the “real” data.