Load multiple batched .npz files (or other huge data files) asynchronously

I have a dataset of about 40 GB, saved as 150 .npz files (batch0.npz to batch150.npz), each containing 64 data points (batch size = 64). I saved 64 data points per file to strike a balance on I/O reads, but I want to train my model with a batch size of 32, with shuffling. How should I do this? Below is the code I have right now.

import numpy as np

# one file's worth of data: 64 samples per array
ai = np.random.randn(64, 70, 128)
aj = np.random.randn(64, 70, 18, 128)
ak = np.random.randn(64, 70, 50, 128)
al = np.random.randn(64, 70, 60, 128)

np.savez("batch0.npz", ai=ai, aj=aj, ak=ak, al=al)
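For reference, loading one of these files back gives an NpzFile that behaves like a dict of arrays keyed by the names passed to savez:

with np.load("batch0.npz") as f:
    print(f.files)        # ['ai', 'aj', 'ak', 'al']
    print(f["ai"].shape)  # (64, 70, 128): 64 samples per file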

I create a dataset over these files as follows:

import os

import torch
from torch.utils.data import Dataset, DataLoader

class load_batch_files(Dataset):
    def __init__(self):
        # one dataset item per .npz file, in a stable sorted order
        self.data_files = sorted(os.listdir("batch_data/"))

    def __getitem__(self, idx):
        # os.listdir returns bare file names, so join the directory back on
        return np.load(os.path.join("batch_data", self.data_files[idx]))

    def __len__(self):
        return len(self.data_files)
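One idea I had is to index half-files instead of whole files, so that every item is already 32 samples. Below is a sketch of what I mean (HalfBatchDataset is a name I made up; it assumes every file really holds exactly 64 samples along the first axis):

class HalfBatchDataset(Dataset):
    """Sketch: treat each 64-sample file as two 32-sample items."""

    def __init__(self, data_dir="batch_data/"):
        self.data_dir = data_dir
        self.data_files = sorted(os.listdir(data_dir))

    def __getitem__(self, idx):
        file_idx, half = divmod(idx, 2)  # which file, first or second half
        path = os.path.join(self.data_dir, self.data_files[file_idx])
        with np.load(path) as f:
            lo = half * 32
            return {k: f[k][lo:lo + 32] for k in ("ai", "aj", "ak", "al")}

    def __len__(self):
        return 2 * len(self.data_files)

I am not sure whether that is the right approach, though. My current collate function is: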

def somecollate(batch):
    # batch is the list of items the DataLoader collected; this keeps only
    # the first one, so each yielded "batch" is one whole 64-sample file
    batch = batch[0]
    # device is assumed to be defined elsewhere, e.g. torch.device("cuda")
    ai = torch.tensor(batch["ai"]).float().to(device)
    aj = torch.tensor(batch["aj"]).float().to(device)
    ak = torch.tensor(batch["ak"]).float().to(device)
    al = torch.tensor(batch["al"]).float().to(device)
    return {"ai": ai, "aj": aj, "ak": ak, "al": al}
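If the half-file idea above is workable, I imagine the collate would shrink to something like the sketch below (half_collate is a made-up name), with .to(device) moved into the training loop so that num_workers can be used, since CUDA tensors should not be created in DataLoader worker processes:

def half_collate(batch):
    # with HalfBatchDataset and batch_size=1, batch is a one-element list
    sample = batch[0]
    return {k: torch.from_numpy(v).float() for k, v in sample.items()}

loader = DataLoader(HalfBatchDataset(), batch_size=1, shuffle=True,
                    num_workers=2, collate_fn=half_collate)

For now, though, this is what I actually run: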

desc_train = load_batch_files()
trainloader = DataLoader(desc_train, batch_size=32, collate_fn=somecollate)

This does not give me batches of 32 data points, and even shuffling does not work. What am I doing wrong?
This is what I have tried: loading-huge-data-functionality