output.loss is None when training the model

I'm fine-tuning BeardedMonster/SabiYarn-125M with a custom training loop. The forward pass returns an output whose loss is None (hidden_states is None as well), even though labels are passed in, so loss.backward() fails. Full script below.
```
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch

# Load dataset
dataset = load_dataset("mxronga/nvidia_steer_yo", split='train')
dataset = dataset.select(range(100))
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Load tokenizer and model
repo_name = "BeardedMonster/SabiYarn-125M"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)

# Tokenize dataset
def preprocess_and_tokenize_dataset(examples):
    formatted_inputs = [
        f"<prompt> {inp} <response> {resp} |end_of_text||end_of_text||end_of_text|"
        for inp, resp in zip(examples['input'], examples['response'])
    ]
    tokenized_data = tokenizer(formatted_inputs, truncation=True, padding="max_length", max_length=1024)
    labels = tokenized_data['input_ids'].copy()
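    # mark padding positions with -100 so the cross-entropy loss ignores them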
    tokenized_data['labels'] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels
    ]
    return tokenized_data

tokenized_datasets = dataset.map(preprocess_and_tokenize_dataset, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['input', 'response', 'helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'])

train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = torch.tensor(tokenized_data['input_ids'])
        self.attention_mask = torch.tensor(tokenized_data['attention_mask'])
        self.labels = torch.tensor(tokenized_data['labels'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataloader = DataLoader(CustomDataset(train_dataset), batch_size=4, shuffle=True)
test_dataloader = DataLoader(CustomDataset(test_dataset), batch_size=4, shuffle=False)

# Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)  # torch's AdamW (transformers.AdamW is deprecated)
num_training_steps = len(train_dataloader) * 3

lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training function
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss  # loss is None here; output.hidden_states is also None
        
        # everything from here on breaks because loss is None
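        # Note: for the standard Hugging Face *ForCausalLM classes, passing `labels`
        # makes forward() compute and return the loss; this model's remote code
        # apparently does not, so output.loss stays None. A quick way to see what
        # the forward pass actually returns (assuming it is a ModelOutput-like object):
        # print(output.keys() if hasattr(output, "keys") else type(output))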
        
        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Average Training Loss: {avg_loss}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    train(model, train_dataloader, optimizer, lr_scheduler, device)
```
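If the remote code never computes a loss from `labels`, one workaround would be to compute the standard next-token cross-entropy directly from `output.logits`. This is only a minimal sketch, assuming the forward pass still returns logits of shape `(batch, seq_len, vocab_size)`; the shift-by-one and `ignore_index=-100` mirror what the stock Hugging Face causal-LM classes do internally:

```
import torch.nn.functional as F

def causal_lm_loss(logits, labels):
    # Predict token t+1 from tokens up to t; positions labelled -100 (padding) are ignored.
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()
    return F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        ignore_index=-100,
    )

# Inside the training loop, instead of relying on output.loss:
#   output = model(input_ids, attention_mask=attention_mask)
#   loss = causal_lm_loss(output.logits, labels)
```

Even with a workaround like that, I'd still like to understand why output.loss (and hidden_states) comes back as None when labels are passed to this model.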