from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed in recent versions
from torch.utils.data import DataLoader, Dataset
import torch
# Load dataset
dataset = load_dataset("mxronga/nvidia_steer_yo", split='train')
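# Subsample 100 examples for a quick run, then hold out 10% as a test split.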
dataset = dataset.select(range(100))
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
test_dataset = dataset['test']
# Load tokenizer and model
repo_name = "BeardedMonster/SabiYarn-125M"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
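# Assumption: the SabiYarn tokenizer may not define a pad token, which would make
# padding="max_length" below fail or reuse an arbitrary id. Fall back to the EOS
# token as padding; drop this guard if the tokenizer already ships with one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token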
# Tokenize dataset
def preprocess_and_tokenize_dataset(examples):
    formatted_inputs = [
        f"<prompt> {inp} <response> {resp} |end_of_text||end_of_text||end_of_text|"
        for inp, resp in zip(examples['input'], examples['response'])
    ]
    tokenized_data = tokenizer(formatted_inputs, truncation=True, padding="max_length", max_length=1024)
    # Use the input ids as labels, masking pad positions with -100 so they are
    # ignored by the cross-entropy loss.
    labels = tokenized_data['input_ids'].copy()
    tokenized_data['labels'] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels
    ]
    return tokenized_data
tokenized_datasets = dataset.map(preprocess_and_tokenize_dataset, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['input', 'response', 'helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'])
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = torch.tensor(tokenized_data['input_ids'])
        self.attention_mask = torch.tensor(tokenized_data['attention_mask'])
        self.labels = torch.tensor(tokenized_data['labels'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }
train_dataloader = DataLoader(CustomDataset(train_dataset), batch_size=4, shuffle=True)
test_dataloader = DataLoader(CustomDataset(test_dataset), batch_size=4, shuffle=False)
# Optimizer and Scheduler
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)  # 1e-3 is unusually high for fine-tuning; 5e-5 is a more typical starting point
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
# Training function
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        # The custom SabiYarn forward may not compute a loss even when labels are
        # passed (output.loss comes back as None), so fall back to computing the
        # causal-LM loss from the logits ourselves.
        loss = output.loss
        if loss is None:
            shift_logits = output.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = torch.nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )
        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Average Training Loss: {avg_loss:.4f}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    train(model, train_dataloader, optimizer, lr_scheduler, device)
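# A minimal evaluation sketch so the held-out split created above is actually used.
# It mirrors the training loop (same manual loss fallback) but runs without
# gradients; the function name is an assumption, not part of the original script.
@torch.no_grad()
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        if loss is None:
            shift_logits = output.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = torch.nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )
        total_loss += loss.item()
    print(f"Average Test Loss: {total_loss / len(dataloader):.4f}")

evaluate(model, test_dataloader, device)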