Fine-tune model RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Hi everybody,

I'm trying to fine-tune the t5-small model and I'm getting the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-31-7b6b8391c42e> in <cell line: 1>()
----> 1 trainer.fit(model, data_module)

28 frames
/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    198     # some Python versions print out the first line of a multi-line function
    199     # calls in the traceback and some print out the last line
--> 200     Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201         tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202         allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

I've been trying to debug what is happening, but I don't know what's wrong.

If you need more info let me know.

Regards!

import pandas as pd
import pytorch_lightning as pl
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, is_trainable=True)

class OwnDataset(Dataset):

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        # Encode question + context as the encoder input
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Encode the answer text as the decoder target
        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Replace padding token ids with -100 so they are ignored by the loss
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100

        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer_text=data_row["answer_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten()
        )

class OwnDataModule(pl.LightningDataModule):

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        self.train_dataset = OwnDataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )
        self.test_dataset = OwnDataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

BATCH_SIZE = 8
N_EPOCHS = 6

data_module = OwnDataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

class OwnQAModel(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("training_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

Your issue seems to be related to this one, which also uses an HF model. In both cases it seems that either the computation graph is detached somewhere or the gradient calculation is disabled globally.
The linked topic doesn't have any updates, so you might consider posting this question in the HF discussion board, as it seems to be HF-specific.
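As a quick sanity check you could verify both possibilities before calling trainer.fit. This is only a minimal sketch, assuming model is your OwnQAModel instance and data_module has already been set up:

import torch

# 1) Is autograd disabled globally, e.g. by a stray torch.set_grad_enabled(False)
#    or an enclosing torch.no_grad() / torch.inference_mode() block?
print(torch.is_grad_enabled())  # should print True

# 2) Are all parameters trainable, i.e. was nothing frozen?
frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
print(len(frozen), "frozen parameters")  # should print 0 for full fine-tuning

# 3) Does the loss still carry a grad_fn, i.e. is the graph intact?
batch = next(iter(data_module.train_dataloader()))
loss, _ = model(batch["input_ids"], batch["attention_mask"], batch["labels"])
print(loss.requires_grad, loss.grad_fn)  # expect True and a non-None grad_fn

If the last check already prints False / None, the graph is detached inside the model or dataset code; if it looks fine in isolation, something around the training loop (e.g. an eval or no-grad context) is disabling gradients.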

Thank you, I will post an update if I find a solution.