I’m getting the following error:
RuntimeError: CUDA out of memory. Tried to allocate 40.00 MiB (GPU 0; 14.76 GiB total capacity; 12.66 GiB already allocated; 35.75 MiB free; 13.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.
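As far as I understand, the max_split_size_mb hint from the message is applied through the PYTORCH_CUDA_ALLOC_CONF environment variable, set before the first CUDA allocation in the process. A minimal sketch (128 is just an illustrative value, not a tuned one):

import os

# Assumption: the caching allocator reads this setting lazily, so it only needs to be
# set before the first CUDA tensor is allocated in this process.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch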
The error is raised after calling loss.backward() in the code below:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import T5Tokenizer, get_linear_schedule_with_warmup
from sklearn.utils import shuffle  # assumption: shuffle comes from sklearn.utils

# epoch, amount_per_batch, learning_rate, get_data and get_models are defined elsewhere

def train(epochs=epoch, batch_count=amount_per_batch):
    db = get_data()
    test_losses = []
    train_losses = []
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    lowest_loss = 1000000000
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
    nli_model = get_models('s3://chatbots-for-auto-labeling/t5/encoder_plus_logit_layer_plus_softmax/03_28_2023__03_56_46model.tar.gz').to(device)
    optimizer = AdamW(nli_model.parameters(),
                      lr=learning_rate,  # previously 8e-6
                      eps=1e-8)          # args.adam_epsilon - default is 1e-8
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # default value in run_glue.py
                                                num_training_steps=1)

    # Tokenize the whole (shuffled) dataset up front; padding='longest' pads every
    # example to the length of the longest sequence passed in.
    down_sampled_db = shuffle(db)
    count_epochs = 0
    sentences = down_sampled_db['description'].to_list()
    questions = down_sampled_db['keyword'].to_list()
    yes_or_no = down_sampled_db['yes_or_no']
    toks = tokenizer(sentences, questions, padding='longest')
    ds = Dataset.from_dict({
        "x": torch.tensor(toks['input_ids']),
        "mask": torch.tensor(toks['attention_mask']),
        "labels": torch.tensor([0 if i == 'no' else 1 for i in yes_or_no]),
    }).with_format("torch")
    dataloader = DataLoader(ds, batch_size=batch_count)

    while count_epochs != epochs:
        for batch in dataloader:
            nli_model.train()
            x_batch = batch["x"].to(device)
            mask_batch = batch["mask"].to(device)
            labels_batch = batch["labels"].to(device)
            loss = nli_model(x_batch, attention_mask=mask_batch, labels=labels_batch)[0]
            print(loss)
            loss.backward()
            print('completed backwards pass')
            optimizer.step()
            optimizer.zero_grad()

        # Re-shuffle and re-tokenize for the next epoch
        # (note: the dataloader itself is not rebuilt from these tensors)
        down_sampled_db = shuffle(db)
        sentences = down_sampled_db['description'].to_list()
        questions = down_sampled_db['keyword'].to_list()
        yes_or_no = down_sampled_db['yes_or_no']
        labels = torch.tensor([0 if i == 'no' else 1 for i in yes_or_no])
        toks = tokenizer(sentences, questions, padding='longest')
        x = torch.tensor(toks['input_ids'])
        mask = torch.tensor(toks['attention_mask'])

        count_epochs += 1
        print('count_epochs', count_epochs)
        print('lowest loss is: ', lowest_loss)

    return nli_model  # was `return model`, which is not defined inside train()
model = train()
I would have expected the error to surface during the loss.backward() call itself, since that is where the gradients are computed. Is that not the case? Also, is there a way to remedy this? I’m already using a batch size of one, and I can’t really reduce the number of tokens any further (I’m training a transformer) for this job.
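For context, these are the memory-saving changes I’m aware of and could try. This is only a rough sketch reusing the names from the training loop above; it assumes the model returned by get_models() exposes the standard Hugging Face gradient_checkpointing_enable(), and that capping inputs at a max_length of 512 (an arbitrary illustrative value) is acceptable for the task:

# Sketch only; nli_model, tokenizer, sentences, questions, x_batch, mask_batch,
# labels_batch and optimizer refer to the training code above.

# 1) Trade compute for memory: recompute activations during backward instead of storing them.
nli_model.gradient_checkpointing_enable()

# 2) Cap sequence length instead of padding everything to the longest example in the dataset.
toks = tokenizer(sentences, questions,
                 padding='max_length', truncation=True, max_length=512)

# 3) Mixed-precision forward/backward to reduce activation memory.
scaler = torch.cuda.amp.GradScaler()
with torch.cuda.amp.autocast():
    loss = nli_model(x_batch, attention_mask=mask_batch, labels=labels_batch)[0]
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()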