I am training an NLP-related model; however, it takes too long to train a single epoch.
I found something weird. When I trained this model with a batch size of 16, it trained successfully. However, when I trained it with a batch size of 32, it failed with an out-of-memory error on the GPU. By comparison, when I sample just 1000 examples from the whole dataset and train with a batch size of 32, it trains successfully. I want to know why this happens. When training with a batch size of 32 on the whole dataset, the model initially trains batch by batch, but after several batches the out-of-memory error occurs. How should I solve this problem?
When I searched for a solution on the internet, I found that I could perhaps use mixed precision to solve this problem. I tried it and it worked, but this is not the solution I want. I want to know why the out-of-memory error happened when I trained the model. More specifically, I want to know why I can train on 1000 samples with a batch size of 32 or even 64, while a batch size of 32 does not work when the model is trained on the whole dataset. Does the memory usage depend on the whole dataset rather than on the batch size? Can somebody give me some ideas?
Can you share any of the code to see what the differences are between your different training runs? It could be something simple like you’re accidentally accumulating memory during training or holding too much data in your GPU memory.
1. The following is the code for my own customized Dataset.
import torch
import pandas as pd
from torch.utils.data import Dataset
from torchtext.vocab import vocab,Vocab
from collections import Counter, OrderedDict
import torch.nn.functional as F
# Directory in which the cached label vocabulary is stored.
save_d__vocab_path = "../Data_full/"
# Default CSV file read by the dataset class.
new_DataPath = "../Data_full/fastq.csv"
# Alphabet used to build the nucleotide vocabulary.
Deoxynucleotide_ACID_vocab = ["A", "T", "C", "G"]
def _map_sequence_to_int(seq_item,vocab):
"""
Args:
seq_item: str
vocab : Vocab
Return: list of numbericial representation of the tokens in the sequence
"""
sequence = [c for c in seq_item]
sequence.pop() # remove the "\n" in the end
return vocab.lookup_indices(sequence)
def _map_labels_to_int(label_item,vocab):
"""
Args:
label_item : str
Return: int
"""
return vocab.__getitem__(label_item)
# define the Fastq Dataset class
class FastqDataset(Dataset):
    """Map-style dataset over a header-less CSV file.

    Column 0 holds the raw sequence string and column 1 the label. Samples are
    encoded lazily, one at a time, in ``__getitem__``.
    """

    def __init__(self, path_to_fastq_csv=new_DataPath, nrows=None) -> None:
        """Read the CSV into memory and build the vocabularies.

        Args:
            path_to_fastq_csv: path of the CSV file to read.
            nrows: optional number of rows to read (forwarded to
                ``pd.read_csv``); ``None`` reads the whole file.
        """
        super().__init__()
        self.path_to_fastq_csv = path_to_fastq_csv
        # The raw table is kept on the instance; encoding happens per sample.
        self.raw_data = pd.read_csv(path_to_fastq_csv, header=None, nrows=nrows)
        self.make_vocab()

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.raw_data)

    def __getitem__(self, index):
        """Encode one sample.

        Returns:
            A tuple ``(one_hot_feature, label, length)`` where
            ``one_hot_feature`` is ``F.one_hot`` of the token indices with 4
            classes, ``label`` is a scalar tensor with the label index and
            ``length`` is the number of encoded tokens (a Python int).
            Any layout change needed by conv1d is left to the collate function.
        """
        raw_feature = self.raw_data.iloc[index, 0]
        raw_label = self.raw_data.iloc[index, 1]

        token_ids = _map_sequence_to_int(raw_feature, self.Deoxynucleotide_vocab)
        label_id = _map_labels_to_int(raw_label, self.label_vocab)

        # Derive the length from the encoded tokens so it always agrees with
        # the feature tensor instead of assuming a trailing newline.
        length = len(token_ids)
        # Explicit dtype: F.one_hot requires an integer tensor, also when the
        # token list is empty.
        feature = torch.tensor(token_ids, dtype=torch.long)
        label = torch.tensor(label_id)
        return F.one_hot(feature, num_classes=4), label, length

    def make_vocab(self):
        """Load the cached label vocabulary, or build and cache it.

        The label vocabulary is stored on disk so that label indices stay the
        same from run to run. The nucleotide vocabulary is always rebuilt.
        """
        vocab_file = save_d__vocab_path + "label_vocab.pt"
        try:
            # NOTE(review): torch.load unpickles the file; only load vocab
            # files that come from a trusted source.
            self.label_vocab = torch.load(vocab_file)
            print("vocabulary loaded sucessuful from the existing vocab")
        except FileNotFoundError:
            # Sort the labels so a freshly built vocabulary is deterministic
            # (set iteration order is not).
            labels = sorted(set(self.raw_data[1].values))
            self.label_vocab = vocab(OrderedDict((token, 1) for token in labels))
            torch.save(self.label_vocab, vocab_file)
            print("a vocab was saved succussfully. ")
        self.Deoxynucleotide_vocab = vocab(
            OrderedDict((token, 1) for token in Deoxynucleotide_ACID_vocab))

    def get_vocab(self):
        """Return the label vocabulary."""
        return self.label_vocab
2. The following is the pipeline showing how the model is trained.
import protcnn_model
from torch.utils.data import DataLoader , random_split
import utils
import fastq_dataset
import hparams
import torch.nn as nn
import torch.optim as optim
import torch
import datetime
import os
# Gradient scaler used by the mixed-precision training loop.
scaler = torch.cuda.amp.GradScaler()

# Device selection. The script targets GPU index 2; use it only when that GPU
# actually exists, otherwise fall back to whatever accelerator is available
# instead of failing on an invalid device ordinal.
if torch.cuda.is_available():
    device = "cuda:2" if torch.cuda.device_count() > 2 else "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Every run gets its own checkpoint directory, named after start time and a tag.
current_time = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
train_flag = "_clip_lr5e5_decayR0997"
model_save_path = "./saved_model/" + current_time + train_flag
print(f"model will be saved in{model_save_path}")
os.makedirs(model_save_path,exist_ok=True)

# Hyperparameters
# -------------------------------------------------------------------------------------------
Hyparams = hparams.hparams_set_train()
print(Hyparams)
# Fraction of the data assigned to each split.
train_ratio = 0.7
dev_ratio = 0.2
test_ratio = 0.1
Batch_size = 32
Num_epochs = Hyparams["num_epochs"]
# -------------------------------------------------------------------------------------------

# Load the whole dataset.
print("begin to load the Fast dataset ")
Data_whole = fastq_dataset.FastqDataset()
print("the data is loaded successfully")

# Size of each split; the test split takes whatever remains after rounding.
num_train = int(len(Data_whole) * train_ratio)
num_dev = int(len(Data_whole) * dev_ratio)
num_test = len(Data_whole) - num_train - num_dev

# Split the data into train, dev and test subsets and wrap them in loaders.
train_dataset, dev_dataset, test_datatest = random_split(Data_whole,[num_train,num_dev,num_test])
train_dataloader = DataLoader(train_dataset,shuffle=True,batch_size=Batch_size,collate_fn=utils.my_collate_fn)
test_dataloader = DataLoader(test_datatest,shuffle=True,batch_size=Batch_size,collate_fn=utils.my_collate_fn)

# Model, loss and optimiser.
My_model = protcnn_model.DNA_Model(Hyparams,len(Data_whole.get_vocab())).to(device)
loss_fn = nn.CrossEntropyLoss()
optimiser = optim.Adam(My_model.parameters(),lr=Hyparams["init_lr_rate"])
def lr_lambda(epoch):
    """Learning-rate factor: linear warm-up, then exponential decay.

    Args:
        epoch: zero-based epoch index.

    Returns:
        ``epoch / turn_point`` while ``epoch <= turn_point`` (so the factor is
        0 at epoch 0), afterwards ``decay_rate ** (epoch - turn_point)``, where
        ``turn_point = int(0.3 * Num_epochs)``.
    """
    turn_point = int(0.3 * Num_epochs)
    if turn_point == 0:
        # With fewer than 4 epochs there is no warm-up phase; dividing by
        # turn_point would raise ZeroDivisionError at epoch 0, so decay only.
        return Hyparams["decay_rate"] ** epoch
    if epoch <= turn_point:
        # warm-up
        return epoch / turn_point
    # exponential decay
    return Hyparams["decay_rate"] ** (epoch - turn_point)
def lr_lambda2(epoch):
    """Learning-rate factor: linear warm-up to 1, then exponential decay.

    Args:
        epoch: zero-based epoch index.

    Returns:
        ``(epoch + 1) / warmup_epochs`` during the first ``warmup_epochs``
        epochs, afterwards ``decay_rate ** (epoch - warmup_epochs)``, where
        ``warmup_epochs = int(0.3 * Num_epochs)``.
    """
    warmup_epochs = int(0.3 * Num_epochs)
    # Strict comparison: the warm-up reaches exactly 1.0 at its last epoch
    # rather than overshooting to (w + 1) / w, and warmup_epochs == 0 never
    # reaches the division.
    if epoch < warmup_epochs:
        return (epoch + 1) / warmup_epochs
    return Hyparams["decay_rate"] ** (epoch - warmup_epochs)
# Schedule the learning rate of `optimiser` with lr_lambda2; stepped once per epoch in train().
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimiser,lr_lambda=lr_lambda2)
def train(dataloader, model, loss_fn, optimiser):
    """Run the full mixed-precision training loop for ``Num_epochs`` epochs.

    After every epoch the model weights are saved under ``model_save_path``,
    the learning-rate scheduler is stepped and the model is evaluated on
    ``test_dataloader``.

    Args:
        dataloader: yields ``(features, labels, length)`` batches.
        model: network called as ``model(features, length)``.
        loss_fn: loss applied to ``(pred, labels)``.
        optimiser: optimiser updating ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    for epoch in range(Num_epochs):
        model.train()
        print(f"start traing ===> epoch is {epoch}")
        llrr = optimiser.param_groups[0]["lr"]
        print(f"the learning rate of epoch is ----{llrr}")

        for batch, (features, labels, length) in enumerate(dataloader):
            features = features.to(device)
            labels = labels.to(device)
            optimiser.zero_grad()

            # Forward pass under autocast; the sequence lengths act as the mask.
            with torch.cuda.amp.autocast():
                pred = model(features, length)
                loss = loss_fn(pred, labels)

            # Scaled backward pass; gradients are unscaled before clipping so
            # the clipping threshold applies to the true gradient norm.
            scaler.scale(loss).backward()
            scaler.unscale_(optimiser)
            torch.nn.utils.clip_grad_norm_(model.parameters(), Hyparams["max_norm"])
            scaler.step(optimiser)
            scaler.update()  # adjust the scale for the next iteration

            # Progress report every 100 batches.
            if batch % 100:
                continue
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            loss, current_number = loss.item(), (batch + 1) * len(features)
            print(f"loss: {loss:>7f} [{current_number:>5d}/{size:>5d} ==> epoch : {epoch}] current time: {current_time}")

        checkpoint = os.path.join(model_save_path, f"my_model_epoch[{epoch}].pth")
        torch.save(model.state_dict(), checkpoint)
        print(f"Model in epoch: [{epoch}] saved successfully")
        lr_scheduler.step()
        test(test_dataloader, model, loss_fn)
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Args:
        dataloader: yields ``(features, labels, length)`` batches.
        model: network called as ``model(features, length)``; returns logits.
        loss_fn: loss applied to ``(pred, labels)``.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # set the model to eval mode
    model.eval()
    if size == 0 or num_batches == 0:
        # An empty split would otherwise end in a division by zero below.
        print("Test Error: \n no samples to evaluate\n")
        return
    test_loss, correct = 0, 0
    with torch.no_grad():
        for features, labels, length in dataloader:
            features, labels = features.to(device), labels.to(device)
            pred = model(features, length)  # logits
            test_loss += loss_fn(pred, labels).item()
            correct += (pred.argmax(1) == labels).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    # TODO: also print timing information here.
    print(f"Test Error: \n Accuracy: {(100 * correct):0.1f}%, Avg loss : {test_loss:>8f}\n")
def test_saved_model():
    """Placeholder; currently does nothing and returns None."""
# Script entry point: run the training loop on the training split.
if __name__ == "__main__":
    train(
        dataloader=train_dataloader,
        model=My_model,
        loss_fn=loss_fn,
        optimiser=optimiser,
    )
def debug(option):
    """Run one stage by number: 1 trains, 2 evaluates, anything else is a no-op."""
    if option == 1:
        train(
            dataloader=train_dataloader,
            model=My_model,
            loss_fn=loss_fn,
            optimiser=optimiser,
        )
        return
    if option == 2:
        test(test_dataloader, My_model, loss_fn)
"""
3. The following are the details of the model.
import torch.nn as nn
import torch
import math
class Conv1d_with_mask(nn.Module):
    """"Same"-padded 1-D convolution that zeroes padded positions.

    Positions at or beyond each sample's length are set to zero both before
    and after the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1) -> None:
        super().__init__()
        self.conv_layer = nn.Conv1d(
            in_channels, out_channels, kernel_size, dilation=dilation, padding="same")

    def forward(self, x, length):
        """Apply the masked convolution.

        Args:
            x: tensor indexed as (batch, channels, position).
            length: per-sample valid lengths, one entry per batch element
                (list of ints or tensor).

        Returns:
            The convolution output with positions >= length set to zero.
        """
        # One boolean mask of shape (batch, 1, position), broadcast over the
        # channels, replaces two full-size ones_like() masks and a Python loop.
        lengths = torch.as_tensor(length, device=x.device).reshape(-1, 1, 1)
        keep = torch.arange(x.size(-1), device=x.device) < lengths

        # The multiplications must stay on the autograd graph: doing them
        # under torch.no_grad() detaches x and stops gradients from reaching
        # the layers that produced it.
        x = x * keep.to(x.dtype)
        x = self.conv_layer(x)
        # "same" padding keeps the position axis unchanged, so the mask is reused.
        return x * keep.to(x.dtype)
class Residual_Block(nn.Module):
    """Pre-activation residual block: BN -> ReLU -> dilated masked conv ->
    BN -> ReLU -> 1x1 masked conv, added to the block input."""

    def __init__(self, in_channels, layer_index, hparams) -> None:
        """
        Args:
            in_channels: number of channels of the block input.
            layer_index: index of this block; determines the dilation.
            hparams: dictionary from str to hyper parameters. Reads
                "first_dilated_layer", "dilation_rate",
                "resnet_bottleneck_factor", "filters" and "kernel_size".
        """
        super().__init__()
        # Dilation grows exponentially from the first dilated layer onwards
        # and is clamped to 1 for earlier layers.
        exponent = layer_index - hparams["first_dilated_layer"] + 1
        dilation = max(1, hparams["dilation_rate"] ** exponent)
        bottleneck_width = math.floor(
            hparams["resnet_bottleneck_factor"] * hparams["filters"])

        self.batch_norm_1 = nn.BatchNorm1d(num_features=in_channels)
        self.active_fn = nn.ReLU()
        self.diate_conv = Conv1d_with_mask(
            in_channels, bottleneck_width,
            kernel_size=hparams["kernel_size"], dilation=dilation)
        self.batch_norm_2 = nn.BatchNorm1d(num_features=bottleneck_width)
        self.bottleneck_conv = Conv1d_with_mask(
            bottleneck_width, hparams["filters"], kernel_size=1, dilation=1)

    def forward(self, x, length):
        """Return ``x`` plus the residual branch; ``length`` is forwarded to
        both masked convolutions."""
        branch = self.active_fn(self.batch_norm_1(x))
        branch = self.diate_conv(branch, length)
        branch = self.active_fn(self.batch_norm_2(branch))
        branch = self.bottleneck_conv(branch, length)
        return branch + x  # skip connection
class DNA_Model(nn.Module):
    """Masked convolution stack followed by max pooling over positions and a
    linear classifier."""

    def __init__(self, hparams, num_output_classes) -> None:
        """Build the neural network.

        Args:
            hparams: dictionary of hyper parameters. Reads "filters",
                "kernel_size" and "num_layers" here and is passed on to every
                residual block.
            num_output_classes: size of the output (logit) dimension.
        """
        super().__init__()
        # Initial convolution from the 4 input channels to `filters` channels.
        self.init_conv = Conv1d_with_mask(
            4, hparams["filters"], hparams["kernel_size"], dilation=1)
        self.residual_blocks = nn.ModuleList(
            [Residual_Block(hparams["filters"], layer_index, hparams)
             for layer_index in range(hparams["num_layers"])])
        self.predic_layer = nn.Linear(hparams["filters"], num_output_classes)

    def forward(self, x, length):
        """Compute the logits for a batch.

        Args:
            x: size(batch, input_embedding, max_length)
            length: per-sample valid lengths, one entry per batch element.

        Returns:
            Tensor of size (batch, num_output_classes).
        """
        out = self.init_conv(x, length)
        for resnet_block in self.residual_blocks:
            out = resnet_block(out, length)

        # Zero the padded positions once more before pooling. The mask is a
        # broadcastable (batch, 1, max_length) tensor and the multiplication
        # is on the autograd graph, so gradients reach the convolution stack.
        lengths = torch.as_tensor(length, device=out.device).reshape(-1, 1, 1)
        keep = torch.arange(out.size(-1), device=out.device) < lengths
        out = out * keep.to(out.dtype)

        # Max over the whole position axis -> (batch, filters, 1).
        # NOTE: padded positions hold 0, so 0 is a lower bound of the pooled value.
        out = torch.max_pool1d(out, x.size(-1), x.size(-1))
        # flatten(1) rather than squeeze keeps the batch axis for batches of one.
        out = out.flatten(1)
        return self.predic_layer(out)
There are only a few differences between the training runs. The first difference is the number of training samples: I just pass 1000 as the nrows argument, i.e. pd.read_csv(…, nrows = 1000). That is the only change in the code. The whole dataset contains almost 4 million samples. The second difference, obviously, is the batch size: 16 versus 32.
I do not know whether I am mistakenly accumulating memory during training, so I have pasted the details of the code below.
def __init__(self, path_to_fastq_csv=new_DataPath) -> None:
    """Read the whole CSV and build the vocabularies.

    The raw table is stored on the instance; samples are processed one at a
    time when they are requested.
    """
    self.path_to_fastq_csv = path_to_fastq_csv
    # Header-less CSV, read in full.
    self.raw_data = pd.read_csv(path_to_fastq_csv, header=None)
    self.make_vocab()
def __init__(self, path_to_fastq_csv=new_DataPath) -> None:
    """Read the first 1000 rows of the CSV and build the vocabularies.

    The raw table is stored on the instance; samples are processed one at a
    time when they are requested.
    """
    self.path_to_fastq_csv = path_to_fastq_csv
    # Header-less CSV, limited to the first 1000 rows.
    self.raw_data = pd.read_csv(path_to_fastq_csv, header=None, nrows=1000)
    self.make_vocab()