Hello,
I am training a ViT network with PyTorch, and training is very slow. When I checked Task Manager, I found that although the dedicated GPU memory is fully used, GPU utilization is 0%. More precisely, GPU utilization first rises and then, after a few seconds, drops back to 0.
I tried this method, but the situation stayed the same as before.
I also tried this method and got the following output, which is weird:
data loading time: 59.500293016433716
data loading time: 60.46416115760803
data loading time: 61.4690420627594
data loading time: 62.47226524353027
data loading time: 63.4786810874939
data loading time: 64.48584127426147
data loading time: 65.48938298225403
The problem is that __getitem__ only returns random tensors, so why is the reported data loading time so high? Note also that each printed value is about one second larger than the previous one.
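For reference, a correct per-batch measurement would look like the sketch below (train_loader is a placeholder name, not necessarily what my code uses). If the timer is only started once and never reset inside the loop, the printed value just accumulates, which would match the roughly one-second growth per line above.

    import time

    end = time.time()
    for images, labels in train_loader:  # train_loader: placeholder for my loader
        # Time spent waiting for the DataLoader to yield this batch.
        data_time = time.time() - end
        print('data loading time:', data_time)

        # ... forward/backward pass would go here ...

        end = time.time()  # reset so the next measurement covers only loading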
This is my code for data preparation:
import copy
import os

import torch
from torch.utils.data import DataLoader, Dataset


class MyDataset(Dataset):
    def __init__(self, img_folder, mask_folder, ratings, config, transform=None):
        self.data_list = os.listdir(img_folder)
        self.img_folder = img_folder
        self.mask_folder = mask_folder
        self.transform = transform
        self.config = config
        self.ratings = ratings
        self.data = {}  # currently unused

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        img_name = self.data_list[idx]
        # The real image/mask loading is disabled while I debug;
        # a random tensor is returned instead:
        # image = torch.load(os.path.join(self.img_folder, img_name))
        # mask_name = img_name.split('.')[0] + '_mask.' + img_name.split('.')[1]
        # img_path = os.path.join(self.img_folder, img_name)
        # if self.mask_folder is not None:
        #     mask_path = os.path.join(self.mask_folder, mask_name)
        # else:
        #     mask_path = None
        # image = prepare_image_mask(img_path, mask_path, self.config)
        image = torch.rand(705, 3075)
        labels = torch.Tensor(self.ratings[img_name.split('.')[0]])
        return image, labels
def prepare_dataloaders(config, ratings):
    my_dataset = MyDataset(config.path.img_folder, config.path.mask_folder, ratings, config.data)

    config_test = copy.deepcopy(config)
    config_test.data.max_seq_len_from_original_res = -1
    config_test.training.batch_size = 1  # inference runs at the original resolution
    my_testset = MyDataset(config.path.test_folder, config.path.test_mask_folder, ratings, config_test.data)

    # Split the dataset into train and validation sets
    train_size = int(len(my_dataset) * config.training.train_size)
    train_dataset, val_dataset = torch.utils.data.random_split(
        my_dataset, [train_size, len(my_dataset) - train_size])

    # Create DataLoaders for the train, validation, and test sets
    train_loader = DataLoader(train_dataset, batch_size=config.training.batch_size,
                              shuffle=config.training.shuffle_data,
                              num_workers=config.training.num_workers)
    val_loader = DataLoader(val_dataset, batch_size=config.training.batch_size,
                            shuffle=config.training.shuffle_data,
                            num_workers=config.training.num_workers)
    test_loader = DataLoader(my_testset, batch_size=config_test.training.batch_size,
                             shuffle=config_test.training.shuffle_data,
                             num_workers=config_test.training.num_workers)

    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader,
    }
    return dataloaders
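To separate data loading from the model step, a standalone loop like the one below (a sketch, reusing the config and ratings objects from above) iterates the train loader on its own and times each batch. If this runs fast, the stall is on the GPU side rather than in the DataLoader.

    import time

    dataloaders = prepare_dataloaders(config, ratings)
    train_loader = dataloaders['train']

    end = time.time()
    for i, (images, labels) in enumerate(train_loader):
        print(f'batch {i}: {time.time() - end:.3f} s')
        end = time.time()
        if i >= 20:  # a few batches are enough for a smoke test
            break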