Hi, I've tried to set CUDA_VISIBLE_DEVICES = '1' in the main function, but when I move the model to CUDA, it does not move to GPU1 — it goes to GPU0 instead (resulting in OOM, because GPU0 is already in use). Please tell me what I'm doing wrong.
Here is my code:
in train.py:
def main(config_file_path):
    """Entry point: parse the YAML config, build the model/data loaders, and train.

    NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set BEFORE
    the CUDA runtime is initialized. If `torch` (or anything that touches
    torch.cuda) has already initialized CUDA by the time this line runs, the
    mask is silently ignored and tensors land on physical GPU0 — which matches
    the reported symptom. The truly safe place to set it is at the very top of
    the script, before `import torch`.
    """
    config = SspmYamlConfig(config_file_path)
    trainer_cfg = config.get_trainer_cfg()

    # Set the device mask as early as possible, before any model or dataloader
    # code gets a chance to initialize CUDA.
    # e.g. trainer_cfg['device'] == [1]  ->  CUDA_VISIBLE_DEVICES='1'
    device = ','.join(map(str, trainer_cfg['device']))
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = device

    dataloader_cfg = config.get_dataloader_cfg()
    logger_cfg = config.get_logger_cfg()
    model_cfg = config.get_model_cfg()
    pose_dataset_cfg = config.get_pose_dataset_cfg()
    data_augmentation_cfg = config.get_augmentation_cfg()
    target_generator_cfg = config.get_target_generator_cfg()
    learning_rate = trainer_cfg['optimizer']['learning_rate']

    model = getModel(model_cfg)
    # NOTE(review): train_dataset / val_dataset / optimizer / logger / writer
    # are not defined in this snippet — presumably built elsewhere in train.py.
    train_loader = DataLoader(train_dataset, **dataloader_cfg['train'])
    val_loader = DataLoader(val_dataset, **dataloader_cfg['val'])
    trainer = Trainer(
        model, optimizer, logger,
        writer, config, train_loader, val_loader
    )
The Trainer class inherits from BaseTrainer, where the model is transferred to CUDA:
class BaseTrainer(ABC):
    """Base trainer: holds config/optimizer/logging handles and places the
    model on the device selected in the trainer config.

    NOTE(review): once CUDA_VISIBLE_DEVICES has masked the GPUs, the visible
    devices are renumbered from 0, so `torch.device('cuda')` (i.e. cuda:0)
    is the first *visible* GPU — physical GPU1 when the mask is '1'. If the
    model still lands on physical GPU0, the mask was applied after CUDA had
    already been initialized (set it before `import torch`).
    """

    def __init__(self, model, optimizer, logger, writer, config):
        self.config = config
        self.logger = logger
        self.writer = writer
        self.optimizer = optimizer
        self.trainer_config = config.get_trainer_cfg()
        # List of *physical* GPU indices requested in the config, e.g. [1].
        self.device_list = self.trainer_config['device']
        self.device_type = self._check_gpu(self.device_list)
        self.device = torch.device(self.device_type)
        self.model = model.to(self.device)
        # Fix: only wrap in DataParallel when there is actually more than one
        # GPU. Wrapping a CPU model (or a single-GPU model) adds overhead and
        # a confusing `module.` prefix to state-dict keys for no benefit.
        if self.device_type == 'cuda' and len(self.device_list) > 1:
            self.model = torch.nn.DataParallel(self.model)

    def _check_gpu(self, gpus):
        """Return 'cuda' if GPUs were requested, available, and free
        (< 1000 MiB used each); otherwise fall back to 'cpu'.

        Raises:
            ValueError: if any requested GPU already has > 1000 MiB in use.

        NOTE(review): pynvml indexes *physical* GPUs regardless of
        CUDA_VISIBLE_DEVICES; this matches the config indices only because
        CUDA_DEVICE_ORDER=PCI_BUS_ID is set upstream — confirm.
        """
        if not (len(gpus) > 0 and torch.cuda.is_available()):
            self.logger.info('Using CPU!')
            return 'cpu'
        pynvml.nvmlInit()
        try:
            for i in gpus:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                memused = meminfo.used / 1024 / 1024
                self.logger.info('GPU{} used: {}M'.format(i, memused))
                if memused > 1000:
                    raise ValueError('GPU{} is occupied!'.format(i))
        finally:
            # Fix: shutdown was previously skipped if anything in the loop
            # raised unexpectedly; try/finally guarantees NVML cleanup.
            pynvml.nvmlShutdown()
        return 'cuda'