Hello,
I’m running PyTorch Lightning with Ray Tune.
When I run this function:
def run_ray(
    metric='val_acc',
    mode='max',
    num_samples=3,
    config_dict=None,
    checkpoint_file_name="penetration_move_to_oracle_ray_ckpt",
    config_file='penetration/best_config.txt',
    local_dir='penetration/runs/',
):
    """Run a Ray Tune HyperOpt search over ``train_fn`` and load the best model.

    Args:
        metric: Name of the reported metric used to rank trials.
        mode: ``'max'`` or ``'min'``; direction in which ``metric`` is optimised.
        num_samples: Number of trials passed to ``tune.TuneConfig``.
        config_dict: Search space passed as ``param_space``. ``None`` (the
            default) is treated as an empty dict.
        checkpoint_file_name: File name joined onto the best checkpoint
            directory to locate the Lightning checkpoint.
        config_file: Path of a text file; the best config and the best
            trial's log directory are appended to it.
        local_dir: Directory handed to ``RunConfig(local_dir=...)``.

    Returns:
        A ``(log_dir, model)`` tuple: the best trial's ``log_dir`` and the
        ``GraphLevelGNN`` restored from the best checkpoint.
    """
    # A literal ``{}`` default would be one dict object shared by every call.
    if config_dict is None:
        config_dict = {}

    hyperopt_search = HyperOptSearch(metric=metric, mode=mode)
    # Each trial is given one GPU.
    trainable = tune.with_resources(train_fn, {"gpu": 1})
    tuner = tune.Tuner(
        trainable,
        tune_config=tune.TuneConfig(num_samples=num_samples, search_alg=hyperopt_search),
        param_space=config_dict,
        run_config=RunConfig(local_dir=local_dir),
    )
    results = tuner.fit()
    best_result = results.get_best_result(metric=metric, mode=mode)

    # The context manager closes the file even if loading the checkpoint
    # below raises; the path parameter is no longer rebound to a file object.
    with open(config_file, 'a') as config_out:
        config_out.write(str(best_result.config) + '\n')
        best_checkpoint = best_result.checkpoint
        path = os.path.join(str(best_checkpoint.to_directory()), checkpoint_file_name)
        print(path)
        model = GraphLevelGNN.load_from_checkpoint(path)
        config_out.write(str(best_result.log_dir))
    return best_result.log_dir, model
Like this:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in binary mode, decoded with ``encoding='latin1'``
    and closed as soon as loading finishes.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')


# Datasets and their accompanying sequence objects.
train_dataset = _load_pickle('train_dataset.pkl')
train_seqs = _load_pickle('train_seqs.pkl')
test_dataset = _load_pickle('test_dataset.pkl')
test_seqs = _load_pickle('test_seqs.pkl')
val_dataset = _load_pickle('val_dataset.pkl')
val_seqs = _load_pickle('val_seqs.pkl')

# NOTE(review): these are pickled loader objects. Unpickling restores the
# attributes saved at dump time instead of re-running the constructor, so a
# loader pickled under a different torch release may lack attributes the
# installed release expects (the reported AttributeError names
# 'pin_memory_device'). Presumably safer to pickle only the datasets and
# rebuild the loaders at load time — TODO confirm which torch version wrote
# these files.
graph_train_loader = _load_pickle('graph_train_loader.pkl')
graph_test_loader = _load_pickle('graph_test_loader.pkl')
graph_val_loader = _load_pickle('graph_val_loader.pkl')
# Step 2: define the hyperparameter search space explored by the HPO run.
config_dict = {
    # Candidate hidden sizes.
    # NOTE(review): 2056 may be a typo for 2048 — confirm.
    "c_hidden": tune.choice([32, 64, 128, 256, 512, 1024, 2056]),
    # Dropout rate, sampled uniformly from [0.4, 0.8].
    "dp_rate_linear": tune.uniform(0.4, 0.8),
    # Integer layer count; per the Ray Tune docs the upper bound is exclusive.
    "num_layers": tune.randint(3, 20),
    # Activations are given as strings; presumably turned into modules
    # inside the training function — TODO confirm.
    "activation_function": tune.choice(
        [
            'nn.ReLU(inplace=True)',
            'nn.LeakyReLU(inplace=True)',
            'nn.Sigmoid()',
            'nn.Tanh()',
        ]
    ),
    # Optimizer selected by name.
    "optimizer_name": tune.choice(
        [
            'SGD',
            'NAdam',
            'Adam',
            'RMSProp',
            'ASGD',
            'LBFGS',
            'AdamW',
            'Adadelta',
        ]
    ),
    # Learning rate, sampled uniformly from [0.0001, 0.1].
    "learning_rate": tune.uniform(0.0001, 0.1),
}

# Run the search and keep the best trial's log directory and restored model.
best_result_log_path, model = run_ray(config_dict=config_dict)
I get the error:
Sanity Checking: 0it [00:00, ?it/s]
2023-01-12 16:24:51,170 ERROR trial_runner.py:980 -- Trial train_fn_9d4a965b: Error processing event.
ray.exceptions.RayTaskError(AttributeError): ray::ImplicitFunc.train() (pid=83179, ip=10.0.0.106, repr=train_fn)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 347, in train
result = self.step()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
self._report_thread_runner_error(block=True)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
raise e
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
self._entrypoint()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
return self._trainable_func(
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
output = fn()
File "/home/ubuntu/penetration_move_to_oracle2/penetration_v1.py", line 320, in train_fn
train_graph_classifier(
File "/home/ubuntu/penetration_move_to_oracle2/penetration_v1.py", line 306, in train_graph_classifier
model = trainer.fit(model, graph_train_loader, graph_val_loader)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in fit
self._call_and_handle_interrupt(
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 723, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1236, in _run
results = self._run_stage()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1323, in _run_stage
return self._run_train()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1345, in _run_train
self._run_sanity_check()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1413, in _run_sanity_check
val_loop.run()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 155, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 199, in run
self.on_run_start(*args, **kwargs)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 88, in on_run_start
self._data_fetcher = iter(data_fetcher)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/fetching.py", line 178, in __iter__
self.dataloader_iter = iter(self.dataloader)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__
return self._get_iterator()
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 387, in _get_iterator
return _SingleProcessDataLoaderIter(self)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 712, in __init__
super(_SingleProcessDataLoaderIter, self).__init__(loader)
File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 638, in __init__
if (len(loader.pin_memory_device) == 0):
AttributeError: 'DataLoader' object has no attribute 'pin_memory_device'
Could someone explain what this means?
Simply reading the datasets/loaders from the pickle files does not raise an error; the error only occurs once the run_ray function is called.
The versions of the libraries I’m using are:
torch 1.12.1
torch-cluster 1.6.0
torch-geometric 2.1.0.post1
torch-scatter 2.0.9
torch-sparse 0.6.15
torchmetrics 0.10.0
ray 2.0.0
pytorch-lightning 1.6.5