Hi,
I was wondering if anyone could help me with hyperparameter tuning an LSTM. I went with Ray Tune because I used it previously with CNNs for a piece of coursework, but I keep running into errors I don't know how to solve when applying it to an LSTM.
I am not set on Ray Tune - if someone knows an easier option, please let me know! Every tutorial I have found online uses a CNN, which has not been much help.
I would greatly appreciate some help with this, as it is for my master's project!
Here is my code:
checkpoint_dir = '/content/Checkpoint'
epochs = 5

def custom_train_part(config, checkpoint_dir=checkpoint_dir, data_dir=None):
    model = LSTM(len(True_IMF_df.T), config["Hidden"], config["Layers"], 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
    model = model.to(device)  # move the model parameters to CPU/GPU

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    for e in range(epochs):
        running_loss = 0.0
        epoch_steps = 0

        model.train()  # put model to training mode
        x = x_hht_train.to(device)  # move to device, e.g. GPU
        y = y_hht_train.to(device)
        print(f"The shape of the data is: {x.shape}")

        scores = model(x)
        loss = F.mse_loss(scores, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        epoch_steps += 1
        if e % 5 == 0:
            print('Epoch: %d, loss = %.4f' % (e, loss.item()))
            # check_accuracy(loader_val, model)
            print()

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            x = x_hht_val.to(device)  # move to device, e.g. GPU
            y = y_hht_val.to(device)
            scores = model(x)
            correct += (np.sign(scores) == np.sign(y)).sum().item()
            loss = F.mse_loss(scores, y)
            val_loss += loss.cpu().numpy()
            val_steps += 1

        with tune.checkpoint_dir(e) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / len(y_hht_val))

    print("Finished Training")
config = {
    "lr": tune.loguniform(1e-6, 1e-1),
    "Layers": tune.sample_from(lambda _: np.random.randint(1, 20)),
    "Hidden": tune.sample_from(lambda _: np.random.randint(2, 200))
}
reporter = CLIReporter(
    parameter_columns=["lr", "Layers", "Hidden"],
    metric_columns=["loss", "accuracy", "training_iteration"]
)
result = tune.run(
    partial(custom_train_part, checkpoint_dir=checkpoint_dir, data_dir=None),
    resources_per_trial={"cpu": 2, "gpu": 1},
    config=config,
    num_samples=1,
    scheduler=scheduler,
    progress_reporter=reporter
)
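(The scheduler passed above got trimmed from my paste - going by the "Using AsyncHyperBand" lines in the output below, it is an ASHAScheduler defined earlier, roughly like this; the exact arguments are from memory, so treat them as approximate:)

from ray.tune.schedulers import ASHAScheduler

# Approximate reconstruction: the status output shows AsyncHyperBand with
# brackets at iterations 1/2/4/8, consistent with a reduction factor of 2.
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=10,
    grace_period=1,
    reduction_factor=2
)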
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))
##############################################################
#                        AFTER TUNING                        #
##############################################################
best_trained_model = LSTM(len(True_IMF_df.T), best_trial.config["Hidden"], best_trial.config["Layers"], 1)
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(
    best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)
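After restoring the weights, I move the model onto the device and switch it to eval mode before predicting (a minimal sketch, assuming device and the validation tensors are set up as in the training function above):

best_trained_model = best_trained_model.to(device)
best_trained_model.eval()  # disable dropout for inference

with torch.no_grad():
    preds = best_trained_model(x_hht_val.to(device))
    print(f"Validation MSE: {F.mse_loss(preds, y_hht_val.to(device)).item():.4f}")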
The error message:
2021-05-19 12:06:48,454 WARNING experiment.py:294 -- No name detected on trainable. Using DEFAULT.
2021-05-19 12:06:48,455 INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.
== Status ==
Memory usage on this node: 5.4/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/1 GPUs, 0.0/14.99 GiB heap, 0.0/7.5 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /root/ray_results/DEFAULT_2021-05-19_12-06-48
Number of trials: 1/1 (1 PENDING)
+---------------------+----------+-------+----------+----------+-------------+
| Trial name | status | loc | Hidden | Layers | lr |
|---------------------+----------+-------+----------+----------+-------------|
| DEFAULT_b0772_00000 | PENDING | | 75 | 1 | 4.51358e-05 |
+---------------------+----------+-------+----------+----------+-------------+
2021-05-19 12:06:50,877 WARNING worker.py:1115 -- Warning: The actor ImplicitFunc has size 70352637 when pickled. It will be stored in Redis, which could cause memory issues. This may mean that its definition uses a large array or other object.
2021-05-19 12:06:50,950 WARNING util.py:162 -- The `start_trial` operation took 0.934 s, which may be a performance bottleneck.
(pid=1741) /usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py:63: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1
(pid=1741) "num_layers={}".format(dropout, num_layers))
(pid=1741) /usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py:528: UserWarning: Using a target size (torch.Size([4208, 1])) that is different to the input size (torch.Size([4208, 75])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
(pid=1741) return F.mse_loss(input, target, reduction=self.reduction)
(pid=1741) 2021-05-19 12:06:55,607 ERROR function_runner.py:254 -- Runner Thread raised error.
(pid=1741) Traceback (most recent call last):
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
(pid=1741) self._entrypoint()
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
(pid=1741) self._status_reporter.get_checkpoint())
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
(pid=1741) output = fn()
(pid=1741) File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
(pid=1741) return self.numpy()
(pid=1741) TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741) Exception in thread Thread-2:
(pid=1741) Traceback (most recent call last):
(pid=1741) File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
(pid=1741) self.run()
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 267, in run
(pid=1741) raise e
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
(pid=1741) self._entrypoint()
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
(pid=1741) self._status_reporter.get_checkpoint())
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
(pid=1741) output = fn()
(pid=1741) File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
(pid=1741) File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
(pid=1741) return self.numpy()
(pid=1741) TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741)
2021-05-19 12:06:55,747 ERROR trial_runner.py:732 -- Trial DEFAULT_b0772_00000: Error processing event.
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 702, in _process_trial
results = self.trial_executor.fetch_result(trial)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 686, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1481, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train_buffered() (pid=1741, ip=172.28.0.2)
File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
File "/usr/local/lib/python3.7/dist-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 173, in train_buffered
result = self.train()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 232, in train
result = self.step()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 366, in step
self._report_thread_runner_error(block=True)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 513, in _report_thread_runner_error
("Trial raised an exception. Traceback:\n{}".format(err_tb_str)
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train_buffered() (pid=1741, ip=172.28.0.2)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
self._entrypoint()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
self._status_reporter.get_checkpoint())
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
output = fn()
File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
return self.numpy()
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741) Running loss: 17.041641235351562
(pid=1741) Epoch: 0, loss = 17.041641235351562.4f
(pid=1741)
Result for DEFAULT_b0772_00000:
{}
== Status ==
Memory usage on this node: 7.1/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/1 GPUs, 0.0/14.99 GiB heap, 0.0/7.5 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /root/ray_results/DEFAULT_2021-05-19_12-06-48
Number of trials: 1/1 (1 ERROR)
+---------------------+----------+-------+----------+----------+-------------+
| Trial name | status | loc | Hidden | Layers | lr |
|---------------------+----------+-------+----------+----------+-------------|
| DEFAULT_b0772_00000 | ERROR | | 75 | 1 | 4.51358e-05 |
+---------------------+----------+-------+----------+----------+-------------+
Number of errored trials: 1
+---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------|
| DEFAULT_b0772_00000 | 1 | /root/ray_results/DEFAULT_2021-05-19_12-06-48/DEFAULT_b0772_00000_0_Hidden=75,Layers=1,lr=4.5136e-05_2021-05-19_12-06-49/error.txt |
+---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------+
---------------------------------------------------------------------------
TuneError Traceback (most recent call last)
<ipython-input-198-4b7222b1d1ac> in <module>()
95 num_samples=1,
96 scheduler = scheduler,
---> 97 progress_reporter=reporter
98 )
99
/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, loggers, ray_auto_init, run_errored_only, global_checkpoint_period, with_server, upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint, _remote)
541 if incomplete_trials:
542 if raise_on_failed_trial and not state[signal.SIGINT]:
--> 543 raise TuneError("Trials did not complete", incomplete_trials)
544 else:
545 logger.error("Trials did not complete: %s", incomplete_trials)
TuneError: ('Trials did not complete', [DEFAULT_b0772_00000])
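Looking at the traceback, the failure seems to come from my validation accuracy line: np.sign() tries to convert the CUDA tensors to numpy via Tensor.__array__, which raises exactly this TypeError. Would either of these be the right fix (staying in torch, or copying to CPU before numpy)?

# Option 1: compute the comparison in torch, so nothing touches numpy on the GPU
correct += (torch.sign(scores) == torch.sign(y)).sum().item()

# Option 2: copy both tensors to host memory before handing them to numpy
correct += (np.sign(scores.cpu().numpy()) == np.sign(y.cpu().numpy())).sum()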