Hyperparameter tuning an LSTM

Hi,

I was wondering if anyone could help me with hyperparameter tuning an LSTM. I chose Ray Tune because I used it previously with CNNs for a piece of coursework, but when I use it to tune an LSTM I keep running into errors that I don't know how to solve.

I am not set on Ray Tune, so if anyone knows an easier option, please let me know! Every tutorial I have found online uses a CNN, which is not much help here.

I would greatly appreciate some help with this, as it is for my master's project!

Here is my code:

import os
from functools import partial

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ray import tune
from ray.tune import CLIReporter

# The LSTM class, device, and the True_IMF_df / x_hht_* / y_hht_* data are
# defined earlier in the notebook.
checkpoint_dir = '/content/Checkpoint'
epochs = 5

def custom_train_part(config, checkpoint_dir=checkpoint_dir, data_dir=None):
    model = LSTM(len(True_IMF_df.T), config["Hidden"], config["Layers"], 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    model = model.to(device)  # move the model parameters to CPU/GPU

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
    model.to(device)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    
    for e in range(epochs):

        running_loss = 0.0
        epoch_steps = 0
        
        model.train()  # put model to training mode
        x = x_hht_train.to(device)  # move to device, e.g. GPU
        y = y_hht_train.to(device)
        print(f"The shape of the data is: {x.shape}")

        scores = model(x)
        loss = F.mse_loss(scores, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        epoch_steps += 1

        if e % 5 == 0:
            print('Epoch: %d, loss = %.4f' % (e, loss.item()))
            # check_accuracy(loader_val, model)
            print()
    
        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        
        with torch.no_grad():
            x = x_hht_val.to(device)  # move to device, e.g. GPU
            y = y_hht_val.to(device)

            scores = model(x)
            correct += (np.sign(scores) == np.sign(y)).sum().item()

            loss = F.mse_loss(scores, y)
            val_loss += loss.cpu().numpy()
            val_steps += 1

        with tune.checkpoint_dir(e) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy = correct / len(y_hht_val))
    print("Finished Training")


config = {
    "lr": tune.loguniform(1e-6, 1e-1),
    "Layers": tune.sample_from(lambda _: np.random.randint(1, 20)),
    "Hidden" : tune.sample_from(lambda _: np.random.randint(2, 200))
}

reporter = CLIReporter(
    parameter_columns=["lr", "Layers", "Hidden"],
    metric_columns=["loss", "accuracy", "training_iteration"]
)

result = tune.run(
    partial(custom_train_part, checkpoint_dir=checkpoint_dir, data_dir=None),
    resources_per_trial={"cpu": 2, "gpu": 1},
    config=config,
    num_samples=1,
    progress_reporter=reporter
)

best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))

##############################################################
#                       AFTER TUNING                        #
##############################################################

best_trained_model = LSTM(len(True_IMF_df.T), best_trial.config["Hidden"], best_trial.config["Layers"], 1)
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(
    best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)

The error message:

2021-05-19 12:06:48,454	WARNING experiment.py:294 -- No name detected on trainable. Using DEFAULT.
2021-05-19 12:06:48,455	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.
== Status ==
Memory usage on this node: 5.4/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/1 GPUs, 0.0/14.99 GiB heap, 0.0/7.5 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /root/ray_results/DEFAULT_2021-05-19_12-06-48
Number of trials: 1/1 (1 PENDING)
+---------------------+----------+-------+----------+----------+-------------+
| Trial name          | status   | loc   |   Hidden |   Layers |          lr |
|---------------------+----------+-------+----------+----------+-------------|
| DEFAULT_b0772_00000 | PENDING  |       |       75 |        1 | 4.51358e-05 |
+---------------------+----------+-------+----------+----------+-------------+


2021-05-19 12:06:50,877	WARNING worker.py:1115 -- Warning: The actor ImplicitFunc has size 70352637 when pickled. It will be stored in Redis, which could cause memory issues. This may mean that its definition uses a large array or other object.
2021-05-19 12:06:50,950	WARNING util.py:162 -- The `start_trial` operation took 0.934 s, which may be a performance bottleneck.
(pid=1741) /usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py:63: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1
(pid=1741)   "num_layers={}".format(dropout, num_layers))
(pid=1741) /usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py:528: UserWarning: Using a target size (torch.Size([4208, 1])) that is different to the input size (torch.Size([4208, 75])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
(pid=1741)   return F.mse_loss(input, target, reduction=self.reduction)
(pid=1741) 2021-05-19 12:06:55,607	ERROR function_runner.py:254 -- Runner Thread raised error.
(pid=1741) Traceback (most recent call last):
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
(pid=1741)     self._entrypoint()
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
(pid=1741)     self._status_reporter.get_checkpoint())
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
(pid=1741)     output = fn()
(pid=1741)   File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
(pid=1741)     return self.numpy()
(pid=1741) TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741) Exception in thread Thread-2:
(pid=1741) Traceback (most recent call last):
(pid=1741)   File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
(pid=1741)     self.run()
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 267, in run
(pid=1741)     raise e
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
(pid=1741)     self._entrypoint()
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
(pid=1741)     self._status_reporter.get_checkpoint())
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
(pid=1741)     output = fn()
(pid=1741)   File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
(pid=1741)   File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
(pid=1741)     return self.numpy()
(pid=1741) TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741) 
2021-05-19 12:06:55,747	ERROR trial_runner.py:732 -- Trial DEFAULT_b0772_00000: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 702, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 686, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1481, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train_buffered() (pid=1741, ip=172.28.0.2)
  File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 173, in train_buffered
    result = self.train()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 232, in train
    result = self.step()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 366, in step
    self._report_thread_runner_error(block=True)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 513, in _report_thread_runner_error
    ("Trial raised an exception. Traceback:\n{}".format(err_tb_str)
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train_buffered() (pid=1741, ip=172.28.0.2)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 248, in run
    self._entrypoint()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 316, in entrypoint
    self._status_reporter.get_checkpoint())
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 580, in _trainable_func
    output = fn()
  File "<ipython-input-198-4b7222b1d1ac>", line 58, in custom_train_part
  File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 621, in __array__
    return self.numpy()
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
(pid=1741) Running loss: 17.041641235351562
(pid=1741) Epoch: 0, loss = 17.041641235351562.4f
(pid=1741) 
Result for DEFAULT_b0772_00000:
  {}
  
== Status ==
Memory usage on this node: 7.1/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/1 GPUs, 0.0/14.99 GiB heap, 0.0/7.5 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /root/ray_results/DEFAULT_2021-05-19_12-06-48
Number of trials: 1/1 (1 ERROR)
+---------------------+----------+-------+----------+----------+-------------+
| Trial name          | status   | loc   |   Hidden |   Layers |          lr |
|---------------------+----------+-------+----------+----------+-------------|
| DEFAULT_b0772_00000 | ERROR    |       |       75 |        1 | 4.51358e-05 |
+---------------------+----------+-------+----------+----------+-------------+
Number of errored trials: 1
+---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------+
| Trial name          |   # failures | error file                                                                                                                         |
|---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------|
| DEFAULT_b0772_00000 |            1 | /root/ray_results/DEFAULT_2021-05-19_12-06-48/DEFAULT_b0772_00000_0_Hidden=75,Layers=1,lr=4.5136e-05_2021-05-19_12-06-49/error.txt |
+---------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------+

---------------------------------------------------------------------------
TuneError                                 Traceback (most recent call last)
<ipython-input-198-4b7222b1d1ac> in <module>()
     95     num_samples=1,
     96     scheduler = scheduler,
---> 97     progress_reporter=reporter
     98 )
     99 

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, loggers, ray_auto_init, run_errored_only, global_checkpoint_period, with_server, upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint, _remote)
    541     if incomplete_trials:
    542         if raise_on_failed_trial and not state[signal.SIGINT]:
--> 543             raise TuneError("Trials did not complete", incomplete_trials)
    544         else:
    545             logger.error("Trials did not complete: %s", incomplete_trials)

TuneError: ('Trials did not complete', [DEFAULT_b0772_00000])

It seems the code is failing with:

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

while trying to call .numpy() on a tensor that is still on the GPU, so you need to move it to the CPU first.
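
For example, assuming the failing line is the np.sign comparison in your validation block (the traceback only shows the notebook line number), you can either keep the comparison in PyTorch or copy the tensors to the CPU before handing them to NumPy:

# Option 1: stay in PyTorch so nothing has to leave the GPU
correct += (torch.sign(scores) == torch.sign(y)).sum().item()

# Option 2: move the tensors to host memory before using NumPy
correct += (np.sign(scores.cpu().numpy()) == np.sign(y.cpu().numpy())).sum().item()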

You should also consider fixing this warning:

UserWarning: Using a target size (torch.Size([4208, 1])) that is different to the input size (torch.Size([4208, 75])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
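
That warning means F.mse_loss is broadcasting the [4208, 1] target against the [4208, 75] model output, so every one of the 75 output columns gets compared to the same label. A small illustration with placeholder tensors of those shapes:

import torch
import torch.nn.functional as F

scores = torch.randn(4208, 75)  # model output: one value per column
y = torch.randn(4208, 1)        # target: one value per sample

# The target is silently broadcast to (4208, 75) here, which is almost
# certainly not the loss you intend; the model should return a (4208, 1)
# prediction so the shapes match exactly.
loss = F.mse_loss(scores, y)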

Hello,

Thanks for responding. I managed to fix the first error, but the second warning remains because I am using an LSTM where the input is the 75 previous values in the series and the output is a prediction of the next point.
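
If the model should take a window of past values and predict the single next point, one common pattern is to return only the last time step of the LSTM output, projected through a linear layer, so the forward pass gives a [batch, 1] tensor that matches the [batch, 1] target. A minimal sketch, assuming a batch-first input of shape [batch, seq_len, input_size] (the LSTMRegressor class below is illustrative, not the LSTM class from the code above):

import torch
import torch.nn as nn

class LSTMRegressor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: [batch, seq_len, input_size]
        out, _ = self.lstm(x)   # out: [batch, seq_len, hidden_size]
        last = out[:, -1, :]    # keep only the final time step
        return self.fc(last)    # [batch, output_size], matching a [batch, 1] target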