I am trying to use hyperopt to tune my PyTorch model. The first few trials run fine, but after a few epochs I get the following `RuntimeError: CUDA error: device-side assert triggered`:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-1973a9b929e7> in <module>
14 max_evals = 15,
15 timeout = 8.9 * 60 * 60,
---> 16 trials = trials
17 )
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar)
480 catch_eval_exceptions=catch_eval_exceptions,
481 return_argmin=return_argmin,
--> 482 show_progressbar=show_progressbar,
483 )
484
/opt/conda/lib/python3.7/site-packages/hyperopt/base.py in fmin(self, fn, space, algo, max_evals, timeout, loss_threshold, max_queue_len, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin, show_progressbar)
684 catch_eval_exceptions=catch_eval_exceptions,
685 return_argmin=return_argmin,
--> 686 show_progressbar=show_progressbar,
687 )
688
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar)
507
508 # next line is where the fmin is actually executed
--> 509 rval.exhaust()
510
511 if return_argmin:
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in exhaust(self)
328 def exhaust(self):
329 n_done = len(self.trials)
--> 330 self.run(self.max_evals - n_done, block_until_done=self.asynchronous)
331 self.trials.refresh()
332 return self
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in run(self, N, block_until_done)
284 else:
285 # -- loop over trials and do the jobs directly
--> 286 self.serial_evaluate()
287
288 self.trials.refresh()
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in serial_evaluate(self, N)
163 ctrl = base.Ctrl(self.trials, current_trial=trial)
164 try:
--> 165 result = self.domain.evaluate(spec, ctrl)
166 except Exception as e:
167 logger.error("job exception: %s" % str(e))
/opt/conda/lib/python3.7/site-packages/hyperopt/base.py in evaluate(self, config, ctrl, attach_attachments)
892 print_node_on_error=self.rec_eval_print_node_on_error,
893 )
--> 894 rval = self.fn(pyll_rval)
895
896 if isinstance(rval, (float, int, np.number)):
<ipython-input-13-ee565489d39e> in tune(params)
11 dropouts.append(params['dropout_rate_3'])
12 weight_decay = 1e-5
---> 13 train_fn(hidden_layers, dropouts, weight_decay)
14 for seed in [1903]:
15 print(f"Inference for seed {seed}")
<ipython-input-11-1982693743a5> in train_fn(hidden_layers, dropouts, weight_decay)
45 if phase=='train':
46 loss.backward()
---> 47 optimizer.step()
48
49 running_loss += loss.item() / len(dataloaders[phase])
/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
13 def decorate_context(*args, **kwargs):
14 with self:
---> 15 return func(*args, **kwargs)
16 return decorate_context
17
/opt/conda/lib/python3.7/site-packages/torch/optim/adam.py in step(self, closure)
94
95 if group['weight_decay'] != 0:
---> 96 grad = grad.add(p, alpha=group['weight_decay'])
97
98 # Decay the first and second moment running average coefficient
RuntimeError: CUDA error: device-side assert triggered
I don't understand what this error means or what is causing it, since the same training code works for the first few trials. Any help is much appreciated!