I am trying to use hyperopt to tune my PyTorch model. The first few trials run fine, but after a few epochs I get the following `RuntimeError: CUDA error: device-side assert triggered`:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-1973a9b929e7> in <module>
14 max_evals = 15,
15 timeout = 8.9 * 60 * 60,
---> 16 trials = trials
17 )
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar)
480 catch_eval_exceptions=catch_eval_exceptions,
481 return_argmin=return_argmin,
--> 482 show_progressbar=show_progressbar,
483 )
484
/opt/conda/lib/python3.7/site-packages/hyperopt/base.py in fmin(self, fn, space, algo, max_evals, timeout, loss_threshold, max_queue_len, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin, show_progressbar)
684 catch_eval_exceptions=catch_eval_exceptions,
685 return_argmin=return_argmin,
--> 686 show_progressbar=show_progressbar,
687 )
688
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar)
507
508 # next line is where the fmin is actually executed
--> 509 rval.exhaust()
510
511 if return_argmin:
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in exhaust(self)
328 def exhaust(self):
329 n_done = len(self.trials)
--> 330 self.run(self.max_evals - n_done, block_until_done=self.asynchronous)
331 self.trials.refresh()
332 return self
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in run(self, N, block_until_done)
284 else:
285 # -- loop over trials and do the jobs directly
--> 286 self.serial_evaluate()
287
288 self.trials.refresh()
/opt/conda/lib/python3.7/site-packages/hyperopt/fmin.py in serial_evaluate(self, N)
163 ctrl = base.Ctrl(self.trials, current_trial=trial)
164 try:
--> 165 result = self.domain.evaluate(spec, ctrl)
166 except Exception as e:
167 logger.error("job exception: %s" % str(e))
/opt/conda/lib/python3.7/site-packages/hyperopt/base.py in evaluate(self, config, ctrl, attach_attachments)
892 print_node_on_error=self.rec_eval_print_node_on_error,
893 )
--> 894 rval = self.fn(pyll_rval)
895
896 if isinstance(rval, (float, int, np.number)):
<ipython-input-13-ee565489d39e> in tune(params)
11 dropouts.append(params['dropout_rate_3'])
12 weight_decay = 1e-5
---> 13 train_fn(hidden_layers, dropouts, weight_decay)
14 for seed in [1903]:
15 print(f"Inference for seed {seed}")
<ipython-input-11-1982693743a5> in train_fn(hidden_layers, dropouts, weight_decay)
45 if phase=='train':
46 loss.backward()
---> 47 optimizer.step()
48
49 running_loss += loss.item() / len(dataloaders[phase])
/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
13 def decorate_context(*args, **kwargs):
14 with self:
---> 15 return func(*args, **kwargs)
16 return decorate_context
17
/opt/conda/lib/python3.7/site-packages/torch/optim/adam.py in step(self, closure)
94
95 if group['weight_decay'] != 0:
---> 96 grad = grad.add(p, alpha=group['weight_decay'])
97
98 # Decay the first and second moment running average coefficient
RuntimeError: CUDA error: device-side assert triggered
I don't understand what this error means or what is causing it, since the same training code works for the first few trials. Any help is much appreciated!