RuntimeError: CUDA out of memory when loading and training a pretrained model

I got the following error: RuntimeError: CUDA out of memory. Tried to allocate 66.00 MiB (GPU 1; 10.76 GiB total capacity; 9.67 GiB already allocated; 25.44 MiB free; 9.86 GiB reserved in total by PyTorch)
What should I do? :frowning:

The code for the save and load functions is:

    def save(self):
        # NOTE(review): this snippet appears to have been damaged when it was pasted. The two
        # lines below read like docstring text whose triple-quote delimiters are missing;
        # restore the delimiters before running this code.
        Saves the current model and related training parameters into a subdirectory of the checkpoint directory.
        The name of the subdirectory is the current local time in Y_M_D_H_M_S format.
        # Checkpoint directory: <SAVE_PATH>/<CHECKPOINT_DIR_NAME>/<local timestamp>.
        date_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
        path = os.path.join(self.SAVE_PATH, self.CHECKPOINT_DIR_NAME, date_time)

        # Start from a clean directory if one with the same timestamp already exists.
        if os.path.exists(path):
            shutil.rmtree(path)  # delete path dir & sub-files

        # NOTE(review): the dict holds the optimizer object itself (self.optimizer), not its
        # state_dict(). The closing line of the dict has unbalanced parentheses, so the call
        # that consumed it (presumably a save call -- TODO confirm against the original
        # repository) seems to have been lost in the paste.
        trainer_states = {
            'optimizer': self.optimizer,
            'trainset_list': self.trainset_list,
            'validset': self.validset,
            'epoch': self.epoch
        }, os.path.join(path, self.TRAINER_STATE_NAME)), os.path.join(path, self.MODEL_NAME)), os.path.join(path, '')) ##{
            'model': self.model.state_dict(),
            'optimizer': self.optimizer,
            'trainset_list': self.trainset_list,
            'validset': self.validset,
            'epoch': self.epoch
        }, os.path.join(path, 'all.tar')) ##'save checkpoints\n%s\n%s'
                    % (os.path.join(path, self.TRAINER_STATE_NAME),
                       os.path.join(path, self.MODEL_NAME)))

    def load(self, path, first=True):
        Loads a Checkpoint object that was previously saved to disk.

            path (str): path to the checkpoint subdirectory

            checkpoint (Checkpoint): checkpoint object with fields copied from those stored on disk
       """'load checkpoints\n%s\n%s'
                    % (os.path.join(path, self.TRAINER_STATE_NAME),
                       os.path.join(path, self.MODEL_NAME)))

        if torch.cuda.is_available():
            resume_checkpoint = torch.load(os.path.join(path, self.TRAINER_STATE_NAME)) 
            model = torch.load(os.path.join(path, self.MODEL_NAME))
            checkpoint = torch.load(os.path.join(path, 'all.tar')) ##
            model.load_state_dict(checkpoint['model']) ##

            resume_checkpoint = torch.load(os.path.join(path, self.TRAINER_STATE_NAME), map_location=lambda storage, loc: storage)#
            model = torch.load(os.path.join(path, self.MODEL_NAME), map_location=lambda storage, loc: storage)

        if isinstance(model, SpeechSeq2seq):
            if isinstance(model, nn.DataParallel):
                model.module.flatten_parameters()  # make RNN parameters contiguous

        return Checkpoint(
            model=model, optimizer=resume_checkpoint['optimizer'], epoch=resume_checkpoint['epoch'],

I used the code from this GitHub repository (GitHub - sooftware/kospeech: Open-Source Toolkit for End-to-End Korean Automatic Speech Recognition leveraging PyTorch and Hydra.)

Traceback (most recent call last):
  File "./", line 113, in <module>
  File "./", line 109, in main
  File "./", line 87, in train
  File "../kospeech/trainer/", line 147, in train
    train_queue, teacher_forcing_ratio)
  File "../kospeech/trainer/", line 232, in __train_epoches
    targets=targets, teacher_forcing_ratio=teacher_forcing_ratio)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/modules/", line 550, in __$
    result = self.forward(*input, **kwargs)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/parallel/", line 15$
, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/parallel/", line 165
, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/parallel/", line 85
, in parallel_apply
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/", line 395, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 1 on device 1.                                  [6/1849]
Original Traceback (most recent call last):
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/parallel/", line 60
, in _worker
    output = module(*input, **kwargs)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/modules/", line 550, in __c
    result = self.forward(*input, **kwargs)
  File "../kospeech/models/acoustic/seq2seq/", line 51, in forward
    result = self.decoder(targets, output, teacher_forcing_ratio, return_decode_dict)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/modules/", line 550, in __c
    result = self.forward(*input, **kwargs)
  File "../kospeech/models/acoustic/seq2seq/", line 175, in forward
    step_output, hidden, attn = self.forward_step(input_var, hidden, encoder_outputs, attn)
  File "../kospeech/models/acoustic/seq2seq/", line 126, in forward_step
    context, attn = self.attention(output, encoder_outputs, encoder_outputs)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/modules/", line 550, in __c
    result = self.forward(*input, **kwargs)
  File "../kospeech/models/acoustic/transformer/", line 22, in forward
    output = self.sublayer(*args)
  File "/home/stt_py/.local/lib/python3.6/site-packages/torch/nn/modules/", line 550, in __c
    result = self.forward(*input, **kwargs)
  File "../kospeech/models/", line 96, in forward
    key = key.permute(2, 0, 1, 3).contiguous().view(batch_size * self.num_heads, -1, self.d_head)
  # BNxK_LENxD
RuntimeError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 1; 10.76 GiB total capacity; 9.56
GiB already allocated; 22.44 MiB free; 9.86 GiB reserved in total by PyTorch)

Try decreasing the batch size, and make sure that no tensors attached to the computation graph are stored across iterations (for example, accumulate `loss.item()` rather than `loss` itself).
Also, you could try using torch.utils.checkpoint to trade compute for memory.