Out of memory/illegal memory access in 2080


#1

I have some problems running the examples provided in the fastai library, so I posted on their forum. After searching here for a solution, I found torch.cuda.empty_cache(), but I still get the memory error, so that is why I'm coming here.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-8-cc9249814530> in <module>()
      1 torch.cuda.is_available()
----> 2 torch.cuda.empty_cache()

~/anaconda3/lib/python3.7/site-packages/torch/cuda/__init__.py in empty_cache()
    372     """
    373     if _initialized:
--> 374         torch._C._cuda_emptyCache()
    375 
    376 

RuntimeError: CUDA error: an illegal memory access was encountered

This is cross-posted from https://forums.fast.ai/t/cyfar-ipynb-cuda-runtime-error-77-an-illegal-memory-access/29649

Hi there, I just reinstalled my home PC to start all over again.

Here is the output of fastai.show_install():

=== Software === 
python version  : 3.7.0
fastai version  : 1.0.20.dev0
torch version   : 1.0.0.dev20181105
nvidia driver   : 410.73
torch cuda ver  : 9.2.148
torch cuda is   : available
torch cudnn ver : 7104
torch cudnn is  : enabled

=== Hardware === 
nvidia gpus     : 1
torch available : 1
  - gpu0        : 7949MB | GeForce RTX 2080

=== Environment === 
platform        : Linux-4.18.0-10-generic-x86_64-with-debian-buster-sid
distro          : Ubuntu 18.10 Cosmic Cuttlefish
conda env       : base
python          : /home/tyoc213/anaconda3/bin/python
sys.path        : 
/home/tyoc213/fastai/examples
/home/tyoc213/anaconda3/lib/python37.zip
/home/tyoc213/anaconda3/lib/python3.7
/home/tyoc213/anaconda3/lib/python3.7/lib-dynload
/home/tyoc213/anaconda3/lib/python3.7/site-packages
/home/tyoc213/fastai
/home/tyoc213/anaconda3/lib/python3.7/site-packages/IPython/extensions
/home/tyoc213/.ipython

collab.ipynb works OK, but when stepping through the cifar notebook in fastai/examples I get an error executing these lines:

learn = Learner(data, wrn_22(), metrics=accuracy).to_fp16()
learn.fit_one_cycle(30, 3e-3, wd=0.4, div_factor=10, pct_start=0.5)

I get this output

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-14-72f1e2b0093b> in <module>()
----> 1 learn = Learner(data, wrn_22(), metrics=accuracy).to_fp16()
      2 learn.fit_one_cycle(30, 3e-3, wd=0.4, div_factor=10, pct_start=0.5)

<string> in __init__(self, data, model, opt_func, loss_func, metrics, true_wd, bn_wd, wd, train_bn, path, model_dir, callback_fns, callbacks, layer_groups)

~/fastai/fastai/basic_train.py in __post_init__(self)
    136         self.path = Path(ifnone(self.path, self.data.path))
    137         (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)
--> 138         self.model = self.model.to(self.data.device)
    139         self.loss_func = ifnone(self.loss_func, self.data.loss_func)
    140         self.metrics=listify(self.metrics)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
    377             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    378 
--> 379         return self._apply(convert)
    380 
    381     def register_backward_hook(self, hook):

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    183     def _apply(self, fn):
    184         for module in self.children():
--> 185             module._apply(fn)
    186 
    187         for param in self._parameters.values():

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    183     def _apply(self, fn):
    184         for module in self.children():
--> 185             module._apply(fn)
    186 
    187         for param in self._parameters.values():

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    189                 # Tensors stored in modules are graph leaves, and we don't
    190                 # want to create copy nodes, so we have to unpack the data.
--> 191                 param.data = fn(param.data)
    192                 if param._grad is not None:
    193                     param._grad.data = fn(param._grad.data)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in convert(t)
    375 
    376         def convert(t):
--> 377             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    378 
    379         return self._apply(convert)

RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /opt/conda/conda-bld/pytorch-nightly_1541411195070/work/aten/src/THC/generic/THCTensorCopy.cpp:20

Note that running torch.cuda.is_available() returns True.

Update: extra tests

I'm also running out of memory in dogs_cats.ipynb.

learn = create_cnn(data, models.resnet34, metrics=accuracy)

learn.fit_one_cycle(1)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-9-6ec085df1eed> in <module>()
----> 1 learn = create_cnn(data, models.resnet34, metrics=accuracy)
      2 learn.fit_one_cycle(1)

~/fastai/fastai/vision/learner.py in create_cnn(data, arch, cut, pretrained, lin_ftrs, ps, custom_head, split_on, classification, **kwargs)
     67     learn.split(ifnone(split_on,meta['split']))
     68     if pretrained: learn.freeze()
---> 69     apply_init(model[1], nn.init.kaiming_normal_)
     70     return learn
     71 

~/fastai/fastai/torch_core.py in apply_init(m, init_func)
    193 def apply_init(m, init_func:LayerFunc):
    194     "Initialize all non-batchnorm layers of `m` with `init_func`."
--> 195     apply_leaf(m, partial(cond_init, init_func=init_func))
    196 
    197 def in_channels(m:nn.Module) -> List[int]:

~/fastai/fastai/torch_core.py in apply_leaf(m, f)
    189     c = children(m)
    190     if isinstance(m, nn.Module): f(m)
--> 191     for l in c: apply_leaf(l,f)
    192 
    193 def apply_init(m, init_func:LayerFunc):

~/fastai/fastai/torch_core.py in apply_leaf(m, f)
    188     "Apply `f` to children of `m`."
    189     c = children(m)
--> 190     if isinstance(m, nn.Module): f(m)
    191     for l in c: apply_leaf(l,f)
    192 

~/fastai/fastai/torch_core.py in cond_init(m, init_func)
    183     if (not isinstance(m, bn_types)) and requires_grad(m):
    184         if hasattr(m, 'weight'): init_func(m.weight)
--> 185         if hasattr(m, 'bias') and hasattr(m.bias, 'data'): m.bias.data.fill_(0.)
    186 
    187 def apply_leaf(m:nn.Module, f:LayerFunc):

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1541411195070/work/aten/src/THC/generic/THCTensorMath.cu:14

I also get the CUDA out-of-memory error in the tabular example:

learn = get_tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)

output

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-5-480eb9caae1a> in <module>()
----> 1 learn = get_tabular_learner(data, layers=[200,100], metrics=accuracy)
      2 learn.fit(1, 1e-2)

~/fastai/fastai/tabular/data.py in get_tabular_learner(data, layers, emb_szs, metrics, ps, emb_drop, y_range, use_bn, **kwargs)
     93     model = TabularModel(emb_szs, len(data.cont_names), out_sz=data.c, layers=layers, ps=ps, emb_drop=emb_drop,
     94                          y_range=y_range, use_bn=use_bn)
---> 95     return Learner(data, model, metrics=metrics, **kwargs)
     96 

<string> in __init__(self, data, model, opt_func, loss_func, metrics, true_wd, bn_wd, wd, train_bn, path, model_dir, callback_fns, callbacks, layer_groups)

~/fastai/fastai/basic_train.py in __post_init__(self)
    136         self.path = Path(ifnone(self.path, self.data.path))
    137         (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)
--> 138         self.model = self.model.to(self.data.device)
    139         self.loss_func = ifnone(self.loss_func, self.data.loss_func)
    140         self.metrics=listify(self.metrics)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
    377             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    378 
--> 379         return self._apply(convert)
    380 
    381     def register_backward_hook(self, hook):

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    183     def _apply(self, fn):
    184         for module in self.children():
--> 185             module._apply(fn)
    186 
    187         for param in self._parameters.values():

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    183     def _apply(self, fn):
    184         for module in self.children():
--> 185             module._apply(fn)
    186 
    187         for param in self._parameters.values():

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    189                 # Tensors stored in modules are graph leaves, and we don't
    190                 # want to create copy nodes, so we have to unpack the data.
--> 191                 param.data = fn(param.data)
    192                 if param._grad is not None:
    193                     param._grad.data = fn(param._grad.data)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in convert(t)
    375 
    376         def convert(t):
--> 377             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    378 
    379         return self._apply(convert)

RuntimeError: CUDA error: out of memory