I got this error when using DataParallel on four GPUs, but the model runs fine on a single GPU. Oddly, the first two batches run fine on the four GPUs, and after that I get this error.
File "/opt/conda/lib/python3.7/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 652, in __next__
data = self._next_data()
File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1330, in _next_data
idx, data = self._get_data()
File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1296, in _get_data
success, data = self._try_get_data()
File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1134, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 303, in rebuild_storage_fd
shared_cache[fd_id(fd)] = StorageWeakRef(storage)
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 65, in __setitem__
self.free_dead_references()
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 70, in free_dead_references
if storage_ref.expired():
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 35, in expired
return torch.Storage._expired(self.cdata) # type: ignore[attr-defined]
File "/opt/conda/lib/python3.7/site-packages/torch/storage.py", line 753, in _expired
return eval(cls.__module__)._UntypedStorage._expired(*args, **kwargs)
AttributeError: module 'torch.cuda' has no attribute '_UntypedStorage'
I have tried multiple times and the error keeps showing. Thank you for your help.
When using DataParallel to train the model on more than one GPU, this error appears.
I followed PyTorch's recommendation and used DistributedDataParallel instead of DataParallel. Please follow the official tutorials to implement distributed data parallel.
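As a rough illustration of that recommendation, here is a minimal DistributedDataParallel sketch. The model, data, port, and two-process world size are placeholders, not part of the original answer; it uses the `gloo` backend so it runs on CPU, while real multi-GPU training would use `nccl` and move each replica to its own device:

```python
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

def run(rank, world_size):
    # Each spawned process joins the same process group.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    # "gloo" works on CPU; use "nccl" and per-rank devices for real GPU training.
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    model = torch.nn.Linear(10, 1)          # placeholder model
    ddp_model = DDP(model)                  # gradients are synchronized across ranks
    opt = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

    x = torch.randn(8, 10)                  # placeholder batch
    loss = ddp_model(x).sum()
    loss.backward()                         # all-reduce happens here
    opt.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2                          # one process per GPU in practice
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)
```

In practice each rank would also wrap its `DataLoader` with a `DistributedSampler` so every process sees a distinct shard of the dataset.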
I don't have an exact solution for this issue, but I will share what I did to work around it.
I got the same error, "AttributeError: module 'torch.cuda' has no attribute '_UntypedStorage'", in Colab while training YolactEdge. The same notebook had been working fine previously. I saw that the PyTorch version was 1.12, so I downgraded it to 1.8 just to give it a try.
I used the following commands:

```
%cd /usr/local
!rm cuda
!ln -s cuda-10.0 cuda
!nvcc --version
```
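For context, this `AttributeError` is associated with the 1.12.0 release, where the `_UntypedStorage` internals were reorganized; later releases no longer hit it. A small helper like the one below can flag the suspect version before training. The function name and the "1.12.0 only" assumption are mine, not from the original answer:

```python
import torch

def is_affected(ver: str) -> bool:
    """Return True for torch versions reported to hit the
    _UntypedStorage AttributeError.
    (Assumption: the bug is specific to the 1.12.0 release;
    local build suffixes like '+cu113' are ignored.)"""
    base = ver.split("+")[0]
    return base == "1.12.0"

# Print the installed version and whether it matches the buggy release.
print(torch.__version__,
      "affected" if is_affected(torch.__version__) else "looks ok")
```

If the check flags your environment, either downgrading (as above) or moving to a later release should make the error go away.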