Getting urllib3 ProtocolError when iterating through the DataLoader

Hey everyone, I’m getting a ProtocolError when iterating through the DataLoader. I think it has something to do with num_workers. The error is erratic, i.e. sometimes it happens and sometimes it doesn’t. I’ve tried googling it but can’t find any details on it, so I’m hoping to get some help here please.

This is my code:

def __getitem__(self, index):
    pair_key = self.list_files[index]
    pair = self.s3_client.list_objects(Bucket=self.bucket_name, Prefix=pair_key, Delimiter='/')

    input_image_key = pair.get('Contents')[1].get('Key')
    input_image_path = f's3://{self.bucket_name}/{input_image_key}'
    input_image_s3_source = get_file_from_filepath(input_image_path)
    pil_input_image = Image.open(input_image_s3_source)

    target_image_key = pair.get('Contents')[0].get('Key')
    target_image_path = f's3://{self.bucket_name}/{target_image_key}'
    target_image_s3_source = get_file_from_filepath(target_image_path)
    pil_target_image = Image.open(target_image_s3_source)

    input_image = self.transform(pil_input_image)
    target_image = self.transform(pil_target_image)

    return input_image, target_image

def main():
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        drop_last=True,
        pin_memory=True
    )
    print("Length train loader ", len(train_loader))
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, pin_memory=True, drop_last=True)

def train_fn(loader):
    loop = tqdm(loader, leave=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    for idx, (x, y) in enumerate(loop):
        ...

This is the error message:

Traceback (most recent call last):
  File "main.py", line 623, in <module>
    g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
  File "main.py", line 223, in train_fn
    for idx, (x, y) in enumerate(loop):
  File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
    for obj in iterable:
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
    (data, worker_id) = self._next_data()
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
    return (self._process_data(data), w_id)
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1299, in _process_data
    data.reraise()
  File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
    raise self.exc_type(msg)
urllib3.exceptions.ProtocolError: Caught ProtocolError in DataLoader worker process 3.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 436, in _error_catcher
    yield
  File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 518, in read
    data = self._fp.read(amt) if not fp_closed else b""
  File "/opt/conda/lib/python3.6/http/client.py", line 463, in read
    n = self.readinto(b)
  File "/opt/conda/lib/python3.6/http/client.py", line 507, in readinto
    n = self.fp.readinto(b)
  File "/opt/conda/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
  File "/opt/conda/lib/python3.6/ssl.py", line 1012, in recv_into
    return self.read(nbytes, buffer)
  File "/opt/conda/lib/python3.6/ssl.py", line 874, in read
    return self._sslobj.read(len, buffer)
  File "/opt/conda/lib/python3.6/ssl.py", line 631, in read
    v = self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 210, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/ml/code/ImageDataset.py", line 137, in __getitem__
    pil_target_image = Image.open(target_image_s3_source)
  File "/opt/conda/lib/python3.6/site-packages/PIL/Image.py", line 2984, in open
    prefix = fp.read(16)
  File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 511, in read
    self._fill_buffer(size)
  File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 622, in _fill_buffer
    bytes_read = self._buffer.fill(self._raw_reader)
  File "/opt/conda/lib/python3.6/site-packages/smart_open/bytebuffer.py", line 152, in fill
    new_bytes = source.read(size)
  File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 423, in read
    binary = self._read_from_body(size)
  File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 411, in _read_from_body
    binary = self._body.read(size)
  File "/opt/conda/lib/python3.6/site-packages/botocore/response.py", line 77, in read
    chunk = self._raw_stream.read(amt)
  File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 540, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
  File "/opt/conda/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 454, in _error_catcher
    raise ProtocolError("Connection broken: %r" % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))

Just to add some details here: the line that actually triggers the error is the one that reads an image from the AWS S3 bucket, i.e. pil_target_image = Image.open(target_image_s3_source) in __getitem__ (line 137 of ImageDataset.py in the traceback).
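
In case it helps frame an answer, this is roughly the retry wrapper I've been considering putting around that read. It's just an untested sketch reusing the names from my __getitem__ above; get_file_from_filepath is my own helper, and the retry count / backoff values are arbitrary assumptions, not anything from a library:

import io
import time

from PIL import Image
from urllib3.exceptions import ProtocolError

def open_image_with_retry(path, retries=3, backoff=2.0):
    # Download the whole object with a few retries, then open it from memory,
    # so a connection reset mid-stream triggers another attempt instead of
    # killing the DataLoader worker.
    last_err = None
    for attempt in range(retries):
        try:
            source = get_file_from_filepath(path)  # my helper from the code above
            data = source.read()
            return Image.open(io.BytesIO(data))
        except (ProtocolError, ConnectionResetError) as err:
            last_err = err
            time.sleep(backoff * (attempt + 1))
    raise last_err

The idea would be to replace the two Image.open(...) calls in __getitem__ with this, but I'm not sure whether retrying is the right fix or just papering over the real problem.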

I’ve noticed a pattern: the error tends to happen when I run multiple ML training instances concurrently, e.g. 3 or 4 at the same time. Then 2 or 3 of the instances fail with this error, but at least 1 or 2 complete training. However, I’m not sure what to deduce from this…
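
The only other thing I can think of is that the boto3 client in my dataset is created once in __init__ and then shared across the forked DataLoader workers, which I've read can cause connection problems. Would lazily creating the client per worker be worth trying? Something like the untested sketch below (bucket_name / list_files / transform are just stand-ins for my real constructor arguments):

import boto3
from torch.utils.data import Dataset

class ImageDataset(Dataset):
    def __init__(self, bucket_name, list_files, transform):
        self.bucket_name = bucket_name
        self.list_files = list_files
        self.transform = transform
        self._s3_client = None  # not created here, so no S3 connection is shared across forks

    @property
    def s3_client(self):
        # Each DataLoader worker process creates its own client on first use
        # instead of inheriting the parent process's connection pool.
        if self._s3_client is None:
            self._s3_client = boto3.client('s3')
        return self._s3_client

The rest of __getitem__ would stay the same since it already goes through self.s3_client. Any advice on whether this (or something else entirely) is the right direction would be much appreciated.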