Hey everyone, I’m getting a protocol error when iterating through the data loader. Something to do with the num_workers I think. The error is erratic i.e. sometimes it happens, sometimes it doesn’t. I have tried googling it but can’t find details on it, so hope to get some help here please.
This is my code:
def __getitem__(self, index):
pair_key = self.list_files[index]
pair = self.s3_client.list_objects(Bucket=self.bucket_name, Prefix=pair_key, Delimiter='/')
input_image_key = pair.get('Contents')[1].get('Key')
input_image_path = f's3://{self.bucket_name}/{input_image_key}'
input_image_s3_source = get_file_from_filepath(input_image_path)
pil_input_image = Image.open(input_image_s3_source)
target_image_key = pair.get('Contents')[0].get('Key')
target_image_path = f's3://{self.bucket_name}/{target_image_key}'
target_image_s3_source = get_file_from_filepath(target_image_path)
pil_target_image = Image.open(target_image_s3_source)
input_image = self.transform(pil_input_image)
target_image = self.transform(pil_target_image)
return input_image, target_image
def main():
train_loader = DataLoader(
train_dataset,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.num_workers,
drop_last=True,
pin_memory=True
)
print("Length train loader ",len(train_loader))
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, pin_memory=True, drop_last = True)
def train_fn():
loop = tqdm(loader, leave=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
for idx, (x, y) in enumerate(loop):
...
This is the error message:
Traceback (most recent call last):
File "main.py", line 623, in <module>
g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
File "main.py", line 223, in train_fn
for idx, (x, y) in enumerate(loop):
File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
(data, worker_id) = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
return (self._process_data(data), w_id)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1299, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
urllib3.exceptions.ProtocolError: Caught ProtocolError in DataLoader worker process 3.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 436, in _error_catcher
yield
File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 518, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/opt/conda/lib/python3.6/http/client.py", line 463, in read
n = self.readinto(b)
File "/opt/conda/lib/python3.6/http/client.py", line 507, in readinto
n = self.fp.readinto(b)
File "/opt/conda/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "/opt/conda/lib/python3.6/ssl.py", line 1012, in recv_into
return self.read(nbytes, buffer)
File "/opt/conda/lib/python3.6/ssl.py", line 874, in read
return self._sslobj.read(len, buffer)
File "/opt/conda/lib/python3.6/ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 210, in _worker_loop
data = fetcher.fetch(index)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/ml/code/ImageDataset.py", line 137, in __getitem__
pil_target_image = Image.open(target_image_s3_source)
File "/opt/conda/lib/python3.6/site-packages/PIL/Image.py", line 2984, in open
prefix = fp.read(16)
File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 511, in read
self._fill_buffer(size)
File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 622, in _fill_buffer
bytes_read = self._buffer.fill(self._raw_reader)
File "/opt/conda/lib/python3.6/site-packages/smart_open/bytebuffer.py", line 152, in fill
new_bytes = source.read(size)
File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 423, in read
binary = self._read_from_body(size)
File "/opt/conda/lib/python3.6/site-packages/smart_open/s3.py", line 411, in _read_from_body
binary = self._body.read(size)
File "/opt/conda/lib/python3.6/site-packages/botocore/response.py", line 77, in read
chunk = self._raw_stream.read(amt)
File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 540, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File "/opt/conda/lib/python3.6/contextlib.py", line 99, in __exit__
self.gen.throw(type, value, traceback)
File "/opt/conda/lib/python3.6/site-packages/urllib3/response.py", line 454, in _error_catcher
raise ProtocolError("Connection broken: %r" % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))