Hi,
I am getting a MemoryError
when iterating over train_loader
in the following code snippet:
def optimizer_step():
    """Closure handed to ``optimizer.step`` (L-BFGS re-invokes it).

    Zeroes the gradients, streams one full pass over ``train_loader``
    accumulating gradients batch by batch via ``loss.backward()``, and
    returns the summed (detached) loss over all batches.

    Returns:
        torch.Tensor: scalar sum of the per-batch losses.
    """
    optimizer.zero_grad()
    losses = []
    print(h.heap())  # guppy heap snapshot for memory debugging
    for x_batch, y_batch in train_loader:
        prediction = model(x_batch)
        loss = loss_function(prediction, y_batch)
        print("Loss: ", loss.item())
        # Per-batch backward: gradients accumulate across the loop, which is
        # equivalent to backpropagating the sum of the batch losses.
        loss.backward()
        # detach() before storing: keeping every batch's autograd graph alive
        # until the end of the epoch retains all intermediate activations and
        # is a major source of memory growth. The graph is no longer needed
        # here because backward() has already run for this batch.
        losses.append(loss.detach().unsqueeze(0))
    losses = torch.cat(losses, dim=0)
    return torch.sum(losses)
The thing that’s odd is that I’m running 64-bit Python under Windows 10 with >30 GB of free RAM. When I run guppy’s hpy().heap()
I see this memory usage:
Partition of a set of 642221 objects. Total size = 72264693 bytes.
And here is the full traceback:
Traceback (most recent call last):
File "C:\Users\Gili\Documents\myproject\aggregator\src\main\python\com.mycompany.aggregator\pytorch-ours.py", line 181, in <module>
optimizer.step(optimizer_step)
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\optim\optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\autograd\grad_mode.py", line 28, in decorate_context
return func(*args, **kwargs)
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\optim\lbfgs.py", line 311, in step
orig_loss = closure()
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\autograd\grad_mode.py", line 28, in decorate_context
return func(*args, **kwargs)
File "C:\Users\Gili\Documents\myproject\aggregator\src\main\python\com.mycompany.aggregator\pytorch-ours.py", line 157, in optimizer_step
for x_batch, y_batch in train_loader:
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\utils\data\dataloader.py", line 354, in __iter__
self._iterator = self._get_iterator()
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\utils\data\dataloader.py", line 305, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "C:\Users\Gili\Documents\myproject\python\lib\site-packages\torch\utils\data\dataloader.py", line 918, in __init__
w.start()
File "C:\Python39\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\Python39\lib\multiprocessing\context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Python39\lib\multiprocessing\context.py", line 327, in _Popen
return Popen(process_obj)
File "C:\Python39\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
reduction.dump(process_obj, to_child)
File "C:\Python39\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
MemoryError
How do I go about debugging what is going on? I tried reducing num_workers to 1 and the problem still occurs. Setting num_workers to 0 avoids the problem, but this operation is computation-intensive, so I want to run it across multiple processes.