cc @ptrblck I have a question regarding PyTorch tensor memory usage: two designs that should be functionally equivalent consume drastically different amounts of CPU memory (I have not tried GPU memory yet).
Below are two implementations of a replay buffer used in RL.
Implementation 1 uses 4.094 GiB of memory and creates 20003 tensors in total:
from time import sleep
import gc

import torch as t

if __name__ == "__main__":
    buffer = []
    state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
    for i in range(10000):
        old_state = state
        state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
        buffer.append(
            {"state": {"state": old_state},
             "action": {"action": t.zeros([1, 1])},
             "next_state": {"state": state},
             "reward": 0.0,
             "terminal": False}
        )

    # count all tensors still tracked by the garbage collector
    count = 0
    for obj in gc.get_objects():
        try:
            if t.is_tensor(obj) or (
                    hasattr(obj, 'data') and t.is_tensor(obj.data)):
                count += 1
        except Exception:
            continue
    print(count)
    sleep(20)  # keep the process alive so its memory usage can be observed
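(The memory figures are what I read for the process while the script sleeps at the end; roughly the same number can be printed from inside the script, assuming psutil is available:)

import os
import psutil

# resident set size of the current process, in GiB
rss = psutil.Process(os.getpid()).memory_info().rss
print(f"RSS: {rss / 2**30:.3f} GiB")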
Implementation 2 uses 7.561 GiB of memory and creates 20003 tensors in total:
from time import sleep
from copy import deepcopy
import gc

import torch as t

if __name__ == "__main__":
    buffer = []
    state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
    for i in range(10000):
        old_state = state
        state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
        transition = {"state": {"state": old_state},
                      "action": {"action": t.zeros([1, 1])},
                      "next_state": {"state": state},
                      "reward": 0.0,
                      "terminal": False}
        # some processing
        transition["state"]["state"] = transition["state"]["state"].detach()
        transition["action"]["action"] = transition["action"]["action"].detach()
        transition["next_state"]["state"] = transition["next_state"]["state"].detach()
        if i > 0:
            last_transition = buffer[i - 1]
            if transition["state"]["state"].equal(
                    last_transition["next_state"]["state"]):
                # store a reference instead of deep-copying
                transition["state"] = last_transition["next_state"]
            else:
                transition["state"] = deepcopy(transition["state"])
            transition["action"] = deepcopy(transition["action"])
            transition["next_state"] = deepcopy(transition["next_state"])
        else:
            transition = deepcopy(transition)
        buffer.append(transition)

    # count all tensors still tracked by the garbage collector
    count = 0
    for obj in gc.get_objects():
        try:
            if t.is_tensor(obj) or (
                    hasattr(obj, 'data') and t.is_tensor(obj.data)):
                count += 1
        except Exception:
            continue
    print(count)
    sleep(20)  # keep the process alive so its memory usage can be observed
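(Side note: the detach() calls in the processing step do not copy any data by themselves; a detached tensor shares storage with the original, so the actual copies here are made by deepcopy:)

import torch as t

a = t.zeros([1, 8, 224, 224], dtype=t.uint8)
b = a.detach()
# detach() returns a new tensor sharing the same underlying memory
print(a.data_ptr() == b.data_ptr())  # True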
I suppose the difference is caused by deepcopy copying the storage used by the tensors; however, it also seems that PyTorch will not release / reuse that storage afterwards. The code below (implementation 2 plus 10000 fresh tensors allocated at the end) takes 9.504 GiB of memory and creates 30003 tensors:
from time import sleep
from copy import deepcopy
import gc

import torch as t

buffer = []
state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
for i in range(10000):
    old_state = state
    state = t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8)
    transition = {"state": {"state": old_state},
                  "action": {"action": t.zeros([1, 1])},
                  "next_state": {"state": state},
                  "reward": 0.0,
                  "terminal": False}
    # some processing
    transition["state"]["state"] = transition["state"]["state"].detach()
    transition["action"]["action"] = transition["action"]["action"].detach()
    transition["next_state"]["state"] = transition["next_state"]["state"].detach()
    if i > 0:
        last_transition = buffer[i - 1]
        if transition["state"]["state"].equal(
                last_transition["next_state"]["state"]):
            # store a reference instead of deep-copying
            transition["state"] = last_transition["next_state"]
        else:
            transition["state"] = deepcopy(transition["state"])
        transition["action"] = deepcopy(transition["action"])
        transition["next_state"] = deepcopy(transition["next_state"])
    else:
        transition = deepcopy(transition)
    buffer.append(transition)

# allocate 10000 fresh tensors after the buffer is built; if the storage
# freed during the loop were reused, memory usage should not grow by much
x = [t.randint(0, 255, [1, 8, 224, 224], dtype=t.uint8) for _ in range(10000)]

# count all tensors still tracked by the garbage collector
count = 0
for obj in gc.get_objects():
    try:
        if t.is_tensor(obj) or (
                hasattr(obj, 'data') and t.is_tensor(obj.data)):
            count += 1
    except Exception:
        continue
print(count)
sleep(20)  # keep the process alive so its memory usage can be observed
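As for the deepcopy part: deepcopy works at the storage level and copies the whole underlying storage, even when the tensor only views part of it, while clone() copies just the viewed data; a minimal check:

from copy import deepcopy
import torch as t

base = t.zeros(1000000, dtype=t.uint8)
view = base[:10]                         # small view into a large storage
print(deepcopy(view).storage().size())   # 1000000: the whole storage is copied
print(view.clone().storage().size())     # 10: only the viewed data is copied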
I wonder: is there any way to reduce memory usage if I want to deepcopy a tensor, then discard the original and keep only the copy, i.e. bring the memory usage of implementation 2 down to that of implementation 1?
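For example, something along these lines, where copy_tensors is just a hypothetical sketch assuming clone() is an acceptable stand-in for deepcopy on plain tensors, and the originals are dropped afterwards so that only the copies stay alive:

from copy import deepcopy
import torch as t

def copy_tensors(obj):
    # hypothetical helper: clone tensors (allocating exactly one fresh
    # storage per tensor) and recurse into dicts; fall back to deepcopy
    # for everything else
    if t.is_tensor(obj):
        return obj.detach().clone()
    if isinstance(obj, dict):
        return {k: copy_tensors(v) for k, v in obj.items()}
    return deepcopy(obj)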