Hi, I am using FSDP and I am running into the following error:

RuntimeError: setStorage: sizes [1, 1, 16], strides [16, 16, 1], storage offset 160, and itemsize 4 requiring a storage size of 704 are out of bounds for storage of size 0

I tested the code with both pytorch==2.3.1 and pytorch==1.12.1 and got the same error with both versions. Could you help me debug this problem? A code snippet that reproduces the error is shown below:
import os
import sys
import argparse
import torch
import functools
import torch.optim as optim
import torch.nn as nn
from torch.distributed import init_process_group
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local-rank', type=int, default=-1)
    parser.add_argument('--master-port', type=int, default=-1)
    return parser.parse_args()
class Learner(nn.Module):
    def __init__(self, depth=3):
        super().__init__()
        vec_lst = []
        for _ in range(depth):
            vec_lst.append(
                nn.ParameterList([
                    nn.Parameter(torch.rand(c + 1, 16)) for c in range(4)
                ])
            )
        # Nested ParameterList: `depth` groups of 4 parameter tensors each.
        self.vec_lst = nn.ParameterList(vec_lst)

    def forward(self):
        return self.vec_lst
class FreezeModel(nn.Module):
    # Parameter-free module that adds the learner's vectors (group self.depth) to its input.
    def __init__(self):
        super().__init__()
        self.depth = 1

    def forward(self, x, vec_lst):
        bs = x.size(0)
        vec = vec_lst[self.depth]
        for i in range(4):
            x = x + vec[i].unsqueeze(0).expand(bs, -1, -1)
        return x
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.learner = Learner()
        self.freezed_model = FreezeModel()

    def forward(self, x):
        vec_lst = self.learner()
        x = self.freezed_model(x, vec_lst)
        x = x[:, 0, :]
        return x
def trainer_policy_fn(module):
    return isinstance(module, Learner)
def main():
    args = parse_arguments()
    torch.cuda.set_device(args.local_rank)
    init_process_group(
        init_method='env://',
        backend='nccl',
    )

    model = MyModel()
    # Mark the frozen part's parameters as not requiring gradients.
    for param in model.freezed_model.parameters():
        param.requires_grad = False

    cur_device = torch.cuda.current_device()
    # Wrap only the Learner submodule in its own FSDP unit.
    my_auto_wrap_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=trainer_policy_fn)
    model = FSDP(
        model,
        auto_wrap_policy=my_auto_wrap_policy,
        device_id=cur_device,
        use_orig_params=False,
        limit_all_gathers=True,
    )

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    input_tensor = torch.randn(4, 1, 16).cuda()
    target_tensor = torch.randn(4, 16).cuda(non_blocking=True)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        optimizer.zero_grad()
        output = model(input_tensor)
        loss = criterion(output, target_tensor)
        loss.backward()
        optimizer.step()


if __name__ == '__main__':
    main()
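For completeness, the script has to be started with a distributed launcher so that --local-rank is passed to it and the env:// rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) are set. A command along these lines should work, where repro.py and the number of GPUs are just placeholders:

python -m torch.distributed.launch --nproc_per_node=2 repro.py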