Out of Memory in Multi-GPU Model Parallelization

I am facing an “Out of memory” error when trying to load 100 instances of MySmallModel on “cuda:0”.

I have two GPUs with 12 GB each.

RuntimeError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 11.91 GiB total capacity; 11.18 GiB already allocated; 19.38 MiB free; 34.67 MiB cached)

Can anyone please help me solve this problem? How can I divide the “self.fc1 = nn.Linear(70000, 3000)” layer into multiple smaller pieces and still do the same computation?
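
To make the question concrete, this is the kind of split I have in mind for fc1 (just a minimal sketch of my idea, assuming the 3000 output units can be cut column-wise into two 1500-unit halves, one per GPU):

import torch
import torch.nn as nn

# Sketch: split nn.Linear(70000, 3000) into two 1500-unit halves,
# one per GPU, and concatenate the partial outputs again.
class SplitFC1(nn.Module):
    def __init__(self):
        super(SplitFC1, self).__init__()
        self.half_a = nn.Linear(70000, 1500).cuda(0)
        self.half_b = nn.Linear(70000, 1500).cuda(1)

    def forward(self, x):
        a = self.half_a(x.cuda(0))
        b = self.half_b(x.cuda(1))
        # gather both halves on GPU 0; same shape as the original fc1 output
        return torch.cat([a, b.cuda(0)], dim=1)

Is this the right way to do it? My current code is below.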

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class MySmallModel(nn.Module):
    def __init__(self):
        super(MySmallModel, self).__init__()
        self.fc1 = nn.Linear(70000, 3000)
        self.fc3 = nn.Linear(3000, 1000)

        self.fc1.cuda(0)  # first layer lives on GPU 0
        self.fc3.cuda(1)  # second layer lives on GPU 1

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = x.cuda(1)  # move the activations to GPU 1 for fc3
        x = F.relu(self.fc3(x))
        return x

class Classifier(nn.Module):
    def __init__(self, input_nodes):
        super(Classifier, self).__init__()

        self.networks = nn.ModuleList([MySmallModel() for _ in range(100)])

        # 100 sub-networks x 1000 features each = 100000 inputs after concatenation
        self.sharedlayer = nn.Sequential(
            nn.Linear(100000, 300),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.ReLU(),
        )
        self.sharedlayer.cuda(1)
   
    def forward(self, input_):
        # run the same input through every sub-network and collect the outputs
        x_list = [net(input_) for net in self.networks]
        x = torch.cat(x_list, 1)

        x = x.cuda(1)  # the sub-network outputs already end up on cuda:1
        h_shared = self.sharedlayer(x)

        return h_shared
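
Alternatively, I am wondering whether it would help to spread the 100 sub-networks over both GPUs instead of putting every fc1 on cuda:0. Something like this (the device argument is a hypothetical change, not in my code above):

# Hypothetical variant: keep each sub-network on a single GPU and
# alternate the device, so the parameters are split across both cards.
class MySmallModelOnDevice(nn.Module):
    def __init__(self, device):
        super(MySmallModelOnDevice, self).__init__()
        self.device = device
        self.fc1 = nn.Linear(70000, 3000).cuda(device)
        self.fc3 = nn.Linear(3000, 1000).cuda(device)

    def forward(self, x):
        x = F.relu(self.fc1(x.cuda(self.device)))
        return F.relu(self.fc3(x))

# and in Classifier.__init__:
# self.networks = nn.ModuleList([MySmallModelOnDevice(i % 2) for i in range(100)])
# (Classifier.forward would then have to move each output to one GPU before
#  torch.cat, e.g. x_list = [net(input_).cuda(1) for net in self.networks])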

================================================
criterion = nn.MSELoss()
model = Classifier(input_nodes)
optimizer = optim.SGD(model.parameters(), lr=0.01)

trainloader = torch.utils.data.DataLoader(NN_data_train, batch_size=1, shuffle=True)
for epoch in range(n_epochs):
    running_loss = 0
    model.train()
    for data, label in trainloader:
        output = model(data.cuda(0))  # final output is already on cuda:1
        loss = criterion(output, label.cuda(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

I have already reduced the batch size to 1, but I still get the same memory problem.
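
For what it's worth, my back-of-envelope estimate of the parameter memory (my own arithmetic, fp32 weights only, gradients and optimizer state not counted) suggests the batch size is not the bottleneck:

# fc1 is by far the largest layer: 70000 x 3000 weights per sub-network
fc1_weights = 70_000 * 3_000          # 210M parameters
total_bytes = 100 * fc1_weights * 4   # 100 copies on cuda:0, 4 bytes per fp32
print(total_bytes / 1024**3)          # ~78 GiB, versus ~12 GiB per card

If that is right, the 100 fc1 layers alone need far more memory than both cards together, so I really need a way to split or shrink them.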
Please help.