How do I feed batches of frame sequences into a CNN and LSTM?

I have a video dataset with thousands of videos, each containing 100 frames. The inputs are two videos, video_x and video_y; a batch of either has size [2, 100, 3, 600, 600], in the order [batches, frames, channels, height, width]. The output is also a video of size [2, 100, 3, 600, 600]. I used an off-the-shelf video data loader to extract the frames and feed them to the network.

I reduced the input from 5D to 4D by merging the batch and frame dimensions ([batches, frames, channels, height, width] to [batches*frames, channels, height, width]) so it can be fed into Conv2d, and at the end of the forward function I reshape it back to the original size. But after this reduction the input becomes [200, 3, 600, 600], and pushing 200 frames through the CNN at once is a huge burden for the network; it fails with this error:
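
Concretely, the reshape I mean looks like this (x stands for either input):

b, t, c, h, w = x.shape        # [2, 100, 3, 600, 600]
x = x.view(b * t, c, h, w)     # [200, 3, 600, 600], a valid Conv2d input
# ... conv layers ...
x = x.view(b, t, c, h, w)      # back to the original 5D shape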

“RuntimeError: DataLoader worker (pid 2002326) is killed by signal: Bus error. It is possible that dataloader’s workers are out of shared memory. Please try to raise your shared memory limit”.
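
From what I read, this error comes from the DataLoader worker processes running out of shared memory, not from the model itself. Workarounds I came across (environment-dependent, not verified here):

# load data in the main process instead of worker processes
DataLoader(dataset, batch_size=batch_size, num_workers=0)
# or, when training inside Docker, start the container with a larger
# shared-memory segment, e.g.  docker run --shm-size=8g ...

Still, the underlying problem seems to be the sheer size of each sample, which is what I want to fix.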

I am wondering how I can feed the frames into the network by splitting them into smaller batches: for example, instead of feeding all 100 frames per video at once, process 5 chunks of 20 frames. I removed the LSTM from my code for simplicity.
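
Something like the sketch below is what I have in mind (chunked_forward and chunk_size are hypothetical names, not part of my current code):

def chunked_forward(cnn, frames, chunk_size=20):
    # frames: [batches*frames, C, H, W]; run the conv stack over
    # chunk_size frames at a time and stitch the results back together
    outputs = [cnn(chunk) for chunk in torch.split(frames, chunk_size, dim=0)]
    return torch.cat(outputs, dim=0)

What I am unsure about is whether this actually reduces memory during training (the autograd graph still spans all chunks) and whether BatchNorm statistics computed over 20 frames behave like statistics over all 200 frames.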

import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
# datasets.VideoDataset and transforms.VideoFilePathToTensor come from the
# off-the-shelf video loader utilities mentioned above
import datasets
import transforms

num = 1000
epochs = 1
batch_size = 2
lr = 1e-5
weight_decay = 1e-7
num_filters = 16
kernel = 3
num_workers = 4

np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

torch.cuda.empty_cache()
os.environ['CUDA_LAUNCH_BLOCKING']="1"
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = True  # no effect while cudnn is disabled

def loader(idx, label, path0=os.getcwd()):
    # Map each label to the CSV file listing the corresponding video paths.
    # Note: idx is currently unused; each call returns the first batch of a
    # freshly shuffled DataLoader.
    csv_files = {'video_x': 'video_load_x.csv',
                 'video_y': 'video_load_y.csv',
                 'video_output': 'video_output.csv'}

    dataset = datasets.VideoDataset(
        os.path.join(path0, csv_files[label]),
        transform=torchvision.transforms.Compose([
            transforms.VideoFilePathToTensor(max_len=None, fps=1,
                                             padding_mode='last')]))
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=True)
    for videos in data_loader:
        return videos
    
class dataset(Dataset):
    def __init__(self, num, path0=os.getcwd(), train_ratio = 0.8, 
                 is_train=True, loader=loader):

        self.train_list, self.test_list = train_test_split(np.arange(num), 
                                                           train_size=train_ratio, 
                                                           random_state=42)
        self.is_train = is_train
        self.loader = loader
            
    def __len__(self):
        return len(self.train_list) if self.is_train else len(self.test_list)

    def __getitem__(self, idx):
        file_list = self.train_list if self.is_train else self.test_list
        fidx = file_list[idx]
        # [0] drops the extra batch dimension added by the inner loader;
        # permute reorders [C, T, H, W] -> [T, C, H, W] = [100, 3, 600, 600]
        video_x = self.loader(fidx, 'video_x')[0].permute(1, 0, 2, 3)
        video_y = self.loader(fidx, 'video_y')[0].permute(1, 0, 2, 3)
        output = self.loader(fidx, 'video_output')[0].permute(1, 0, 2, 3)
        return video_x, video_y, output
            
        
def criterion(y, y_pred, loss):
    return loss(y, y_pred)

class Model(pl.LightningModule):
    def __init__(self, num, kernel, num_filters, criterion=criterion):
        super(Model, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, num_filters, kernel_size=kernel*3, padding=4),  # 600x600 -> 600x600
            nn.BatchNorm2d(num_filters),
            nn.ReLU(inplace=True))

        self.conv2 = nn.Sequential(
            nn.Conv2d(num_filters, 32, kernel_size=kernel, stride=2, padding=1),  # 600x600 -> 300x300
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True))

        self.tsconv1 = nn.Sequential(
            nn.ConvTranspose2d(32, num_filters, kernel_size=kernel, padding=1),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),  # 300x300 -> 600x600
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(num_filters))

        self.tsconv2 = nn.Sequential(
            nn.Conv2d(num_filters, 3, kernel_size=kernel*3, padding=4, bias=True),
            nn.ReLU(inplace=True))
        
        self.num = num        
        self.mseloss = nn.MSELoss(reduction='mean')        
        self.criterion = criterion
        
    def forward(self,vidx,vidy):
      
        print('vidx:',vidx.shape)      #[2, 100, 3, 600, 600]
        print('vidy:',vidy.shape)      #[2, 100, 3, 600, 600]
              
        batch_size, timesteps, C, H, W = vidx.size()
        vidx = vidx.view(batch_size * timesteps, C, H, W)
        vidy = vidy.view(batch_size * timesteps, C, H, W)

        print('vidx_size:', vidx.shape)    # [200, 3, 600, 600]
        print('vidy_size:', vidy.shape)    # [200, 3, 600, 600]
                
        vidx = self.conv1(vidx)
        print('conv1-vx', vidx.shape)      # [200, 16, 600, 600]
        vidx = self.conv2(vidx)
        print('conv2-vx', vidx.shape)      # [200, 32, 300, 300]
        vidx = self.tsconv1(vidx)
        print('tconv1_vx', vidx.shape)     # [200, 16, 600, 600]
        vidx = self.tsconv2(vidx)
        print('tconv2_vx', vidx.shape)     # [200, 3, 600, 600]

        vidy = self.conv1(vidy)            # [200, 16, 600, 600]
        vidy = self.conv2(vidy)            # [200, 32, 300, 300]
        vidy = self.tsconv1(vidy)          # [200, 16, 600, 600]
        vidy = self.tsconv2(vidy)          # [200, 3, 600, 600]

        # Restore the 5D layout per branch, then concatenate along the time
        # dimension so that pred[:, :100] is the x branch and pred[:, 100:]
        # is the y branch of the same video (cat along dim=0 followed by a
        # view would mix frames from different videos).
        vidx = vidx.view(batch_size, timesteps, C, H, W)
        vidy = vidy.view(batch_size, timesteps, C, H, W)
        x = torch.cat((vidx, vidy), dim=1)  # [2, 200, 3, 600, 600]
        print('concat_x', x.shape)
        return x

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)       
    
    def training_step(self, batch, batch_nb):
        vidx, vidy, y = batch
        pred = self.forward(vidx, vidy)    # [2, 200, 3, 600, 600]
        loss_x = self.criterion(y, pred[:, :100], self.mseloss)   # x branch
        loss_y = self.criterion(y, pred[:, 100:], self.mseloss)   # y branch
        loss = (loss_x + loss_y) / 2
        self.log('train_loss_mse', loss, on_step=False, on_epoch=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_nb):
        vidx, vidy, y = batch
        pred = self.forward(vidx, vidy)
        print('pred', pred.shape)          # [2, 200, 3, 600, 600]
        loss_x = self.criterion(y, pred[:, :100], self.mseloss)   # x branch
        loss_y = self.criterion(y, pred[:, 100:], self.mseloss)   # y branch
        loss = (loss_x + loss_y) / 2
        self.log('val_loss_mse', loss, on_step=False, on_epoch=True, sync_dist=True, logger=True)
        return loss
   
    def train_dataloader(self):
        train_dataset = dataset(self.num, is_train=True)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, pin_memory=True) 
        return train_loader
    
    def val_dataloader(self):
        val_dataset = dataset(self.num, is_train=False)       
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                          num_workers=num_workers, pin_memory=True) 
        return val_loader

model = Model(num, kernel, num_filters)
trainer = pl.Trainer(max_epochs=epochs, progress_bar_refresh_rate=1, 
                    gpus=2,
                    distributed_backend='ddp',                  
                    benchmark=True,
                    sync_batchnorm=True,
                    precision=16)                                    
trainer.fit(model)
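
For completeness, this is how I imagine wiring the chunked helper into the model (hypothetical and untested; chunked_forward is the sketch from the top of the post):

    def forward(self, vidx, vidy):
        b, t, c, h, w = vidx.size()
        stack = nn.Sequential(self.conv1, self.conv2, self.tsconv1, self.tsconv2)
        vidx = chunked_forward(stack, vidx.view(b * t, c, h, w), chunk_size=20)
        vidy = chunked_forward(stack, vidy.view(b * t, c, h, w), chunk_size=20)
        # restore [2, 100, 3, 600, 600] per branch, then concatenate along time
        return torch.cat((vidx.view(b, t, c, h, w),
                          vidy.view(b, t, c, h, w)), dim=1)   # [2, 200, 3, 600, 600]

Would this be the right approach, or is there a cleaner way to do it with the DataLoader itself?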