Stack image tensors for video processing?

My ConvLSTM network requires the following input shape: (64, 10, 3, 32, 32), i.e. [batch size, sequence length, color channels, image height, image width]. Previously I have only worked with the Conv part of the network, which required the input shape (64, 3, 32, 32). Can I now simply stack my tensors, or is another procedure better suited?

class CNN(nn.Module):
    """Per-frame convolutional feature extractor for image sequences.

    Three unpadded 5x5 convolutions (3 -> 10 -> 20 -> 30 channels), each
    followed by ReLU; the second and third are additionally followed by
    2x2 max pooling. Every frame of every sequence is processed
    independently, and the resulting feature maps are flattened to one
    vector per frame.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=20, out_channels=30, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, i):
        """Map a 5-D tensor (dim0, dim1, channels, height, width) to a 3-D
        tensor (dim0, dim1, features).

        For 32x32 frames this yields 480 features per frame (30 * 4 * 4).
        """
        lead, steps = i.shape[0], i.shape[1]

        # Fold the two leading axes together so Conv2d sees a plain 4-D
        # batch of frames.
        frames = i.view(-1, i.shape[2], i.shape[3], i.shape[4])

        stage1 = F.relu(self.conv1(frames))  # no pooling after the first conv
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        stage3 = self.pool(F.relu(self.conv3(stage2)))

        # Restore the two leading axes; flatten channels and spatial dims.
        return stage3.view(lead, steps, -1)
    
class LSTM(nn.Module):
    """Sequence classifier on top of per-frame feature vectors.

    Runs a single-layer LSTM (480 input features, 100 hidden units) over
    the sequence axis, concatenates the hidden outputs of all time steps
    and maps them to 2 output values with a linear layer.

    Input:  tensor of shape (batch, 10, 480). The sequence length must be
            exactly 10 because the linear layer expects 100 * 10 inputs.
    Output: tensor of shape (batch, 2).
    """

    def __init__(self):
        super(LSTM, self).__init__()
        # batch_first=True: the features arrive as (batch, seq, feature).
        # With the default (batch_first=False) nn.LSTM would interpret
        # dim 0 as time and run the recurrence across the samples of the
        # batch instead of across the frames of each sequence.
        self.lstm = nn.LSTM(480, 100, batch_first=True)
        self.fc = nn.Linear(100*10, 2)

    def forward(self, x):
        """Return a (batch, 2) tensor for a (batch, 10, 480) input."""
        x, _ = self.lstm(x)
        # reshape rather than view: the batch-first LSTM output is not
        # guaranteed to be contiguous, in which case view would raise.
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x

EDIT: I think what I’m looking for is a dataloader for video that takes as input a list of images.