The main question for the first approach would be how to stack your different inputs together.
The images might be resized to a common size and stacked together, but how would you handle the list data?
You could easily create a model architecture where you have a separate path for each input and concatenate them together at some point. Assuming your images have a single channel, here is a small example:
class MyModel(nn.Module):
    """Multi-input model: two CNN branches for images of different sizes and
    one MLP branch for a flat feature vector, concatenated into one classifier.

    Expected inputs (N = batch size):
        x1: (N, 1, 256, 256) image
        x2: (N, 1, 64, 64) image
        x3: (N, 10) feature vector
    Output: (N, 4) class logits.
    """

    def __init__(self):
        super().__init__()  # Python 3 no-arg super
        # Large-image branch: (N, 1, 256, 256) -> (N, 3, 128, 128)
        self.features1 = nn.Sequential(
            nn.Conv2d(1, 3, 3, 1, 1),  # kernel 3, stride 1, pad 1: keeps H, W
            nn.MaxPool2d(2),           # halves H and W
            nn.ReLU(),
        )
        # Small-image branch: (N, 1, 64, 64) -> (N, 3, 32, 32)
        self.features2 = nn.Sequential(
            nn.Conv2d(1, 3, 3, 1, 1),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        # Vector branch: (N, 10) -> (N, 5)
        self.features3 = nn.Sequential(
            nn.Linear(10, 5),
            nn.ReLU(),
        )
        # Flattened branch sizes: 3*128*128 + 3*32*32 + 5 -> 4 classes.
        # NOTE: these constants tie the model to the input sizes listed above.
        self.classifier = nn.Linear(128 * 128 * 3 + 32 * 32 * 3 + 5, 4)

    def forward(self, x1, x2, x3):
        """Run each branch, flatten, concatenate, and classify."""
        # torch.flatten(x, 1) is safe on non-contiguous tensors, unlike
        # x.view(x.size(0), -1).
        x1 = torch.flatten(self.features1(x1), 1)
        x2 = torch.flatten(self.features2(x2), 1)
        x3 = torch.flatten(self.features3(x3), 1)  # already 2-D; no-op
        x = torch.cat((x1, x2, x3), dim=1)
        return self.classifier(x)
# Smoke test: push one batch of random dummy data through all three branches.
model = MyModel()
batch_size = 1
# Per-sample shapes expected by each branch, in forward()'s argument order.
shapes = ((1, 256, 256), (1, 64, 64), (10,))
x1, x2, x3 = (torch.randn(batch_size, *shape) for shape in shapes)
output = model(x1, x2, x3)