You can treat this as a binary classification task; you would just have to make sure the output of your model and the corresponding target have the right shape.
E.g. instead of using conv and pooling layers at the beginning of your model, then flattening the activations, and passing them to linear layers, you could write a model using only conv layers, so that the spatial size of the activations stays constant.
Here is a simple example:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # kernel_size=3, stride=1, padding=1 keeps the spatial size unchanged
        self.conv1 = nn.Conv2d(3, 6, 3, 1, 1)
        self.conv2 = nn.Conv2d(6, 1, 3, 1, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.conv2(x)  # raw logits, since BCEWithLogitsLoss applies the sigmoid internally
        return x


model = MyModel()

# dummy input batch and a binary target with the same spatial size as the output
x = torch.randn(10, 3, 24, 24)
y = torch.randint(0, 2, (10, 1, 24, 24)).float()
dataset = TensorDataset(x, y)
loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=2,
    shuffle=True
)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

nb_epochs = 10
for epoch in range(nb_epochs):
    for data, target in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print('Epoch {}, loss {}'.format(epoch, loss.item()))
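Since the model returns raw logits to match nn.BCEWithLogitsLoss, you would apply a sigmoid and threshold at inference time to get the per-pixel binary prediction. A minimal sketch (the 0.5 threshold is just a common default, not something fixed by the loss):

model.eval()
with torch.no_grad():
    logits = model(x)              # shape [10, 1, 24, 24], raw logits
    probs = torch.sigmoid(logits)  # per-pixel probabilities
    preds = (probs > 0.5).float()  # binary mask; threshold of 0.5 assumed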
Also, have a look at this post for some information about how to apply the same random transformations on your input image and mask.
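In case you cannot follow the link: one common approach (a sketch, not necessarily the exact code from that post) is to use torchvision's functional API, sample the random parameters once, and apply the identical transformation to both tensors. The helper name joint_transform is hypothetical:

import random
import torchvision.transforms.functional as TF

def joint_transform(image, mask):
    # flip image and mask based on the same random decision
    if random.random() > 0.5:
        image = TF.hflip(image)
        mask = TF.hflip(mask)
    # rotate both by the same randomly sampled angle
    angle = random.uniform(-10., 10.)
    image = TF.rotate(image, angle)
    mask = TF.rotate(mask, angle)
    return image, mask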