Accuracy 0% for binary classification problem

I am using the OpenFL framework for Federated Learning experiments. I can run their tutorial notebooks without problems; for example, I am able to run classification on MNIST and everything is fine.
Now I am using 2 clients with 2 different datasets, but my accuracy is around 0% on a binary classification problem.
Both datasets have the same 2 classes, "neg" and "pos". Images of the first dataset are 3000x2951 pixels, while images of the second are 4892x4020; I resize both to 256x256. My network is a ResNet9 with no sigmoid at the end, because I am using BCEWithLogitsLoss(). Here is a bit of code, so you can check whether everything is ok:

optimizer_adam = optim.Adam(params_to_update, lr=1e-4)

def cross_entropy(output, target):
    """Binary cross-entropy on raw logits."""
    target = target.unsqueeze(1)  # match the output shape [batch, 1]
    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(output, target.float())
    return loss


def train(net_model, train_loader, optimizer, device, loss_fn=cross_entropy, some_parameter=None):
    torch.manual_seed(0)
    device = 'cpu'  # force CPU here (overrides the device passed in)
    function_defined_in_notebook(some_parameter)
    
    train_loader = tqdm.tqdm(train_loader, desc="train")
    net_model.train()
    net_model.to(device)

    losses = []

    for data, target in train_loader:
        data, target = torch.tensor(data).to(device), torch.tensor(
            target).to(device, dtype=torch.int64)
        optimizer.zero_grad()
        #data = data.type(torch.LongTensor)
        #target = target.type(torch.LongTensor)
        output = net_model(data)
        loss = loss_fn(output=output, target=target)
        loss.backward()
        optimizer.step()
        losses.append(loss.detach().cpu().numpy())
    
    return {'train_loss': np.mean(losses),}


@task_interface.register_fl_task(model='net_model', data_loader='val_loader', device='device')     
def validate(net_model, val_loader, device):
    torch.manual_seed(0)
    device = torch.device('cpu')
    net_model.eval()
    net_model.to(device)
    
    val_loader = tqdm.tqdm(val_loader, desc="validate")
    val_score = 0
    total_samples = 0

    with torch.no_grad():
        for data, target in val_loader:
            samples = target.shape[0]
            total_samples += samples
            data, target = torch.tensor(data).to(device), \
                torch.tensor(target).to(device, dtype=torch.int64)
            output = net_model(data)
            
            pred = (output >= 0.5).long() # Binarize predictions to 0 and 1
            val_score = (pred == target).sum().cpu().item()/data.size(0)
            
            #val_score += pred.eq(target).sum().cpu().numpy()
            
    return {'acc': val_score / total_samples,}
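As a sanity check on the loss convention: nn.BCEWithLogitsLoss takes raw logits and applies the sigmoid internally, so it should match a sigmoid followed by nn.BCELoss. A standalone snippet (not part of the experiment) to confirm:

import torch
import torch.nn as nn

logits = torch.randn(8, 1)                    # fake raw model outputs
target = torch.randint(0, 2, (8, 1)).float()  # fake binary labels

with_logits = nn.BCEWithLogitsLoss()(logits, target)
manual = nn.BCELoss()(torch.sigmoid(logits), target)
print(torch.allclose(with_logits, manual))    # True, up to numerical precision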

I think all of this is correct, so the only part that can be wrong is where I import the data, because in this federated learning framework it is a bit tricky. Both my datasets are organized in this way: /Dataset1(2)/Train(Test)/neg(pos)/images.png. I want to extract x_train, y_train, x_test and y_test because I am following exactly the structure of a tutorial that works. So this is my proposed solution:

def download_data(self):
    """Load the prepared dataset from disk."""
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    base_dir_train = 'Montgomery_real_splitted/TRAIN/'
    base_dir_test = 'Montgomery_real_splitted/TEST/'

    # Each subfolder of TRAIN/TEST is a target class ("neg" or "pos")
    for f in sorted(os.listdir(base_dir_train)):
        if os.path.isdir(base_dir_train + f):
            print(f"{f} is a target class")
            for i in sorted(os.listdir(base_dir_train + f)):
                y_train.append(f)
                im = Image.open(base_dir_train + f + '/' + i)
                x_train.append(im)
    for f in sorted(os.listdir(base_dir_test)):
        if os.path.isdir(base_dir_test + f):
            print(f"{f} is a target class")
            for i in sorted(os.listdir(base_dir_test + f)):
                y_test.append(f)
                imt = Image.open(base_dir_test + f + '/' + i)
                x_test.append(imt)

    # Map the class names "neg"/"pos" to the numeric labels 0/1
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    for i in range(len(y_train)):
        y_train[i] = 0 if y_train[i] == "neg" else 1
    y_train = y_train.astype(np.uint8)
    for i in range(len(y_test)):
        y_test[i] = 0 if y_test[i] == "neg" else 1
    y_test = y_test.astype(np.uint8)

    print('Mont-china data was loaded!')
    return (x_train, y_train), (x_test, y_test)
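To check that the labels come out as expected, a quick sanity check can be run on the returned tuples (`loader` below is a hypothetical stand-in for whatever object defines download_data):

import numpy as np

(x_train, y_train), (x_test, y_test) = loader.download_data()  # `loader` is hypothetical
print(np.unique(y_train, return_counts=True))  # expect (array([0, 1], dtype=uint8), array([...]))
print(np.unique(y_test, return_counts=True))
print(len(x_train), len(x_test))               # number of PIL images per split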

The download_data function lives in a Python script used to load the data. Then, inside the Jupyter notebook, I have these cells to import the dataset:

normalize = T.Normalize(
    mean=[0.1307],
    std=[0.3081]
)

augmentation = T.RandomApply(
    [T.RandomHorizontalFlip(),
     T.RandomRotation(10)], 
    p=.8
)

training_transform = T.Compose(
    [T.Resize((256,256)),
     augmentation,
     T.ToTensor()]
)

valid_transform = T.Compose(
    [T.Resize((256,256)),
     T.ToTensor()]
)
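Note that normalize is defined but never added to either pipeline. If it were meant to be applied, it would have to come after T.ToTensor(), since T.Normalize operates on tensors; also, [0.1307]/[0.3081] are the MNIST statistics, so dataset-specific values may be a better fit. A possible version:

training_transform = T.Compose(
    [T.Resize((256, 256)),
     augmentation,
     T.ToTensor(),
     normalize]  # Normalize expects a tensor, so it must come after ToTensor
)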


class TransformedDataset(Dataset):

    def __init__(self, dataset, transform=None, target_transform=None):
        """Initialize Dataset."""
        self.dataset = dataset
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Length of dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        img, label = self.dataset[index]
        label = self.target_transform(label) if self.target_transform else label
        img = self.transform(img) if self.transform else img
        return img, label
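As a quick illustration of how this wrapper behaves, here is a standalone example with a hypothetical in-memory dataset of (PIL image, label) pairs:

from PIL import Image

toy = [(Image.new('L', (300, 300)), 0),  # hypothetical blank grayscale images
       (Image.new('L', (300, 300)), 1)]

ds = TransformedDataset(toy, transform=valid_transform)
img, label = ds[0]
print(img.shape, label)  # torch.Size([1, 256, 256]) 0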


class MontChinaDataset(DataInterface):
    def __init__(self, **kwargs):
        self.kwargs = kwargs
    
    @property
    def shard_descriptor(self):
        return self._shard_descriptor
        
    @shard_descriptor.setter
    def shard_descriptor(self, shard_descriptor):
        """
        Describe per-collaborator procedures or sharding.

        This method will be called during collaborator initialization.
        The local shard_descriptor will be set by the Envoy.
        """
        self._shard_descriptor = shard_descriptor
        
        self.train_set = TransformedDataset(
            self._shard_descriptor.get_dataset('train'),
            transform=training_transform
        )
        self.valid_set = TransformedDataset(
            self._shard_descriptor.get_dataset('val'),
            transform=valid_transform
        )
        
    def get_train_loader(self, **kwargs):
        """
        Output of this method will be provided to tasks with optimizer in contract
        """
        generator=torch.Generator()
        generator.manual_seed(0)
        return DataLoader(
            self.train_set, batch_size=self.kwargs['train_bs'], shuffle=True, generator=generator
            )

    def get_valid_loader(self, **kwargs):
        """
        Output of this method will be provided to tasks without optimizer in contract
        """
        return DataLoader(self.valid_set, batch_size=self.kwargs['valid_bs'])

    def get_train_data_size(self):
        """
        Information for aggregation
        """
        return len(self.train_set)

    def get_valid_data_size(self):
        """
        Information for aggregation
        """
        return len(self.valid_set)
    

fed_dataset = MontChinaDataset(train_bs=16, valid_bs=16)

The strange thing is that the loss decreases while the accuracy stays at (or very close to) 0:

[12:29:44] METRIC   Round 0, collaborator env_one train result train_loss:  0.673127                                                           experiment.py:116
[12:29:53] METRIC   Round 0, collaborator env_one locally_tuned_model_validate result acc:  0.000000                                           experiment.py:116
[12:29:56] METRIC   Round 0, collaborator env_one aggregated_model_validate result acc:     0.000000                                           experiment.py:116
[12:30:49] METRIC   Round 0, collaborator env_two train result train_loss:  0.562856                                                           experiment.py:116
[12:31:14] METRIC   Round 0, collaborator env_two locally_tuned_model_validate result acc:  0.000000                                           experiment.py:116
[12:31:19] METRIC   Round 0, collaborator env_two aggregated_model_validate result acc:     0.000000                                           experiment.py:116
[12:31:21] METRIC   Round 0, collaborator Aggregator train result train_loss:       0.581464                                                   experiment.py:116
           METRIC   Round 0, collaborator Aggregator locally_tuned_model_validate result acc:       0.000000                                   experiment.py:116
[12:31:22] METRIC   Round 0, collaborator Aggregator aggregated_model_validate result acc:  0.000000                                           experiment.py:116
[12:31:39] METRIC   Round 1, collaborator env_one train result train_loss:  0.637785                                                           experiment.py:116
[12:31:41] METRIC   Round 1, collaborator env_one locally_tuned_model_validate result acc:  0.000000                                           experiment.py:116
[12:31:44] METRIC   Round 1, collaborator env_one aggregated_model_validate result acc:     0.000000                                           experiment.py:116
[12:31:55] METRIC   Round 1, collaborator env_two train result train_loss:  0.432979                                                           experiment.py:116
[12:32:00] METRIC   Round 1, collaborator env_two locally_tuned_model_validate result acc:  0.000000                                           experiment.py:116
[12:32:05] METRIC   Round 1, collaborator env_two aggregated_model_validate result acc:     0.000000                                           experiment.py:116
[12:32:08] METRIC   Round 1, collaborator Aggregator train result train_loss:       0.467540                                                   experiment.py:116
           METRIC   Round 1, collaborator Aggregator locally_tuned_model_validate result acc:       0.000000                                   experiment.py:116
           METRIC   Round 1, collaborator Aggregator aggregated_model_validate result acc:  0.000000   

And this goes on for several rounds.

Can you look into the data that is being fetched, to inspect what the model is training on?

data, target = next(iter(train_loader))

And visualize the data variable.
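For example, something along these lines (a sketch that assumes the loaders above; after ToTensor the values should lie in [0, 1]):

import matplotlib.pyplot as plt

data, target = next(iter(train_loader))
print(data.shape, data.min(), data.max())  # batch shape and value range
print(target[:16])                         # should be a mix of 0s and 1s

plt.imshow(data[0].permute(1, 2, 0).squeeze(), cmap='gray')  # works for 1- or 3-channel images
plt.title(f"label = {target[0].item()}")
plt.show()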

I solved it. The problem was in my validation code; I fixed these lines:

output = torch.sigmoid(output)  # map the raw logits to probabilities in [0, 1]
pred = (output >= 0.5).long()   # binarize predictions to 0 and 1
val_score += (pred == target).sum().cpu().item()/data.size(0)
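Since sigmoid(x) >= 0.5 exactly when x >= 0, an equivalent fix is to threshold the raw logits at 0 and skip the sigmoid. Two more details may be worth double-checking (a sketch of the loop, assuming the model outputs shape [batch, 1]): squeezing the output avoids an unwanted broadcast when comparing against a [batch]-shaped target, and if the final accuracy is computed as val_score / total_samples, the per-batch division by data.size(0) can be dropped:

val_score = 0
total_samples = 0
with torch.no_grad():
    for data, target in val_loader:
        total_samples += target.shape[0]
        data, target = data.to(device), target.to(device, dtype=torch.int64)
        output = net_model(data)                # raw logits, assumed shape [batch, 1]
        pred = (output.squeeze(1) >= 0).long()  # logits >= 0 is equivalent to sigmoid >= 0.5
        val_score += (pred == target).sum().cpu().item()  # count correct samples

# then, at the end of validate():
# return {'acc': val_score / total_samples}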