How to deal with imbalanced data in binary classification?

Hi, if I have come to the wrong discussion board, please forgive me and point me to the proper one.
I would appreciate your feedback on my model and training process.

I was trying to train a GAT model on multiple graph datasets at once.
However, the positive and negative labels are imbalanced across the datasets.

for i in range(len(GraphList)):
    print(torch.bincount(GraphList[i].edge_label))

tensor([3292, 1613]) #[the number of 0 labels, the number of 1 labels]
tensor([1724, 1177])
tensor([1532, 1560])
tensor([13707,  5585])
tensor([188, 274])
tensor([2077,  465])
tensor([2092,  608])
tensor([2170,  536])
tensor([2398,  463])

For your understanding, here is an example mini-batch produced by LinkNeighborLoader, the edge-sampling counterpart of DataLoader.

Data(x=[2], edge_index=[2, 453], edge_label=[32], edge_label_index=[2, 32], edge_class=[32], num_nodes=513, n_id=[513], e_id=[453], batch=[513], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[32], edge_index_class=[453])
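
For context, the loader is built roughly like this (a simplified sketch; train_data, num_neighbors, and batch_size are placeholders rather than my exact settings):

from torch_geometric.loader import LinkNeighborLoader

# Simplified sketch: sample a 2-hop neighborhood around each batch of
# supervision edges (hyperparameters are illustrative).
train_loader = LinkNeighborLoader(
    train_data,
    num_neighbors=[10, 10],
    edge_label_index=train_data.edge_label_index,
    edge_label=train_data.edge_label,
    batch_size=32,
    shuffle=True,
)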

To train a model that classifies the binary labels across all datasets, I constructed the model below, which is based on PyTorch Geometric.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, LayerNorm

class MyModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(MyModel, self).__init__()

        # MLP lifting the 4-dim features of data2 to 256 dims
        self.convert_layer = nn.Sequential(
            nn.Linear(4, 16, bias=True),
            nn.LeakyReLU(),
            nn.Linear(16, 64, bias=True),
            nn.LeakyReLU(),
            nn.Linear(64, 256, bias=True)
        )

        # GAT
        self.conv1 = GATConv(in_channels, hidden_channels, heads=6)
        self.lin1 = torch.nn.Linear(in_channels, hidden_channels * 6)
        self.norm1 = LayerNorm(hidden_channels * 6)
        self.conv2 = GATConv(hidden_channels * 6, out_channels, heads=1, concat=False)
        self.lin2 = torch.nn.Linear(hidden_channels * 6, out_channels)

    def convert_data_size(self, data1, data2):
        # Scatter the converted data2 features into a zero tensor aligned with data1.
        # device and cID (a precomputed index tensor) are globals defined elsewhere.
        x2 = torch.zeros((len(data1), 256), dtype=torch.float).to(device)  # (107940, 256)
        x3 = self.convert_layer(data2).to(device)  # (19392, 256)
        nonzero_index = torch.tensor(cID).to(device)
        x2.index_add_(0, nonzero_index, x3)  # (107940, 256)

        return x2

    
    def forward(self, data1, data2, neighbor_cl_ids, edge_index):
        x_dict, attn1_dict, attn2_dict = {}, {}, {}

        for i in range(len(data2)):
            data2[i] = self.convert_data_size(data1, data2[i])  # (107940, 256), requires_grad=True

        # Run the shared GAT layers once per edge class.
        for e_i_class in torch.unique(neighbor_cl_ids):
            c = int(e_i_class.item())
            x = torch.cat((data1, data2[c]), dim=1)  # (107940, 512)
            x_1, a1 = self.conv1(x, edge_index, return_attention_weights=True)
            x = F.leaky_relu(self.norm1(x_1))
            x = F.dropout(x, p=0.2, training=self.training)
            x_2, a2 = self.conv2(x, edge_index, return_attention_weights=True)
            x = x_2 + self.lin2(x)

            x_dict[c] = x
            attn1_dict[c] = a1
            attn2_dict[c] = a2

        return x_dict, attn1_dict, attn2_dict
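
In the loop below, criterion is applied to the raw dot-product logits. To address the imbalance, one option I am considering is BCEWithLogitsLoss with a pos_weight term (a minimal sketch; it uses the counts from my first graph above, and a per-graph weight would probably be more appropriate):

# Sketch: weight the positive class by the negative/positive count ratio,
# so errors on the minority (1-label) edges cost proportionally more.
num_neg, num_pos = torch.bincount(GraphList[0].edge_label)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=num_neg.float() / num_pos.float())

The training and validation loop: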
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score

for epoch in range(1, num_epochs + 1):
    tr_losses = 0
    val_losses = 0
    tr_loss_sum = 0
    val_loss_sum = 0
    
    model.train()
    for data in tqdm(train_loader):
        tr_loss = []
        data = data.to(device)
        data.edge_class = data.edge_class[data.input_id]  # classes of this batch's supervision edges
        data.edge_index_class = torch.zeros(len(data.edge_index[0])).to(device)

        # Propagate class labels to all sampled edges: edges whose target node
        # touches a supervision edge inherit its class; the rest take the max
        # class among edges sharing the same source node.
        for i in range(len(data.edge_index_class)):
            if ((data.edge_index[1][i] in data.edge_label_index[0]) or (data.edge_index[1][i] in data.edge_label_index[1])):
                data.edge_index_class[i] = data.edge_class[(data.edge_index[1][i] == data.edge_label_index[0])
                                                           | (data.edge_index[1][i] == data.edge_label_index[1])]
            else:
                data.edge_index_class[i] = torch.max(data.edge_index_class[(data.edge_index[1][i] == data.edge_index[0])])
                
        optimizer.zero_grad()
        z, a1, a2 = model(data.x[0], data.x[1], data.edge_index_class, data.edge_index) 
        
        # Loss per edge class: the logit of each supervision edge is the mean
        # elementwise product of its two endpoint embeddings.
        for i in torch.unique(data.edge_class):
            i = int(i.item())
            try:
                tr_out = ((z[i][data.edge_label_index[0][data.edge_class == i]]
                           * z[i][data.edge_label_index[1][data.edge_class == i]]).mean(dim=-1)).view(-1)
                tr_loss.append(criterion(tr_out, data.edge_label[data.edge_class == i].float()))
            except KeyError:
                pass
        tr_loss_mean = sum(tr_loss) / len(tr_loss)
        tr_loss_mean.backward()
        optimizer.step()
        
        tr_losses += tr_loss_mean.item()
    avg_tr_loss = tr_losses / len(train_loader)  # mean of the per-batch losses

    model.eval()
    with torch.no_grad():
        y_val_pred, y_val_pred_prob, y_val_true = [], [], []
        for data in tqdm(val_loader):
            val_loss = []
            data = data.to(device)
            data.edge_class = data.edge_class[data.input_id]
            data.edge_index_class = torch.zeros(len(data.edge_index[0])).to(device)

            # Same class propagation as in the training loop above.
            for i in range(len(data.edge_index_class)):
                if ((data.edge_index[1][i] in data.edge_label_index[0]) or (data.edge_index[1][i] in data.edge_label_index[1])):
                    data.edge_index_class[i] = data.edge_class[(data.edge_index[1][i] == data.edge_label_index[0])
                                                               | (data.edge_index[1][i] == data.edge_label_index[1])]
                else:
                    data.edge_index_class[i] = torch.max(data.edge_index_class[(data.edge_index[1][i] == data.edge_index[0])])
                
            y_val_true.append(data.edge_label)
            z, a1, a2 = model(data.x[0], data.x[1], data.edge_index_class, data.edge_index)
            
            for i in torch.unique(data.edge_class):
                i = int(i.item())
                try:
                    val_out = ((z[i][data.edge_label_index[0][data.edge_class == i]]
                                * z[i][data.edge_label_index[1][data.edge_class == i]]).mean(dim=-1)).view(-1)
                    val_out_sig = torch.sigmoid(val_out)  # probabilities for the metrics
                    val_loss.append(criterion(val_out, data.edge_label[data.edge_class == i].float()))
                    y_val_pred.append((val_out_sig > 0.5).float().cpu())
                    y_val_pred_prob.append(val_out_sig.cpu())
                except KeyError:
                    pass
            val_loss_mean = sum(val_loss) / len(val_loss)
            val_losses += val_loss_mean.item()
    avg_val_loss = val_losses / len(val_loader)  # mean of the per-batch losses
    print(f'Epoch: {epoch:03d}, Training Loss: {avg_tr_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')
    
    y = torch.cat(y_val_true, dim=0).cpu().numpy()
    pred = torch.cat(y_val_pred, dim=0).cpu().numpy()
    pred_prob = torch.cat(y_val_pred_prob, dim=0).cpu().numpy()
    for i in torch.unique(val_loader.data.edge_class):
        i = int(i.item())
        
        val_f1 = f1_score(y[val_loader.data.edge_class==i], pred[val_loader.data.edge_class==i]) #average='micro'
        val_auc = roc_auc_score(y[val_loader.data.edge_class==i], pred_prob[val_loader.data.edge_class==i])
        val_aupr = average_precision_score(y[val_loader.data.edge_class==i], pred_prob[val_loader.data.edge_class==i])
        print(f'Performance of {cellline_list[i]} --> Validation AUC: {val_auc:.4f}, Validation AUPR: {val_aupr:.4f}, Validation F1-score: {val_f1:.4f}')

After that, I got very poor AUC, AUPR, and F1 scores.
Unfortunately, the scores kept dropping as the epochs went on.

I expected the model to learn the patterns of the positive and negative samples by sharing the same parameters in conv1 and conv2, but it seems to have failed.
If there are problems not only with the data imbalance but also with the training process, please let me know.
Thank you for reading the question.
Have a nice day!


[Update]
I tried balancing the ratio of positive to negative samples, but performance was still poor.
Are there any other approaches that could make the model learn?
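
For concreteness, the balancing I mean is along the lines of random undersampling of the negative edges per graph (a minimal sketch; the function name is illustrative):

import torch

def undersample_edges(edge_label, edge_label_index):
    # Keep all positive edges and an equally sized random subset of negatives.
    pos = (edge_label == 1).nonzero(as_tuple=True)[0]
    neg = (edge_label == 0).nonzero(as_tuple=True)[0]
    neg = neg[torch.randperm(len(neg))[:len(pos)]]
    keep = torch.cat([pos, neg])
    return edge_label[keep], edge_label_index[:, keep]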