Domain Adaptation on Software Datasets

Hello all,

I have been trying to use this domain adaptation technique [ link ] for my work on software domain, specifically on AEEEM or Promise Dataset. These are basically tabular imbalanced datasets ( .csv files ). I have modified the code from the above link for my use case as follows -

# Create Source and Target Domain Datasets


source = pd.read_csv( '/content/Mylyn.csv' )

labels = source[ 'class' ].values
features = source.drop( [ 'class' ], axis = 1 ).values


features_source_train, features_source_test, labels_source_train, labels_source_test = train_test_split( features, labels, test_size = 0.2 )


features_source_train = torch.Tensor( features_source_train )
features_source_test = torch.Tensor( features_source_test )

labels_source_train = torch.Tensor( labels_source_train )
labels_source_test = torch.Tensor( labels_source_test )


dataset_source_train = utils.TensorDataset( features_source_train, labels_source_train )
dataset_source_test = utils.TensorDataset( features_source_test, labels_source_test )


# Create Target Datasets


target = pd.read_csv( '/content/PDE.csv' )

labels = target[ 'class' ].values
features = target.drop( [ 'class' ], axis = 1 ).values


features_target_train, features_target_test, labels_target_train, labels_target_test = train_test_split( features, labels, test_size = 0.2 )


features_target_train = torch.Tensor( features_target_train )
features_target_test = torch.Tensor( features_target_test )

labels_target_train = torch.Tensor( labels_target_train )
labels_target_test = torch.Tensor( labels_target_test )


dataset_target_train = utils.TensorDataset( features_target_train, labels_target_train )
dataset_target_test = utils.TensorDataset( features_target_test, labels_target_test )
# Models


class Encoder( nn.Module ):
  ''' Encoder will be used to generate important features from the Data '''

  def __init__( self ):
    super().__init__()


    self.fc1 = nn.Linear( 61, 100 )
    self.fc2 = nn.Linear( 100, 200 )
    self.fc3 = nn.Linear( 200, 400 )
    self.fc_final = nn.Linear( 400, 500 )

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout()


  def forward( self, features_in ):


    x = self.relu( self.fc1( features_in ) )
    x = self.relu( self.fc2( x ) )
    x = self.dropout( x )
    x = self.relu( self.fc3( x ) )

    features = self.fc_final( x )


    return features


class Classifier( nn.Module ):
  """ The classifier will be used to classify the features generated from the Encoder """

  def __init__( self ):
    super().__init__()

    self.fc = nn.Linear( 500, 2 )
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout()

  def forward( self, features_in ):

    output = self.dropout( self.relu( features_in ) )
    output = self.fc( output )

    return output


class Discriminator( nn.Module ):
  """ The discriminator will be used to distinguish between the source and target data """

  def __init__( self ):
    super().__init__()

    self.fc1 = nn.Linear( 500, 250 )
    self.fc2 = nn.Linear( 250, 125 )
    self.fc3 = nn.Linear( 125, 2 )
    self.relu = nn.ReLU()
    self.logsoftmax = nn.LogSoftmax( )

  def forward( self, input ):

    x = self.relu( self.fc1( input ) )
    x = self.relu( self.fc2( x ) )
    output = self.logsoftmax( self.fc3( x ) )

    return output


# Write Training loops


# Pre-Train Classifier on Source Domain

def pre_train( encoder, classifier, data_loader ):

  encoder.train()
  classifier.train()

  optimizer = optim.Adam( params = list( encoder.parameters() ) + list( classifier.parameters() ), lr = 0.001 )
  loss_fn = nn.CrossEntropyLoss()

  min_loss = 1000000
  losses = []


  ## Training the network

  for epoch in range( n_epochs_pre ):

    print( f' Current Epoch = { epoch + 1 } ' )
    for step, ( features, labels ) in enumerate( data_loader ):

      labels = labels.type( torch.LongTensor )


      features = features.to( device )
      labels = labels.to( device )


      optimizer.zero_grad()

      preds = classifier( encoder( features ) )
      loss = loss_fn( preds, labels )


      loss.backward()
      optimizer.step()

      if step % 5 == 0:
        print( f' Current Epoch = { epoch + 1 }, Current Loss = { loss }')
    

    losses.append( loss.detach().numpy() )
    if loss < min_loss:
      min_loss = loss
      torch.save( encoder.state_dict(), 'best_encoder.pt' )
      torch.save( classifier.state_dict(), 'best_classifier.pt' )

  

  plt.plot( losses )
  plt.xlabel( ' Epochs ' )
  plt.ylabel( ' Loss values ' )
  plt.title( ' Classifier loss on pre-training ' )


  return encoder, classifier


# Adversarial Training of Target Encoder and Discriminator

def adversarial_training( source_encoder, target_encoder, discriminator, source_data_loader, target_data_loader ):


  ## Setup

  target_encoder.train()
  discriminator.train()


  loss_func = nn.CrossEntropyLoss()
  target_optimizer = optim.Adam( target_encoder.parameters(), lr = 0.001 )
  discriminator_optimizer = optim.Adam( discriminator.parameters(), lr = 0.001 )

  len_data_loader = min( len( source_data_loader ), len( target_data_loader ) )


  min_d_loss = float( ' inf ' )
  min_te_loss = float( ' inf ' )


  ## Training

  for epoch in range( n_epochs ):

    data_zip = enumerate( zip( source_data_loader, target_data_loader ) )

    for step, ( ( features_source_pre, _ ), ( features_target_pre, _ ) ) in data_zip:
    


      # Train Discriminator


      features_source_pre = features_source_pre.to( device )
      features_target_pre = features_target_pre.to( device )


      discriminator_optimizer.zero_grad()


      features_source = source_encoder( features_source_pre )
      features_target = target_encoder( features_target_pre )
      features_concat = torch.cat( ( features_source, features_target ), 0 )


      pred_concat = discriminator( features_concat )
      pred_concat = pred_concat.squeeze()


      label_source = torch.ones( features_source.size( 0 ) ).long().to( device )
      label_target = torch.zeros( features_target.size( 0 ) ).long().to( device )
      label_concat = torch.cat( ( label_source, label_target ), 0 )


      loss_discriminator = loss_func( pred_concat, label_concat )
      loss_discriminator.backward()


      discriminator_optimizer.step()


      ## Train target encoder

      discriminator_optimizer.zero_grad()
      target_optimizer.zero_grad()


      features_target = target_encoder( features_target_pre )

      pred_target = discriminator( features_target.detach() )
      #pred_target = pred_target.squeeze( 0 )

      label_target = torch.ones( features_target.size( 0 ) ).long().to( device )
      label_target = label_target.type( torch.LongTensor )


      loss_target = loss_func( pred_target, label_target )
      loss_target.backward()


      target_optimizer.step()


      if step % 5 == 0:
        print( f' Current Epoch = { epoch }, Discriminator Loss = { loss_discriminator }, Target Encoder Loss = { loss_target } ' )
      

    #print( 'this will print after every epoch ' )
    if loss_discriminator < min_d_loss:
      torch.save( discriminator.state_dict(), 'best_discriminator.pt' )
    
    if loss_target < min_te_loss:
      torch.save( target_encoder.state_dict(), 'best_target_encoder.pt' )
  

  return target_encoder


# Evaluation function 

def evaluate( encoder, classifier, data_loader ):


  pred_list = [ ]
  label_list = [ ]


  with torch.no_grad():

    for ( features, labels ) in data_loader:

      pred_eval =  classifier( encoder( features ) )

      pred = pred_eval.max( 1 )[ 1 ]
      
      pred_list.extend( pred )
      label_list.extend( labels )
  
  
  f1 = f1_score( label_list, pred_list )
  precision = precision_score( label_list, pred_list )
  recall = recall_score( label_list, pred_list )
  g_mean = geometric_mean_score( label_list, pred_list )
  balanced_accuracy = balanced_accuracy_score( label_list, pred_list )


  print( f1, precision, recall, g_mean, balanced_accuracy )



# Main function

n_epochs_pre = 100
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_epochs = 100

source_data_loader = DataLoader( dataset_source_train, batch_size = 100, drop_last = True, shuffle = True )
source_data_loader_test = DataLoader( dataset_source_test, batch_size = 100, drop_last = True, shuffle = True )

target_data_loader = DataLoader( dataset_target_train, batch_size = 100, drop_last = True, shuffle = True )
target_data_loader_test = DataLoader( dataset_target_test, batch_size = 100, drop_last = True, shuffle = True )


source_encoder = Encoder()
source_classifier = Classifier()
target_encoder = Encoder()
discriminator = Discriminator()


source_encoder.to( device )
source_classifier.to( device )
target_encoder.to( device )
discriminator.to( device )


print( ' Step => Pre - Training ' )
print( ' Source Encoder = ' )
print( source_encoder )
print( ' Source Classifier = ' )
print( source_classifier )


# Train Classifier
source_encoder, source_classifier = pre_train( source_encoder, source_classifier, source_data_loader )

print( ' Step => Adversarial Training ' )
print( ' Target Encoder = ' )
print( target_encoder )
print( ' Discriminator = ' )
print( discriminator )


target_encoder.load_state_dict( source_encoder.state_dict() )

# Train Adversarially
target_encoder = adversarial_training( source_encoder, target_encoder, discriminator,
                     source_data_loader, target_data_loader )


source_encoder.load_state_dict( torch.load( '/content/best_encoder.pt' ) )
source_classifier.load_state_dict( torch.load( '/content/best_classifier.pt' ) )
target_encoder.load_state_dict( torch.load( '/content/best_target_encoder.pt' ) )
discriminator.load_state_dict( torch.load( '/content/best_discriminator.pt' ) )


evaluate( source_encoder, source_classifier, target_data_loader_test )
evaluate( target_encoder, source_classifier, target_data_loader_test )

In the above code, the discriminator_loss decreases but the target_encoder_loss does not decrease and I can’t seem to figure out, how to make it decrease?

Also, I am aiming for around 60 - 70 % f1-score values and similar g-mean and balanced accuracy values, is it feasible?