Gradient reversal domain adaptation not always converging

Hello, I was hoping someone would be kind enough to check whether the following is correct. I am attempting to re-create https://github.com/pumpikano/tf-dann/blob/master/Blobs-DANN.ipynb in PyTorch on the blobs data, but I only get convergence some of the time, i.e. if I run the code 10 times, it might converge in roughly 50% of the runs.
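
For context, the source/target loaders are built from 2-D, two-class blob data along these lines (a rough sketch; the centres below are just placeholders, trainloader yields labelled source batches and testloader yields the target batches whose labels I ignore):

import numpy as np
import torch
from sklearn.datasets import make_blobs
from torch.utils.data import DataLoader, TensorDataset

# placeholder blob parameters, for illustration only
Xs, ys = make_blobs(n_samples=300, centers=[[0., 0.], [0., 1.]], cluster_std=0.2)
Xt, yt = make_blobs(n_samples=300, centers=[[1., 0.], [1., 1.]], cluster_std=0.2)

trainloader = DataLoader(TensorDataset(torch.FloatTensor(Xs), torch.LongTensor(ys)),
                         batch_size=32, shuffle=True)
testloader = DataLoader(TensorDataset(torch.FloatTensor(Xt), torch.LongTensor(yt)),
                        batch_size=32, shuffle=True)

The training loop is: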

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

n_epochs = 10000
lam = 0
j = 0

ce_loss = nn.CrossEntropyLoss()

for epoch in range(n_epochs):
    count_0 = 0
    count_1 = 0

    #us_loader_iter = iter(trainloader_us)
    train_loader_iter = iter(trainloader)
    test_loader_iter = iter(testloader)
    running_loss = torch.FloatTensor(np.zeros(1))
    running_US_loss = torch.FloatTensor(np.zeros(1))

    for i, data in enumerate(trainloader):

        j += 1
        # DANN schedules for the reversal strength and the learning rate
        p = float(j) / (n_epochs * len(trainloader))
        lam = float(2. / (1. + np.exp(-10. * p)) - 1)
        lr = 0.01 / (1. + 10 * p)**0.75

        dnet.lambd = lam

        class_batch, class_targets = data
        test_batch, _ = next(test_loader_iter)

        if len(test_batch) != len(class_batch):
            break

        # combined batch: source rows first, target rows second
        us_batch = Variable(torch.stack([class_batch, test_batch]).view(-1, 2))
        us_targets = torch.LongTensor(np.zeros(len(class_batch) * 2))
        us_targets[len(class_batch):] = 1  # domain labels: 0 = source, 1 = target

        us_targets = Variable(us_targets)
        test_batch = Variable(test_batch)

        class_batch, class_targets = Variable(class_batch), Variable(class_targets)

        # step 1: supervised classification loss on the source batch
        feat_ext.zero_grad()
        classifier.zero_grad()
        dnet.zero_grad()

        params_fc = [{'params': classifier.parameters(), 'lr': lr}]
        params_dnet = [{'params': dnet.parameters(), 'lr': lr}]
        params_feats = [{'params': feat_ext.parameters(), 'lr': lr}]

        optim_fc = optim.Adam(params_fc)
        optim_dnet = optim.Adam(params_dnet)
        optim_feats = optim.Adam(params_feats)

        # run the source batch through the feature extractor and classifier
        feats = feat_ext(class_batch).view(len(class_batch), -1)
        preds = classifier(feats)
        loss_class = ce_loss(preds, class_targets)
        loss_class.backward(retain_graph=True)

        optim_fc.step()
        optim_feats.step()

        # step 2: domain (unsupervised) loss on the combined source + target batch
        dnet.zero_grad()
        feat_ext.zero_grad()
        classifier.zero_grad()
        feats = feat_ext(us_batch).view(len(us_batch), -1)
        out = dnet(feats)

        loss_us = ce_loss(out, us_targets)
        loss_us.backward(retain_graph=True)

        optim_feats.step()
        optim_dnet.step()
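
Side question: I am not sure whether the Adam optimizers should instead be created once outside the loop, since re-creating them on every iteration throws away their running moment estimates. What I mean is roughly this sketch, reusing the same feat_ext / classifier / dnet and the lr schedule above, with a small helper of my own (set_lr) to push the scheduled learning rate into the existing optimizers:

# sketch: build the optimizers once, before the epoch loop
optim_fc = optim.Adam(classifier.parameters(), lr=0.01)
optim_dnet = optim.Adam(dnet.parameters(), lr=0.01)
optim_feats = optim.Adam(feat_ext.parameters(), lr=0.01)

def set_lr(optimizer, lr):
    # apply the scheduled lr to an existing optimizer
    for group in optimizer.param_groups:
        group['lr'] = lr

# then, inside the inner loop, instead of re-creating the optimizers:
# set_lr(optim_fc, lr); set_lr(optim_feats, lr); set_lr(optim_dnet, lr)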

As a follow-up, here are the model definitions:

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

class F_extractor(nn.Module):

    def __init__(self, d):
        super(F_extractor, self).__init__()
        self.fc1 = nn.Linear(2, d)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return x


class Classifier(nn.Module):

    def __init__(self, d):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(d, 2)

    def forward(self, x):
        x = self.fc1(x)
        return x


class GradReverse(Function):

    def __init__(self, lambd):
        self.lambd = lambd

    def forward(self, x):
        return x.view_as(x)

    def backward(self, grad_output):
        # reverse (and scale) the gradient flowing back into the feature extractor
        return grad_output * (-self.lambd)


def grad_reverse(x, lambd):
    return GradReverse(lambd)(x)


class Discriminator(nn.Module):

    def __init__(self, d, lambd=0):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(d, 8)
        self.fc2 = nn.Linear(8, 2)
        self.lambd = lambd

    def forward(self, x):
        x = grad_reverse(x, self.lambd)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x
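
In case it is relevant: I believe newer PyTorch versions deprecate autograd Function subclasses that use __init__ and non-static forward/backward. My understanding is that the static-method equivalent of GradReverse would look roughly like this (lambd is stashed on ctx, and backward returns one gradient per forward input, with None for lambd):

class GradReverse(Function):

    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # reverse (and scale) the gradient flowing back to the features
        return grad_output * (-ctx.lambd), None


def grad_reverse(x, lambd):
    return GradReverse.apply(x, lambd)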