optim.Adam doesn't work

Hi all. I just moved from TensorFlow to PyTorch and I'm really new to it. I just wrote my first model, but it doesn't learn anything. Can someone help me out? Any information is appreciated. Here is my code; I've omitted the data_loader file and the run file, but basically in my run file I run model = RETAIN(config) and then model.run(), roughly as sketched below.
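For reference, a minimal sketch of what the run file does (get_config() and the module names here are placeholders, not my exact code):

import data_loader                 # my own module, omitted
from retain import RETAIN          # the model file below

config = data_loader.get_config()  # carries num_features, embed_size, hidden_units,
                                   # num_layers, steps, lr, batch_size, total_iter,
                                   # check_iter, task and the train/eval arrays
model = RETAIN(config)
model.run()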

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pdb
from ops import *
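# ops is my own helper file; ROC_AUC and accuracy used in forward() come from it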

class RETAIN(nn.Module):
    def __init__(self, config):
        super(RETAIN, self).__init__()
        # Copy every field of the config object onto the model as an attribute
        for name in config.__dict__:
            setattr(self, name, getattr(config, name))
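        # Fields expected on config (all used below): num_features, embed_size,
        # hidden_units, num_layers, steps, lr, batch_size, total_iter,
        # check_iter, task, train_x, train_y, eval_x, eval_y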

        self.W_emb = nn.Parameter(torch.randn(self.num_features, self.embed_size))
        self.alpha_rnn = nn.LSTM(self.embed_size, self.hidden_units, self.num_layers, batch_first=True)
        self.alpha_weight = nn.Parameter(torch.randn(self.hidden_units, 1))
        self.alpha_bias = nn.Parameter(torch.randn(1))
        # beta maps the RNN hidden state back to the embedding dimension so it
        # can gate embedded_x elementwise (the original (hidden, hidden) shapes
        # only worked when hidden_units == embed_size)
        self.beta_weight = nn.Parameter(torch.randn(self.hidden_units, self.embed_size))
        self.beta_bias = nn.Parameter(torch.randn(self.embed_size))
        self.beta_rnn = nn.LSTM(self.embed_size, self.hidden_units, self.num_layers, batch_first=True)
        # the context vector lives in the embedding space, so the output layer
        # maps embed_size -> 1
        self.out_weight = nn.Parameter(torch.randn(self.embed_size, 1))
        self.out_bias = nn.Parameter(torch.randn(1))
        # NB: plain randn (std 1) weights; scaling the init down (e.g. * 0.01)
        # often helps training

        self.softmax = nn.Softmax(dim=1)  # softmax over the time-step dimension
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

        # l = nn.ModuleList([self.alpha_rnn, self.beta_rnn])

        # same as tf's sigmoid_cross_entropy_with_logits; the built-in
        # nn.BCEWithLogitsLoss is the numerically stabler equivalent
        self.sigmoid_cross_entropy_with_logits = lambda logits, labels: nn.BCELoss()(self.sigmoid(logits), labels)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
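        # Note: attributes that are nn.Parameter or nn.Module instances (the
        # LSTMs) are registered automatically, so self.parameters() above already
        # covers every trainable weight; no ModuleList is needed for that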


    def forward(self, x, y):
        inp = Variable(torch.from_numpy(x).type(torch.FloatTensor))
        label = Variable(torch.from_numpy(y).type(torch.FloatTensor))

        # Embed the input: (batch, steps, num_features) -> (batch, steps, embed_size)
        embedded_x = torch.matmul(inp, self.W_emb)

        # Reverse the data along the time dimension
        idx = Variable(torch.arange(self.steps - 1, -1, -1).type(torch.LongTensor))
        reversed_x = torch.index_select(embedded_x, 1, idx)
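        # Sanity check on the reversal: with steps = 3, idx = [2, 1, 0], so
        # reversed_x[:, 0] == embedded_x[:, 2] -- both RNNs read the visits
        # from last to first, as in the RETAIN paper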

        # Initial hidden/cell states must be (num_layers, batch, hidden_units);
        # the original used reversed_x.size()[2] (= embed_size), which only
        # matches when embed_size == hidden_units
        h0 = Variable(torch.zeros(self.num_layers, reversed_x.size(0), self.hidden_units))
        c0 = Variable(torch.zeros(self.num_layers, reversed_x.size(0), self.hidden_units))

        # Get alpha attention (one scalar weight per time step)
        alpha_output, _ = self.alpha_rnn(reversed_x, (h0, c0))
        alpha_att = torch.matmul(alpha_output, self.alpha_weight) + self.alpha_bias
        alpha_att = torch.squeeze(alpha_att, 2)
        alpha_att = self.softmax(alpha_att)
        alpha_att = torch.unsqueeze(alpha_att, 2)
        alpha_att = torch.index_select(alpha_att, 1, idx)
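        # Shape flow above: (batch, steps, hidden) -> (batch, steps, 1) ->
        # (batch, steps) -> softmax over steps -> (batch, steps, 1), then
        # re-reversed with idx so the weights align with the forward-ordered
        # embedded_x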

        # Get beta attention (a gate per embedded feature per time step)
        beta_output, _ = self.beta_rnn(reversed_x, (h0, c0))
        beta_att = torch.matmul(beta_output, self.beta_weight) + self.beta_bias
        beta_att = self.tanh(beta_att)
        beta_att = torch.index_select(beta_att, 1, idx)

        c_i = torch.sum(alpha_att * (beta_att * embedded_x), 1)
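        # RETAIN's context vector: c_i = sum_t alpha_t * (beta_t * v_t), a scalar
        # visit-level weight times elementwise feature-level gates on each
        # embedded visit v_t; shape (batch, embed_size)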
        logits = torch.matmul(c_i, self.out_weight) + self.out_bias
        logits = torch.squeeze(logits, 1)  # (batch, 1) -> (batch,) to match label
        preds = self.sigmoid(logits)

        # Metrics are computed on detached numpy copies; the loss below still
        # backpropagates through logits
        preds = preds.data.numpy()
        roc, auc = ROC_AUC(preds, y)
        preds = preds >= 0.5
        acc = accuracy(preds, y)

        # BCELoss already returns a scalar mean, so the extra torch.sum was a no-op
        loss = self.sigmoid_cross_entropy_with_logits(logits, label)

        return loss, loss.data.numpy(), auc, acc

    def get_batch(self):
        index = np.random.choice(len(self.train_x), self.batch_size, replace=False)
        batch_x = [self.train_x[i] for i in index]
        batch_y = [self.train_y[i] for i in index]
        return np.array(batch_x), np.array(batch_y)
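    # Each call draws an independent random batch; there is no epoch structure,
    # so run() below just loops for total_iter steps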


    def run(self):
        eval_loss_min = float('inf')
        eval_auc_min = float('inf')
        eval_acc_min = float('inf')
        step_min = 0
        for i in range(self.total_iter):
            data, label = self.get_batch()

            self.optimizer.zero_grad()
            loss, train_loss, train_auc, train_acc = self.forward(data, label)
            loss.backward()
            self.optimizer.step()
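            # Standard update cycle: zero_grad() clears the gradients from the
            # previous iteration, backward() fills .grad on every tensor in
            # self.parameters(), and step() applies the Adam update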

        
            if (i + 1) % self.check_iter == 0:
                _, eval_loss, eval_auc, eval_acc = self.forward(self.eval_x, self.eval_y)

                print("-----------------------------------------------------------------------------")
                print(self.task)
                if eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss
                    eval_auc_min = eval_auc
                    eval_acc_min = eval_acc
                    step_min = i + 1
                    print("MIN_test_loss is updated, lr: %f" % self.lr)

                print("Step:%6d,      Train loss: %.3f, Train AUC: %.3f, Train Accuracy: %.3f"
                      % (i + 1, train_loss, train_auc, train_acc))
                print("Step:%6d,       Eval loss: %.3f,  Eval AUC: %.3f,  Eval Accuracy: %.3f"
                      % (i + 1, eval_loss, eval_auc, eval_acc))