Model not training: fixed loss over every sample

I have a Tree-LSTM implementation that I am trying to train, but after each epoch the loss stays exactly the same. I have been stuck on this for over ten days; can anyone help me figure out what is going wrong?

Trying different learning rates has not helped either.
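
For anyone trying to reproduce this, below is a rough gradient-flow check (a sketch only; model, optimizer, and x_train are the same objects used in the training loop further down). If every norm prints as 0.0 or None, the loss is disconnected from the parameters.

optimizer.zero_grad()
(c, h), loss = model(x_train[0].root)
loss.backward()
for name, p in model.named_parameters():
    # None means the parameter never received a gradient at all
    grad_norm = p.grad.norm().item() if p.grad is not None else None
    print(name, grad_norm)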

import torch
import torch.nn as nn


class treeEncoder(nn.Module):
    def __init__(self, cuda, in_dim, mem_dim, emb, labels, labelMap, criterion, device):
        super(treeEncoder, self).__init__()
        self.cudaFlag = cuda
        self.in_dim = in_dim
        self.mem_dim = mem_dim
        self.device = device
        self.labels = labels
        self.labelMap = labelMap
        self.criterion = criterion

        # Gate projections for the child-sum Tree-LSTM:
        # input (i), forget (f), candidate (u) and output (o) gates.
        self.ix = nn.Linear(self.in_dim, self.mem_dim)
        self.ih = nn.Linear(self.mem_dim, self.mem_dim)

        self.fx = nn.Linear(self.in_dim, self.mem_dim)
        self.fh = nn.Linear(self.mem_dim, self.mem_dim)

        self.ux = nn.Linear(self.in_dim, self.mem_dim)
        self.uh = nn.Linear(self.mem_dim, self.mem_dim)

        self.ox = nn.Linear(self.in_dim, self.mem_dim)
        self.oh = nn.Linear(self.mem_dim, self.mem_dim)

        # NOTE: emb is stored as a plain attribute; unless it is an
        # nn.Embedding or nn.Parameter it will not appear in
        # model.parameters() and will never be updated by the optimizer.
        self.emb = emb
        self.outputModule = OutputModule(self.cudaFlag, mem_dim, 4, self.device, dropout=False)
    
    def predict(self, node):
        # Run the children bottom-up so their states exist before this node.
        for i in range(node.num_children):
            _, _ = self.forward(node.childrenList[i])
        child_c, child_h = self.getChildStates(node)
        # NOTE: forward() indexes the embedding with node.uid; the same
        # attribute should be used in both places.
        node.state = self.nodeForward(self.emb[node.idx].to(self.device), child_c, child_h)

        output = self.outputModule.forward(node.state[1], False)

        return output
    
    def forward(self, node):
        loss = torch.zeros(1)

        if self.cudaFlag:
            loss = loss.to(self.device)

        # Recurse over the children first and accumulate their losses.
        for i in range(node.num_children):
            _, child_loss = self.forward(node.childrenList[i])
            loss = loss + child_loss
        child_c, child_h = self.getChildStates(node)
        node.state = self.nodeForward(self.emb[node.uid].to(self.device), child_c, child_h)

        output = self.outputModule.forward(node.state[1], True)
        node.output = output

        # Keep the label on the same device as the logits.
        label = torch.tensor(self.labelMap[node.label], device=output.device)

        loss = loss + self.criterion(output.reshape(-1, 4), label.reshape(-1))

        return node.state, loss
        
    def nodeForward(self, x, child_c, child_h):
        child_h_sum = torch.sum(child_h, 0)

        i = torch.sigmoid(self.ix(x) + self.ih(child_h_sum))
        o = torch.sigmoid(self.ox(x) + self.oh(child_h_sum))
        u = torch.tanh(self.ux(x) + self.uh(child_h_sum))

        fx = self.fx(x)

        # One forget gate per child: stack keeps the (num_children, mem_dim)
        # shape; cat on these 1-D rows would flatten everything into one vector.
        f = torch.stack([self.fh(child_hi) + fx for child_hi in child_h], dim=0)
        # Each forget gate has to multiply its child's cell state
        # (c = i*u + sum_k f_k * c_k); the raw gate was being summed instead.
        fc = torch.sigmoid(f) * child_c

        c = i * u + torch.sum(fc, dim=0)
        h = o * torch.tanh(c)

        return c, h
    
    def getChildStates(self, node):
        if node.num_children == 0:
            # Leaf: a single zero state stands in for the (empty) children.
            child_c = torch.zeros(1, self.mem_dim)
            child_h = torch.zeros(1, self.mem_dim)
            if self.cudaFlag:
                child_c, child_h = child_c.to(self.device), child_h.to(self.device)
        else:
            # Stack the already-computed child states instead of writing into
            # an uninitialized tensor in place.
            child_c = torch.stack([child.state[0] for child in node.childrenList], dim=0)
            child_h = torch.stack([child.state[1] for child in node.childrenList], dim=0)
        return child_c, child_h
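
For context, this is roughly how the model and optimizer are wired up (a sketch of my setup; the dimensions, Adam, and the learning rate are placeholders for my actual values, and OutputModule, emb, labels, and labelMap are defined elsewhere in my notebook):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
model = treeEncoder(torch.cuda.is_available(), in_dim=300, mem_dim=150,
                    emb=emb, labels=labels, labelMap=labelMap,
                    criterion=criterion, device=device)
# If emb is a plain tensor rather than an nn.Embedding/nn.Parameter,
# model.parameters() only contains the Linear layers (and OutputModule),
# so the embeddings themselves never get updated.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)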

from tqdm import tqdm_notebook

for i in range(epochs):
    train_losses = []
    val_losses = []

    for tree in tqdm_notebook(x_train):
        optimizer.zero_grad()

        # forward() returns (node.state, loss) and node.state is (c, h),
        # so unpack in that order; the loss is already computed inside
        # forward() against labelMap, so no extra label is needed here.
        (c, h), loss = model(tree.root)

        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
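
Finally, a minimal overfit-one-sample check (again just a sketch reusing model and optimizer from above; single_tree stands for any one tree from x_train). If the loss cannot decrease even on a single tree, the problem is in the loss/optimizer wiring rather than in the data or the learning rate.

single_tree = x_train[0]
for step in range(50):
    optimizer.zero_grad()
    (c, h), loss = model(single_tree.root)
    loss.backward()
    optimizer.step()
    print(step, loss.item())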