Why is my model not optimizing?

The eval metric is always the same, and I found that the model never optimizes.
I stepped through the input flow of the model and found that all Variables have a grad_fn.
Then I called the backward function.
I tried to inspect the gradients with
[p.grad.mean() for p in params if p.grad is not None],
and the values shown are really small, around 1e-3 to 1e-9.
So maybe my model is wrong?
I have checked it many times, and there seems to be no problem.
What can I do next?
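
For reference, this is roughly how I inspect the gradients after calling backward (report_grads is just a small debugging helper I wrote, not part of the model):

def report_grads(model):
    # print each parameter's gradient norm; parameters that never
    # enter the graph show up here with grad None
    for name, p in model.named_parameters():
        if p.grad is None:
            print(name, 'no grad')
        else:
            print(name, 'grad norm: %.3e' % p.grad.data.norm())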

Post your code, then people may be able to help you.

This is the model:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class AttentionNet(nn.Module):
    def __init__(self, args):
        super(AttentionNet, self).__init__()
        self.use_gpu = args.use_gpu
        self.embd_size = args.w_embd_size
        self.batch_size = args.batch_size
        self.d = self.embd_size * 2  # word_embedding + char_embedding

        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)
        self.highway_net = Highway(self.d)
        self.ctx_embd_layer = nn.GRU(self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)

        self.W = nn.Parameter(torch.rand(1, 6 * self.d, 1))  # expanded to (N, 6d, 1) for bmm with (N, T*J, 6d)

        self.modeling_layer = nn.GRU(8 * self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)

        self.p1_layer = nn.Linear(10 * self.d, 1)
        self.p2_lstm_layer = nn.GRU(2 * self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)
        self.p2_layer = nn.Linear(10 * self.d, 1)

    def build_contextual_embd(self, x_c, x_w):
        # 1. Character Embedding Layer
        char_embd = self.char_embd_net(x_c)  # (N, seq_len, embd_size)

        # 2. Word Embedding Layer
        word_embd = self.word_embd_net(x_w)  # (N, seq_len, embd_size)

        # Highway Networks for 1. and 2.
        embd = torch.cat((char_embd, word_embd), 2)  # (N, seq_len, d==embd_size*2)
        embd = self.highway_net(embd)

        # 3. Contextual Embedding Layer
        ctx_embd_out, _h = self.ctx_embd_layer(embd)
        return ctx_embd_out

    def forward(self, ctx_c, ctx_w, query_c, query_w):
        batch_size = ctx_c.size(0)
        T = ctx_w.size(1)  # context sentence length (word level)
        J = query_w.size(1)  # query sentence length   (word level)

        # 1. Character Embedding Layer
        # 2. Word Embedding Layer
        # 3. Contextual Embedding Layer
        embd_context = self.build_contextual_embd(ctx_c, ctx_w)  # (N, T, 2d)
        embd_query = self.build_contextual_embd(query_c, query_w)  # (N, J, 2d)

        # 4. Attention Flow Layer
        # Make a similarity matrix
        shape = (batch_size, T, J, 2 * self.d)  # (N, T, J, 2d)
        embd_context_ex = embd_context.unsqueeze(2)  # (N, T, 1, 2d)
        embd_context_ex = embd_context_ex.expand(shape)  # (N, T, J, 2d)
        embd_query_ex = embd_query.unsqueeze(1)  # (N, 1, J, 2d)
        embd_query_ex = embd_query_ex.expand(shape)  # (N, T, J, 2d)
        a_elmwise_mul_b = torch.mul(embd_context_ex, embd_query_ex)  # (N, T, J, 2d)

        cat_data = torch.cat((embd_context_ex, embd_query_ex, a_elmwise_mul_b), 3)  # (N, T, J, 6d), [h;u;h◦u]
        cat_data = cat_data.view(batch_size, -1, 6 * self.d)  # (N, T*J, 6d)
        S = torch.bmm(cat_data, self.W.expand(batch_size, 6 * self.d, 1))  # (N, T*J, 1)
        S = S.view(batch_size, T, J)  # (N, T, J), squeeze last dim
        S = F.softmax(S, dim=2)  # attention weights over the query words for each context word

        # Context2Query
        c2q = torch.bmm(S, embd_query)  # (N, T, 2d) = bmm( (N, T, J), (N, J, 2d) )
        # Query2Context
        # b: attention weights on the context
        tmp_b = torch.max(S, 2)[0]  # (N, T)
        b = F.softmax(tmp_b, dim=1)
        q2c = torch.bmm(b.unsqueeze(1), embd_context)  # (N, 1, 2d) = bmm( (N, 1, T), (N, T, 2d) )
        q2c = q2c.repeat(1, T, 1)  # (N, T, 2d), tiled T times

        # G: query aware representation of each context word
        G = torch.cat((embd_context, c2q, embd_context.mul(c2q), embd_context.mul(q2c)), 2)  # (N, T, 8d)

        # 5. Modeling Layer
        M, _h = self.modeling_layer(G)  # M: (N, T, 2d)

        # 6. Output Layer
        G_M = torch.cat((G, M), 2)  # (N, T, 10d)
        G_M = G_M.view(batch_size * T, -1)  # (N * T, 10d)

        p1 = self.p1_layer(G_M)  # (N * T, 1)
        p1 = p1.view(batch_size, T)  # (N, T)

        M2, _ = self.p2_lstm_layer(M)  # (N, T, 2d)
        G_M2 = torch.cat((G, M2), 2)  # (N, T, 10d)
        G_M2 = G_M2.view(batch_size * T, -1)  # (N * T, 10d)

        p2 = self.p2_layer(G_M2)  # (N * T, 1)
        p2 = p2.view(batch_size, T)  # (N, T)

        return p1, p2


class Highway(nn.Module):
    def __init__(self, in_size, n_layers=2, act=F.relu, gate_act=F.sigmoid):
        super(Highway, self).__init__()
        self.n_layers = n_layers
        self.act = act
        self.gate_act = gate_act

        self.normal_layer = nn.ModuleList([nn.Linear(in_size, in_size) for _ in range(n_layers)])
        self.gate_layer = nn.ModuleList([nn.Linear(in_size, in_size) for _ in range(n_layers)])

    def forward(self, x):
        for i in range(self.n_layers):
            normal_layer_ret = self.act(self.normal_layer[i](x))
            gate = self.gate_act(self.gate_layer[i](x))

            # highway combination: gate * H(x) + (1 - gate) * x
            x = gate * normal_layer_ret + (1 - gate) * x
        return x


# In : (N, sentence_len, word_len) character indices
# Out: (N, sentence_len, out_chs)
class CharEmbedding(nn.Module):
    def __init__(self, args):
        super(CharEmbedding, self).__init__()
        assert args.out_chs % len(args.filters) == 0, 'out_chs must be divisible by the number of filters'
        self.out_dim = args.out_chs//len(args.filters)
        self.embd_size = args.c_embd_size
        self.embedding = nn.Embedding(args.vocab_size_c, self.embd_size)

        self.char_conv = nn.ModuleList([nn.Conv2d(1, self.out_dim, (self.embd_size, f)) for f in args.filters])
        # if args.use_gpu:
        #     self.char_conv = self.char_conv.cuda()
        self.dropout = nn.Dropout(args.dropout)

    def forward(self, x):
        # x: (N, seq_len, word_len)
        N, seq_len, word_len = x.size()
        x = x.view(-1, word_len)  # (N*seq_len, word_len)
        x = self.embedding(x)  # (N*seq_len, word_len, c_embd_size)
        x = x.transpose(1, 2).contiguous()  # (N*seq_len, c_embd_size, word_len)
        x = x.unsqueeze(1)  # (N*seq_len, 1, c_embd_size, word_len)

        # conv + ReLU, then max-pool over the word length; squeeze(2) keeps the batch dim intact
        x = [F.adaptive_max_pool1d(F.relu(conv(x)).squeeze(2), 1).squeeze(2) for conv in self.char_conv]
        x = torch.cat(x, 1)  # (N*seq_len, out_chs)
        x = x.view(N, seq_len, -1)  # (N, seq_len, out_chs)
        x = self.dropout(x)

        return x


# In : (N, sentence_len) word indices
# Out: (N, sentence_len, embd_size)
class WordEmbedding(nn.Module):
    def __init__(self, args, is_train_embd=False):
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(args.vocab_size_w, args.w_embd_size)
        if args.pre_embd_w is not None:
            self.embedding.weight = nn.Parameter(args.pre_embd_w, requires_grad=is_train_embd)

    def forward(self, x):
        # note: ReLU zeroes the negative components of the embeddings
        return F.relu(self.embedding(x))
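
For reference, AttentionNet expects an args object with the fields used above. Here is a minimal sketch with illustrative values (not my real config); the shape constraints from the code are that out_chs equals w_embd_size (so the char and word embeddings concatenate to d = 2 * w_embd_size) and that out_chs is divisible by len(filters):

from argparse import Namespace

args = Namespace(
    use_gpu=False,
    batch_size=4,
    w_embd_size=100,       # word embedding size (illustrative)
    c_embd_size=20,        # char embedding size (illustrative)
    vocab_size_w=1000,     # word vocab size (illustrative)
    vocab_size_c=100,      # char vocab size (illustrative)
    filters=[2, 3, 4, 5],  # conv filter widths; len(filters) must divide out_chs
    out_chs=100,           # == w_embd_size, so the concatenated embedding has size d
    dropout=0.2,
    pre_embd_w=None,       # no pre-trained word embeddings for this test
)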

This is the training code; the input is generated as a simple test:

model = AttentionNet(args)
criterion = nn.CrossEntropyLoss()
params = [p for p in model.parameters() if p.requires_grad]  # a list, so it can be iterated again for the grad check
optimizer = torch.optim.Adam(params)

c = Variable(torch.ones(4, 200, 10).long())
w = Variable(torch.ones(4, 200).long())
q_c = Variable(torch.ones(4, 50, 10).long())
q_w = Variable(torch.ones(4, 50).long())  # word-level input is (N, J), like w

optimizer.zero_grad()
o_1, o_2 = model(c, w, q_c, q_w)

target1 = Variable(torch.ones(4).long())
target2 = Variable(torch.ones(4).long())

loss = criterion(o_1, target1) + criterion(o_2, target2)
loss.backward()
optimizer.step()

A little long, but it is just a composition of some simple layers.
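
The snippet above takes only a single optimizer step, so to check whether the loss moves at all, I also looped it on the same fixed batch (a minimal sketch reusing the variables above; overfitting one dummy batch should drive the loss down if gradients flow):

for step in range(100):
    optimizer.zero_grad()
    o_1, o_2 = model(c, w, q_c, q_w)
    loss = criterion(o_1, target1) + criterion(o_2, target2)
    loss.backward()
    optimizer.step()
    if step % 10 == 0:
        print(step, loss.data[0])  # loss.item() on newer PyTorch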