The eval metric is always the same, and I found that the model never gets optimized.
I stepped through the input flow to the model and found that all Variables have a grad_fn. Then I called the backward function.
I tried to inspect the gradients with
[p.grad.mean() for p in params if p.grad is not None]
and the values shown are really small, roughly 1e-3 to 1e-9.
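For reference, this is roughly the check I run after loss.backward() (a minimal sketch; report_grads is just an illustrative helper name, and it goes over model.named_parameters() so each value is tied to a parameter name):

def report_grads(model):
    # Print the mean absolute gradient of each parameter by name,
    # so parameters with missing or tiny gradients are easy to spot.
    for name, p in model.named_parameters():
        if p.grad is None:
            print('{}: no grad'.format(name))
        else:
            print('{}: mean abs grad = {:.3e}'.format(name, float(p.grad.data.abs().mean())))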
So maybe my model is wrong? I have checked it many times, and it seems there is no problem.
What can I do next?
Post your code, then people may be able to help you.
This is the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class AttentionNet(nn.Module):
    def __init__(self, args):
        super(AttentionNet, self).__init__()
        self.use_gpu = args.use_gpu
        self.embd_size = args.w_embd_size
        self.batch_size = args.batch_size
        self.d = self.embd_size * 2  # word_embedding + char_embedding
        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)
        self.highway_net = Highway(self.d)
        self.ctx_embd_layer = nn.GRU(self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)
        self.W = nn.Parameter(torch.rand(1, 6 * self.d, 1).type(torch.FloatTensor),
                              requires_grad=True)  # (N, 6d, 1) for bmm (N, T*J, 6d)
        self.modeling_layer = nn.GRU(8 * self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)
        self.p1_layer = nn.Linear(10 * self.d, 1)
        self.p2_lstm_layer = nn.GRU(2 * self.d, self.d, bidirectional=True, dropout=0.2, batch_first=True)
        self.p2_layer = nn.Linear(10 * self.d, 1)
    def build_contextual_embd(self, x_c, x_w):
        # 1. Character Embedding Layer
        char_embd = self.char_embd_net(x_c)  # (N, seq_len, embd_size)
        # 2. Word Embedding Layer
        word_embd = self.word_embd_net(x_w)  # (N, seq_len, embd_size)
        # Highway Networks for 1. and 2.
        embd = torch.cat((char_embd, word_embd), 2)  # (N, seq_len, d==embd_size*2)
        embd = self.highway_net(embd)
        # 3. Contextual Embedding Layer
        ctx_embd_out, _h = self.ctx_embd_layer(embd)
        return ctx_embd_out
    def forward(self, ctx_c, ctx_w, query_c, query_w):
        batch_size = ctx_c.size(0)
        T = ctx_w.size(1)    # context sentence length (word level)
        J = query_w.size(1)  # query sentence length (word level)

        # 1. Character Embedding Layer
        # 2. Word Embedding Layer
        # 3. Contextual Embedding Layer
        embd_context = self.build_contextual_embd(ctx_c, ctx_w)    # (N, T, 2d)
        embd_query = self.build_contextual_embd(query_c, query_w)  # (N, J, 2d)

        # 4. Attention Flow Layer
        # Make a similarity matrix
        shape = (batch_size, T, J, 2 * self.d)            # (N, T, J, 2d)
        embd_context_ex = embd_context.unsqueeze(2)       # (N, T, 1, 2d)
        embd_context_ex = embd_context_ex.expand(shape)   # (N, T, J, 2d)
        embd_query_ex = embd_query.unsqueeze(1)           # (N, 1, J, 2d)
        embd_query_ex = embd_query_ex.expand(shape)       # (N, T, J, 2d)
        a_elmwise_mul_b = torch.mul(embd_context_ex, embd_query_ex)  # (N, T, J, 2d)
        cat_data = torch.cat((embd_context_ex, embd_query_ex, a_elmwise_mul_b), 3)  # (N, T, J, 6d), [h;u;h◦u]
        cat_data = cat_data.view(batch_size, -1, 6 * self.d)  # (N, T*J, 6d)
        S = torch.bmm(cat_data, self.W.expand(batch_size, 6 * self.d, 1))  # (N, T*J, 1)
        S = S.view(batch_size, T, J)  # (N, T, J), squeeze the last dim
        S = F.softmax(S, dim=2)

        # Context2Query
        c2q = torch.bmm(S, embd_query)  # (N, T, 2d) = bmm( (N, T, J), (N, J, 2d) )

        # Query2Context
        # b: attention weights on the context
        tmp_b = torch.max(S, 2)[0]  # (N, T)
        b = F.softmax(tmp_b, dim=1)
        q2c = torch.bmm(b.unsqueeze(1), embd_context)  # (N, 1, 2d) = bmm( (N, 1, T), (N, T, 2d) )
        q2c = q2c.repeat(1, T, 1)  # (N, T, 2d), tiled T times

        # G: query-aware representation of each context word
        G = torch.cat((embd_context, c2q, embd_context.mul(c2q), embd_context.mul(q2c)), 2)  # (N, T, 8d)

        # 5. Modeling Layer
        M, _h = self.modeling_layer(G)  # M: (N, T, 2d)

        # 6. Output Layer
        G_M = torch.cat((G, M), 2)            # (N, T, 10d)
        G_M = G_M.view(batch_size * T, -1)    # (N * T, 10d)
        p1 = self.p1_layer(G_M)               # (N * T, 1)
        p1 = p1.view(batch_size, T)           # (N, T)
        M2, _ = self.p2_lstm_layer(M)         # (N, T, 2d)
        G_M2 = torch.cat((G, M2), 2)          # (N, T, 10d)
        G_M2 = G_M2.view(batch_size * T, -1)  # (N * T, 10d)
        p2 = self.p2_layer(G_M2)              # (N * T, 1)
        p2 = p2.view(batch_size, T)           # (N, T)
        return p1, p2
class Highway(nn.Module):
    def __init__(self, in_size, n_layers=2, act=F.relu, gate_act=F.sigmoid):
        super(Highway, self).__init__()
        self.n_layers = n_layers
        self.act = act
        self.gate_act = gate_act
        self.normal_layer = nn.ModuleList([nn.Linear(in_size, in_size) for _ in range(n_layers)])
        self.gate_layer = nn.ModuleList([nn.Linear(in_size, in_size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(in_size, in_size) for _ in range(n_layers)])

    def forward(self, x):
        for i in range(self.n_layers):
            normal_layer_ret = self.act(self.normal_layer[i](x))
            gate = self.gate_act(self.gate_layer[i](x))
            x = gate * normal_layer_ret + (1 - gate) * x
        return x
# In : (N, sentence_len, word_len, vocab_size_c)
# Out: (N, sentence_len, c_embd_size)
class CharEmbedding(nn.Module):
    def __init__(self, args):
        super(CharEmbedding, self).__init__()
        assert args.out_chs // len(args.filters) == args.out_chs / len(args.filters), 'out_chs must be divisible by the number of filters'
        self.out_dim = args.out_chs // len(args.filters)
        self.embd_size = args.c_embd_size
        self.embedding = nn.Embedding(args.vocab_size_c, self.embd_size)
        self.char_conv = nn.ModuleList([nn.Conv2d(1, self.out_dim, (self.embd_size, f)) for f in args.filters])
        # if args.use_gpu:
        #     self.char_conv = self.char_conv.cuda()
        self.dropout = nn.Dropout(args.dropout)
        self.fc1 = nn.Linear(self.out_dim * len(args.filters), 1)

    def forward(self, x):
        # x: (N, seq_len, word_len)
        N, seq_len, word_len = x.size()
        x = x.view(-1, word_len)            # (N*seq_len, word_len)
        x = self.embedding(x)               # (N*seq_len, word_len, c_embd_size)
        x = x.transpose(1, 2).contiguous()  # (N*seq_len, c_embd_size, word_len)
        x = x.unsqueeze(1)                  # (N*seq_len, 1, c_embd_size, word_len)
        x = [F.adaptive_max_pool1d(F.relu(conv(x)).squeeze(), 1).squeeze() for conv in self.char_conv]
        x = torch.cat(x, 1)
        x = x.view(N, seq_len, -1)          # (N, seq_len, c_embd_size)
        x = self.dropout(x)
        return x
# In : (N, sentence_len, vocab_size_w)
# Out: (N, sentence_len, embd_size)
class WordEmbedding(nn.Module):
    def __init__(self, args, is_train_embd=False):
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(args.vocab_size_w, args.w_embd_size)
        if args.pre_embd_w is not None:
            self.embedding.weight = nn.Parameter(args.pre_embd_w, requires_grad=is_train_embd)

    def forward(self, x):
        return F.relu(self.embedding(x))
This is the training code; the input is just generated for a simple test:
model = AttentionNet(args)
criterion = nn.CrossEntropyLoss()
params = list(filter(lambda p: p.requires_grad, model.parameters()))  # keep a list so it can be reused for the grad check
optimizer = torch.optim.Adam(params)

c = Variable(torch.ones(4, 200, 10).long())
w = Variable(torch.ones(4, 200).long())
q_c = Variable(torch.ones(4, 50, 10).long())
q_w = Variable(torch.ones(4, 50).long())

optimizer.zero_grad()
o_1, o_2 = model(c, w, q_c, q_w)
target1 = Variable(torch.ones(4).long())
target2 = Variable(torch.ones(4).long())
loss = criterion(o_1, target1) + criterion(o_2, target2)
loss.backward()
optimizer.step()
A little long, but it is just a composition of some simple layers.
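A quick sanity check that can be run with the objects above (a minimal sketch; the 50 steps are arbitrary): repeatedly fit the same batch and confirm that the loss actually decreases. If the gradients were effectively zero, the printed loss would stay flat.

# Sanity check: repeatedly fit the same batch and watch the loss.
for step in range(50):
    optimizer.zero_grad()
    o_1, o_2 = model(c, w, q_c, q_w)
    loss = criterion(o_1, target1) + criterion(o_2, target2)
    loss.backward()
    optimizer.step()
    print('step {}: loss = {:.4f}'.format(step, float(loss.data)))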