How to concatenate LSTM output with a Linear output?

import torch
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
  def __init__(self,weight_matrix=embedding_matrix,hidden_dim=12):
    super().__init__()

    vocab_size = weight_matrix.shape[0]
    vector_dim = weight_matrix.shape[1]

    #text data
    self.hidden_dim = 12
    self.embedding = nn.Embedding(vocab_size,vector_dim)
    self.embedding.weight.data.copy_(torch.from_numpy(weight_matrix))
    self.embedding.weight.requires_grad = False

    self.lstm = nn.LSTM(input_size = vector_dim, hidden_size = hidden_dim,num_layers=1,batch_first=True)

    # categorical inputs
    # make a 4d vector for each school_state input
    # 51 is the size of dictionary we keep 
    self.state_embedding = nn.Embedding(51,2)
    self.prefix_embedding =  nn.Embedding(5,3)
    self.cat_embedding = nn.Embedding(50,26)
    self.sub_cat_embedding = nn.Embedding(401,20)
    self.grade_embedding = nn.Embedding(4,2)
    #numerical inputs
    self.numeric = nn.Linear(3,12)

    **self.linear1 = nn.Linear(hidden_dim , 128)**
    self.linear2 = nn.Linear(128,32)
    self.linear3 = nn.Linear(32,2)

  def forward(self,text,state,prefix,cat,sub_cat,grade,num):
    x1 = self.embedding(text)
    lstm_out, (h,c) = self.lstm(x1) #lstm_out #[128, 250, 12]
    out = lstm_out.contiguous().view(-1,self.hidden_dim) 
    
   
    x2 = self.state_embedding(state) #[128,2]
    x3 = self.prefix_embedding(prefix) #[128,3]
    x4 = self.cat_embedding(cat) ##[128,26]
    x5 = self.sub_cat_embedding(sub_cat) #[128,20]
    x6 = self.grade_embedding(grade) ##[128,2]
    x7 = self.numeric(num)
    print(x7.shape)

    combined = torch.cat((out,x2,x3,x4,x5,x6,x7),axis=1)
    
    x = F.relu(self.linear1(out))

    x = F.relu(self.linear2(x))
    x = torch.sigmoid(self.linear3(x))[:,-1]
    return x

In the bolded code portion I would like to flatten the LSTM output to dim (batch_size, -1).
How can I do that? Can anyone help, please? I am actually trying to concatenate the different inputs.

lstm_out contains the last hidden states (last w.r.t. the number of layers) of all time steps. So

out = lstm_out.contiguous().view(-1,self.hidden_dim) 

will have a shape of (batch_size*seq_len, hidden_dim). Depending on what you want, you can change it to

out = lstm_out.contiguous().view(batch_size, -1)

resulting in a shape of (batch_size, seq_len*hidden_dim). Again, this would mean you consider all time steps for further processing. If you want only the last hidden state (last w.r.t. the number of layers AND time steps), you could do:

out = lstm_out[:, -1, :]

which will give you a shape of (batch_size, hidden_dim).
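
For illustration, here's a minimal toy sketch (with made-up sizes, not the ones from your model) showing what each of these reshapes produces:

import torch
import torch.nn as nn

# Hypothetical sizes: batch_size=4, seq_len=10, input_size=8, hidden_dim=12
lstm = nn.LSTM(input_size=8, hidden_size=12, num_layers=1, batch_first=True)
x = torch.randn(4, 10, 8)

lstm_out, (h, c) = lstm(x)                       # lstm_out: (batch, seq_len, hidden_dim)
print(lstm_out.shape)                            # torch.Size([4, 10, 12])

print(lstm_out.contiguous().view(-1, 12).shape)  # torch.Size([40, 12])  -> (batch*seq_len, hidden_dim)
print(lstm_out.contiguous().view(4, -1).shape)   # torch.Size([4, 120])  -> (batch, seq_len*hidden_dim)
print(lstm_out[:, -1, :].shape)                  # torch.Size([4, 12])   -> last time step only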


@chris, Thanks for the information. If I use

out = lstm_out.contiguous().view(batch_size, -1)

out will be a tensor of dim (batch_size, seq_len * hidden_dim). Suppose I concatenate this with the output of nn.Linear(10,20) and I want to pass the concatenated output through another nn.Linear(a,b).

How can I dynamically initialize dimension 'a', as it also depends on seq_len indirectly? (Actually, a will be (seq_len * hidden_dim) + 20.) But how can I pass this during initialization?

Well, I don’t quite understand your code. For example, combined is never used afterwards. It’s also not quite clear to me whether all your sequences have the same length. If they do all have the same length, you could of course do something like

self.linear1 = nn.Linear(seq_len*hidden_dim , 128)

If your sequences do not have the same length, but there is a maximum sequence length max_seq_len, then you can try

self.linear1 = nn.Linear(max_seq_len*hidden_dim , 128)

and then pad all sequences that are shorter than max_seq_len.
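
As a rough sketch of that padding idea (here applying F.pad to the LSTM output along the time dimension; the sizes are made up for illustration):

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_dim, max_seq_len = 12, 250                        # hypothetical sizes
linear1 = nn.Linear(max_seq_len * hidden_dim, 128)

lstm_out = torch.randn(8, 200, hidden_dim)               # a batch whose seq_len (200) < max_seq_len
pad_len = max_seq_len - lstm_out.size(1)
lstm_out = F.pad(lstm_out, (0, 0, 0, pad_len))           # pad the seq dimension on the right with zeros
out = lstm_out.contiguous().view(lstm_out.size(0), -1)   # (batch, max_seq_len * hidden_dim)

print(linear1(out).shape)                                # torch.Size([8, 128])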

Yeah, the first part of your answer cleared my doubt.

class Network(nn.Module):
  def __init__(self,weight_matrix=embedding_matrix,hidden_dim=128):
    super().__init__()

    vocab_size = weight_matrix.shape[0]
    vector_dim = weight_matrix.shape[1]

    #text data
    self.hidden_dim = hidden_dim
    seq_len = 250
    self.embedding = nn.Embedding(vocab_size,vector_dim)
    self.embedding.weight.data.copy_(torch.from_numpy(weight_matrix))
    self.embedding.weight.requires_grad = False

    self.lstm = nn.LSTM(input_size = vector_dim, hidden_size = self.hidden_dim,num_layers=1,batch_first=True)

    # categorical inputs
    # make a 4d vector for each school_state input
    # 51 is the size of dictionary we keep 
    self.state_embedding = nn.Embedding(51,2)
    self.prefix_embedding =  nn.Embedding(5,3)
    self.cat_embedding = nn.Embedding(50,26)
    self.sub_cat_embedding = nn.Embedding(401,20)
    self.grade_embedding = nn.Embedding(4,2)
    #numerical inputs
    self.numeric = nn.Linear(3,12)
    
    self.linear1 = nn.Linear((self.hidden_dim * seq_len) + 65 , 128)

    self.linear2 = nn.Linear(128,128)
    self.linear3 = nn.Linear(128,256)
    self.linear4 = nn.Linear(256,64)
    self.bn = nn.BatchNorm1d(64)
    self.linear5 = nn.Linear(64,2)
    self.dropout = nn.Dropout(p=0.2)

  def forward(self,text,state,prefix,cat,sub_cat,grade,num):
    x1 = self.embedding(text)
    lstm_out, (h,c) = self.lstm(x1) #lstm_out #[120, 250, 128]
    out = lstm_out.contiguous() 
    out = out.flatten(start_dim=1)

    # print(out.shape)
   
    x2 = self.state_embedding(state).flatten(start_dim=1) #[120,2]
    x3 = self.prefix_embedding(prefix).flatten(start_dim=1) #[120,3]
    x4 = self.cat_embedding(cat).flatten(start_dim=1) ##[120,26]
    x5 = self.sub_cat_embedding(sub_cat).flatten(start_dim=1) #[120,20]
    x6 = self.grade_embedding(grade).flatten(start_dim=1) ##[120,2]
    x7 = self.numeric(num).flatten(start_dim=1)


    

    combined = torch.cat((out,x2,x3,x4,x5,x6,x7),axis=1)
    # print(combined.shape)
    
    x = F.relu(self.linear1(combined))
    x = F.relu(self.linear2(x))
    x = F.relu(self.linear3(x))
    x = self.dropout(x)
    x = F.relu(self.linear4(x))
    x = self.bn(x)
    x = torch.sigmoid(self.linear5(x))

    return x


In the first part of my code (the init part) I specify seq_len statically.

self.linear1 = nn.Linear((self.hidden_dim * seq_len) + 65 , 128)

Am I right about this portion? My doubt is that seq_len always depends on the input, and here our init code is not dynamic. But I think that's fine. Please correct me if I am wrong.

That’s what I meant: if all your inputs have a seq_len of 250, then there are no issues. If some inputs are shorter, you need to pad them to 250.
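
If it helps, here's one possible way to pad the token-id sequences to a fixed length of 250 before they go into the embedding/LSTM; pad_to_max_len and pad_idx are just illustrative names, not from your code:

import torch
from torch.nn.utils.rnn import pad_sequence

MAX_LEN = 250   # the fixed seq_len assumed in __init__

def pad_to_max_len(token_id_seqs, pad_idx=0):
    # token_id_seqs: list of 1-D LongTensors of varying lengths
    padded = pad_sequence(token_id_seqs, batch_first=True, padding_value=pad_idx)
    if padded.size(1) < MAX_LEN:
        extra = torch.full((padded.size(0), MAX_LEN - padded.size(1)), pad_idx, dtype=torch.long)
        padded = torch.cat([padded, extra], dim=1)
    return padded[:, :MAX_LEN]   # truncate anything longer than MAX_LEN

batch = [torch.randint(1, 100, (n,)) for n in (250, 180, 90)]
print(pad_to_max_len(batch).shape)   # torch.Size([3, 250])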
