I wrote a model using convolutional filters of different sizes, from 1 to max_size, and I implemented it the following way:
def __init__( self, embedding_dim, hidden_dim=200, dropout_rate=0.3, max_conv_size=3, criterion=nn.L1Loss(), optimizer = optim.Adadelta, ): """ This architecture performs 2D convolutions using 1D kernels with size 1,...,max_conv_size """ super(AuthorConvNet, self).__init__() self.criterion = criterion self.optimizer = optimizer self.dropout = nn.Dropout(dropout_rate) self.max_conv_size = max_conv_size self.max_pool = lambda x: torch.max(x, dim=len(x.size())-2) # max_pool here is the max over the input raws self.embedding_dim = embedding_dim self.hidden_dim = hidden_dim # max_conv_size 1D convolutional filters of size 1,...,max_conv_size self.convs = [nn.Conv2d(1,1,(i,1)) for i in range(1, self.max_conv_size+1)] self.relu = nn.ReLU() self.linear1 = nn.Linear(embedding_dim*max_conv_size, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) def forward(self, x): z = x.unsqueeze(1) # second dimension corresponds to in_channels, # so it should be 1 (1st is batch size) C =  for convolution in self.convs: # apply convolution for each kernel size between 1 and q conv = convolution(z).squeeze() C.append(conv) H = [self.relu(conv) for conv in C] p = [self.max_pool(h) for h in H] p = torch.cat(p, dim=len(p.size())-1) # multi-layer net out = self.linear1(p) out = self.dropout(out) out = self.relu(out) out = self.linear2(out) out = self.dropout(out) out = self.relu(out) out = self.linear3(out) return out
However, after training, it appears that using the model for predictions yield different results for same input. So I looked at the state_dict saved from training and only linear layers weights are saved (which explains why I obtain different results each time, the network convolutions are reinitialized randomly at each time). Does someone may have an explaination for this ?