I wrote a model that uses convolutional filters of different sizes, from 1 to `max_conv_size`, and implemented it as follows:

```
def __init__(
self,
embedding_dim,
hidden_dim=200,
dropout_rate=0.3,
max_conv_size=3,
criterion=nn.L1Loss(),
optimizer = optim.Adadelta,
):
""" This architecture performs 2D convolutions using 1D kernels with size 1,...,max_conv_size """
super(AuthorConvNet, self).__init__()
self.criterion = criterion
self.optimizer = optimizer
self.dropout = nn.Dropout(dropout_rate)
self.max_conv_size = max_conv_size
self.max_pool = lambda x: torch.max(x, dim=len(x.size())-2)[0] # max_pool here is the max over the input raws
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
# max_conv_size 1D convolutional filters of size 1,...,max_conv_size
self.convs = [nn.Conv2d(1,1,(i,1)) for i in range(1, self.max_conv_size+1)]
self.relu = nn.ReLU()
self.linear1 = nn.Linear(embedding_dim*max_conv_size, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
def forward(self, x):
    """Run every convolution over ``x``, max-pool each result, and regress.

    Args:
        x: Input tensor. Assumed to be ``(batch, seq_len, embedding_dim)``
            with ``seq_len`` at least the largest kernel height -- TODO
            confirm against the caller.

    Returns:
        The output of the final linear layer; ``(batch, 1)`` under the
        shape assumption above.

    Note:
        ``self.dropout`` is applied here, so if it is an ``nn.Dropout`` in
        training mode the output is stochastic; switch the model to eval
        mode for repeatable predictions.
    """
    # Insert the in_channels axis expected by the 2D convolutions
    # (axis 0 is the batch).
    z = x.unsqueeze(1)
    pooled = []
    for convolution in self.convs:  # one convolution per kernel size
        # Drop ONLY the channel axis. A bare squeeze() would also drop the
        # batch axis for a batch of one, and the height axis when the
        # sequence is exactly as long as the kernel, which makes the
        # pooling below reduce over the wrong axis.
        feature_map = convolution(z).squeeze(1)
        pooled.append(self.max_pool(self.relu(feature_map)))
    # Concatenate the pooled features of all kernel sizes on the last axis.
    p = torch.cat(pooled, dim=-1)
    # multi-layer net
    out = self.linear1(p)
    out = self.dropout(out)
    out = self.relu(out)
    out = self.linear2(out)
    out = self.dropout(out)
    out = self.relu(out)
    out = self.linear3(out)
    return out
```

However, after training, using the model for predictions yields different results for the same input. I looked at the `state_dict` saved after training, and only the linear layers' weights are saved. This explains why I get different results each time: the network's convolutions are randomly reinitialized every time. Does anyone have an explanation for this?