Submodule's parameters do not seem to be updated during training

Hi, I wrote a module based on this article: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

The idea is to pass the input through multiple parallel streams, concatenate their outputs, and connect the result to an FC layer. I divided my source code into 3 custom modules: TextClassifyCnnNet >> FlatCnnLayer >> FilterLayer

class FilterLayer(nn.Module):
    """Single convolution stream: Conv2d -> ReLU -> MaxPool2d.

    The convolution kernel is ``(filter_size, embedding_size)`` with one input
    channel, and the pooling window is ``(sequence_length - filter_size + 1, 1)``.
    Assumes inputs shaped ``(batch, 1, sequence_length, embedding_size)`` so
    that each channel is reduced to a single value -- TODO confirm with callers.

    Args:
        filter_size: height of the convolution kernel.
        embedding_size: width of the convolution kernel.
        sequence_length: used to size the pooling window.
        out_channels: number of convolution filters.
    """

    def __init__(self, filter_size, embedding_size, sequence_length, out_channels=128):
        super(FilterLayer, self).__init__()

        conv = nn.Conv2d(1, out_channels, (filter_size, embedding_size))
        pool_height = sequence_length - filter_size + 1

        self.model = nn.Sequential(
            conv,
            nn.ReLU(inplace=True),
            nn.MaxPool2d((pool_height, 1), stride=1),
        )

        # Re-draw the convolution weights from N(0, sqrt(2 / n)), where n is
        # kernel height * kernel width * number of output channels.
        kernel_h, kernel_w = conv.kernel_size
        n_weights = kernel_h * kernel_w * conv.out_channels
        conv.weight.data.normal_(0, math.sqrt(2. / n_weights))

    def forward(self, x):
        """Apply the conv/ReLU/pool pipeline to ``x``."""
        return self.model(x)
class FlatCnnLayer(nn.Module):
    """Run the input through several ``FilterLayer`` streams and concatenate them.

    One ``FilterLayer`` is built per entry of ``filter_sizes``. Each stream's
    output is flattened per sample, the streams are concatenated along the
    feature axis, and dropout is applied to the result.

    The dropout probability is read from the module-level ``dropout_prob``.

    Args:
        embedding_size: forwarded to every ``FilterLayer``.
        sequence_length: forwarded to every ``FilterLayer``.
        filter_sizes: iterable of kernel heights, one stream per entry.
        out_channels: number of filters per stream.
    """

    def __init__(self, embedding_size, sequence_length, filter_sizes=(3, 4, 5), out_channels=128):
        super(FlatCnnLayer, self).__init__()

        self.filter_layers = nn.ModuleList(
            [FilterLayer(filter_size, embedding_size, sequence_length, out_channels=out_channels)
             for filter_size in filter_sizes])

    def forward(self, x):
        """Return the concatenated, flattened stream outputs with dropout applied.

        Dropout follows the module's train/eval mode, so it is disabled after
        ``eval()`` and evaluation results are deterministic.
        """
        n_samples = x.size()[0]

        # Flatten every stream to (batch, features) and join them side by side.
        pools = [filter_layer(x).view(n_samples, -1) for filter_layer in self.filter_layers]
        x = torch.cat(pools, dim=1)

        # training=self.training (not a hard-coded True) so that eval() really
        # switches dropout off.
        x = F.dropout(x, p=dropout_prob, training=self.training)

        return x
class TextClassifyCnnNet(nn.Module):
    """Text classifier: a ``FlatCnnLayer`` feature extractor plus a linear output layer.

    The feature extractor is also exposed as ``self.flat_layer`` so it can be
    accessed on its own.

    Args:
        embedding_size: forwarded to ``FlatCnnLayer``.
        sequence_length: forwarded to ``FlatCnnLayer``.
        num_classes: number of outputs of the linear layer.
        filter_sizes: forwarded to ``FlatCnnLayer``; its length also sizes the
            linear layer's input.
        out_channels: forwarded to ``FlatCnnLayer``; filters per stream.
    """

    def __init__(self, embedding_size, sequence_length, num_classes, filter_sizes=[3, 4, 5], out_channels=128):
        super(TextClassifyCnnNet, self).__init__()

        self.flat_layer = FlatCnnLayer(
            embedding_size,
            sequence_length,
            filter_sizes=filter_sizes,
            out_channels=out_channels,
        )

        # The linear layer takes out_channels inputs per filter size.
        n_features = out_channels * len(filter_sizes)
        classifier = nn.Linear(n_features, num_classes)

        self.model = nn.Sequential(self.flat_layer, classifier)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.model(x)


def fit(net, data, save_path, lr=0.01, weight_decay=0.1):
    """Train ``net`` with Adam, report validation/test metrics and save the features.

    The batch size, number of epochs and reporting interval come from the
    module-level ``batch_size``, ``n_epochs`` and ``display_step``. Samples
    beyond the last full batch are not used for training.

    Args:
        net: model to train; must expose a ``flat_layer`` submodule, whose
            state dict is what gets saved.
        data: mapping with the keys 'X_train', 'Y_train', 'X_valid',
            'Y_valid', 'X_test' and 'Y_test' (presumably NumPy arrays, since
            the helpers convert them with ``torch.from_numpy`` -- verify).
        save_path: destination passed to ``torch.save``.
        lr: Adam learning rate.
        weight_decay: L2 penalty coefficient handed to Adam. The default is
            kept for backward compatibility. NOTE(review): a penalty this
            strong was reported to keep the loss from decreasing; try a much
            smaller value (or 0) if training stalls.
    """
    if torch.cuda.is_available():
        net = net.cuda()

    for param in net.parameters():
        print(type(param.data), param.size())

    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    X_train, X_test = data['X_train'], data['X_test']
    Y_train, Y_test = data['Y_train'], data['Y_test']

    X_valid, Y_valid = data['X_valid'], data['Y_valid']

    n_batch = len(X_train) // batch_size

    for epoch in range(1, n_epochs + 1):  # loop over the dataset multiple times
        net.train()

        for batch_idx in range(1, n_batch + 1):
            # get the inputs for this (1-based) batch
            start = (batch_idx - 1) * batch_size
            end = start + batch_size
            x, y = X_train[start:end], Y_train[start:end]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            predicts = _get_predict(net, x)
            loss = _get_loss(predicts, y)
            loss.backward()
            optimizer.step()

            if batch_idx % display_step == 0:
                # batch_idx is 1-based, so dividing by n_batch reaches 100% on
                # the last batch.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(x), len(X_train), 100. * batch_idx / n_batch, loss.data[0]))

        # print statistics on the first epoch and then every display_step epochs
        if epoch % display_step == 0 or epoch == 1:
            net.eval()
            valid_predicts = _get_predict(net, X_valid)
            valid_loss = _get_loss(valid_predicts, Y_valid)
            valid_accuracy = _get_accuracy(valid_predicts, Y_valid)
            print('\r[%d] loss: %.3f - accuracy: %.2f' % (epoch, valid_loss.data[0], valid_accuracy * 100))

    print('\rFinished Training\n')

    net.eval()

    test_predicts = _get_predict(net, X_test)
    test_loss = _get_loss(test_predicts, Y_test).data[0]
    test_accuracy = _get_accuracy(test_predicts, Y_test)
    print('Test loss: %.3f - Test accuracy: %.2f' % (test_loss, test_accuracy * 100))

    # Only the feature extractor is persisted, not the final linear layer.
    torch.save(net.flat_layer.state_dict(), save_path)


def _get_accuracy(predicts, labels):
    predicts = torch.max(predicts, 1)[1].data[0]
    return np.mean(predicts == labels)


def _get_predict(net, x):
    # wrap them in Variable
    inputs = torch.from_numpy(x).float()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available:
        inputs = inputs.cuda()
    inputs = Variable(inputs)
    return net(inputs)


def _get_loss(predicts, labels):
    labels = torch.from_numpy(labels).long()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available:
        labels = labels.cuda()
    labels = Variable(labels)
    return F.cross_entropy(predicts, labels)

It seems the parameters do not change at all. I have tried to print their .grad, but it returns None.

The first thing I would verify is that the loop that contains:

            # forward + backward + optimize
            predicts = self.get_predict(x)
            loss = self.get_loss(predicts, y)
            loss.backward()
            optimizer.step() 

Is running as you expect. (is n_batch set to 0?) Is it looping?

Assuming that is working correctly I would examine the actual output of the loss function:
self.get_loss(...)

And ensure it is outputting a loss signal that makes sense.

Assuming all of the above checks out I would examine the output of self.model.parameters() … and ensure it is returning the modules / weights as you expect.

for param in list(self.model.parameters()):
    print(type(param.data), param.size())

n_batch = len(X_train) // batch_size
print "n_batch: ", n_batch

Output:

(<class ‘torch.cuda.FloatTensor’>, (128L, 1L, 3L, 128L))
(<class ‘torch.cuda.FloatTensor’>, (128L,))
(<class ‘torch.cuda.FloatTensor’>, (128L, 1L, 4L, 128L))
(<class ‘torch.cuda.FloatTensor’>, (128L,))
(<class ‘torch.cuda.FloatTensor’>, (128L, 1L, 5L, 128L))
(<class ‘torch.cuda.FloatTensor’>, (128L,))
(<class ‘torch.cuda.FloatTensor’>, (13L, 384L))
(<class ‘torch.cuda.FloatTensor’>, (13L,))
n_batch: 23

It contains all the parameters of the conv layers and the FC layer, including the biases.

Did you verify that all the contents within fit(…) are being called correctly? Like, is n_epochs set > 0 ?

Perhaps add to:

for _ in range(n_batch):
# get the inputs
x, y = X_train[start:end], Y_train[start:end]
print(_) # ← add here, verify inner loop is being called

Of course, I added a print there to make sure everything is correct.

I just tried replacing cross_entropy with nll_loss and refactoring the forward function, but why does it return a different loss value (which is still wrong)?

    def forward(self, x):
        # Run the input through the wrapped model to get the raw class scores.
        x = self.model(x)

        # Return log-probabilities instead of raw scores.
        # NOTE(review): no `dim` is passed, so the softmax axis is chosen
        # implicitly -- confirm it is the class axis of the model's output.
        return F.log_softmax(x)

I have a similar implementation in TensorFlow, so I know how the loss value should drop when training works correctly.

I realised that the L2 penalty (weight_decay) is what keeps the loss value from changing. It works when I remove the L2 penalty:

# optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

I think it’s a bug in PyTorch, but this result is still acceptable for me, so I will close this topic.