Your module has parameters that were not used in producing loss

Why can't the gradient backpropagate to self.fc3 in the module below? (A standalone sketch of the same fc3 pattern follows the code.)
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class MatchingMargin(nn.Module):
    def __init__(self, s=1, m=0.1, type='prod'):
        super(MatchingMargin, self).__init__()

        self.type = type
        self.s = nn.Parameter(torch.randn(1,)) if s is None else s
        self.m = m
        self.prototype = nn.Parameter(torch.randn(260, 256))
        self.fc3 = nn.Linear(272, 128)

        # build one tree-structure encoding per character of the alphabet
        # (get_alphabet() and DecomposeTree() are defined elsewhere in the project)
        alphabet = get_alphabet()
        dt_tree = DecomposeTree()

        label_encodings = []
        for char in alphabet:
            char_embedding = dt_tree.query_feature(char)
            label_encodings.append(char_embedding)

        # label_encodings = label_encodings[:template.size(0)]

        self.label_encodings = torch.from_numpy(
            np.stack(label_encodings, 0),
        ).float()

        # print(self.label_encodings.requires_grad)
        self.label_encodings = nn.Parameter(self.label_encodings)

        # self.register_buffer('W', self.label_encodings)

    def forward(self, input, template, label=None, train=True):
        device = input.device
        # self.W = self.W[:template.size(0)].to(device)
        label_encodings = self.label_encodings[:template.size(0)].to(device)
        tree_vec = self.fc3(label_encodings)

        if self.type == 'prod':
            cosine = F.linear(input, template)
            dis = F.linear(input, tree_vec)
            cosine = cosine + dis
        elif self.type == 'cosine':
            cosine = F.linear(F.normalize(input), F.normalize(template))
        elif self.type == 'euclid':
            cosine = torch.norm(input.unsqueeze(1) - template.unsqueeze(0), p=2, dim=2)
            # print('input', input.size(), 'template', template.size(), 'cosine', cosine.size())
        else:
            raise NotImplementedError()

        if label is not None and np.fabs(self.m) > 1e-4:
            one_hot = torch.zeros_like(cosine)
            one_hot.scatter_(1, label.view(-1, 1), 1.0)
            cosine = cosine - one_hot * self.m

        output = cosine * self.s
        return output
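
For reference, here is the fc3 pattern in isolation as a minimal standalone sketch; the shapes (272-in/128-out like fc3, 260 rows like label_encodings, a dummy 8x128 feature batch) are made up and only the 'prod' branch is mirrored:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Minimal standalone sketch, 'prod' branch only; all shapes here are invented
# (272-in / 128-out like fc3, 260 rows like label_encodings), not real data.
fc3 = nn.Linear(272, 128)
label_encodings = nn.Parameter(torch.randn(260, 272))

input = torch.randn(8, 128)          # dummy feature batch matching fc3's output size
tree_vec = fc3(label_encodings)      # (260, 128)
dis = F.linear(input, tree_vec)      # (8, 260)
dis.sum().backward()

print(fc3.weight.grad is not None)   # prints True: the backward pass reaches fc3 here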

Are the parameters in self.fc3 not receiving any gradients, or is this layer not backpropagating them? In the latter case, I would guess the label_encodings might not be differentiable.
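
One quick way to tell is to run a single forward/backward step and list which parameters never received a gradient. A rough sketch (`model` and `loss` are placeholders for your own module, or the DDP-wrapped model, and the loss you already compute):

# Rough diagnostic sketch: `model` and `loss` are placeholders for your own
# module (or the DDP-wrapped model) and the loss from one training step.
loss.backward()

for name, p in model.named_parameters():
    if p.grad is None:
        print('no gradient:', name)
    else:
        print(name, 'grad norm:', p.grad.norm().item())

If fc3.weight and fc3.bias show up without a gradient, the backward pass never reaches them.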