Post-training static quantization error

When I do static quantization on BERT with PyTorch 1.6, an error occurs:
Could not run 'quantized::layer_norm' with arguments from the 'CPU' backend. 'quantized::layer_norm' is only available for these backends: [QuantizedCPU]
My model code is below:

class BertSoftmaxForNerQuantized(BertPreTrainedModel):
    def __init__(self, config):
        super(BertSoftmaxForNerQuantized, self).__init__(config)
        self.quant = torch.quantization.QuantStub()
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.loss_type = config.loss_type
        self.dequant = torch.quantization.DeQuantStub()


    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None):
        # outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        input_ids = self.quant(input_ids)
        attention_mask = self.quant(attention_mask)
        labels = self.quant(labels)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        # sequence_output = self.quant(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        logits = self.dequant(logits)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            assert self.loss_type in ['lsr', 'focal', 'ce']
            if self.loss_type == 'lsr':
                loss_fct = LabelSmoothingCrossEntropy(ignore_index=0)
            elif self.loss_type == 'focal':
                loss_fct = FocalLoss(ignore_index=0)
            else:
                loss_fct = CrossEntropyLoss(ignore_index=0)
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.contiguous().view(-1) == 1
                active_logits = logits.contiguous().view(-1, self.num_labels)[active_loss]
                active_labels = labels.contiguous().view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.contiguous().view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs  # (loss), scores, (hidden_states), (attentions)

The evaluation code is below:

    quantized_model.eval()
    quantized_model.qconfig = torch.quantization.default_qconfig
    print('quantized_model.qconfig',quantized_model.qconfig)
    torch.quantization.prepare(quantized_model,inplace=True)
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='train')
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    with torch.no_grad():
        for step, batch in enumerate(tqdm(train_dataloader,desc='post-training static quantization')):
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            outputs = quantized_model(**inputs)
            loss = outputs[0] 
            
    torch.quantization.convert(quantized_model, inplace=True)
    print_model_size(quantized_model)

    
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
                                 collate_fn=collate_fn)
    for step, batch in enumerate(eval_dataloader):
        model.eval()
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                outputs = quantized_model(**inputs)

Is there something wrong in my code, or is it just that my PyTorch version is too new?

Hi @HUSTHY, this error message means that an fp32 tensor is being passed into the quantized layernorm kernel. There are at least two options to address this:

  1. Add a torch.quantization.QuantStub() to convert the tensor to int8 before it enters the quantized layernorm layer. For example:
# init
...
self.quant = torch.quantization.QuantStub()
self.layer_norm = torch.nn.LayerNorm(...)
...

# forward
...
x = self.quant(x) # convert a fp32 tensor to int8
x = self.layer_norm(x)
...
  2. Leave the layernorm in fp32. This can be done by setting that layer's qconfig = None, which prevents it from getting quantized; see the sketch after this list.
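A minimal sketch of option 2, assuming the BertSoftmaxForNerQuantized model from the question and that its layernorms are plain torch.nn.LayerNorm modules (the module-walking loop here is illustrative, not part of the original code):

import torch
import torch.nn as nn

quantized_model.eval()
quantized_model.qconfig = torch.quantization.default_qconfig

# Clear the qconfig on every LayerNorm before prepare(), so those modules
# are skipped by prepare()/convert() and never receive int8 tensors.
for module in quantized_model.modules():
    if isinstance(module, nn.LayerNorm):
        module.qconfig = None  # this module stays in fp32

torch.quantization.prepare(quantized_model, inplace=True)
# ... run calibration batches through the model, as in the loop above ...
torch.quantization.convert(quantized_model, inplace=True)

In eager-mode quantization, a module's own qconfig attribute (even when set to None) takes precedence over the qconfig propagated from its parent, so these layers are left untouched by convert().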

Hi @HUSTHY, did you solve this problem? I am running into almost the same issue with layer_norm.