Quantization of Transformer models in Fairseq

Given a model loaded from the PyTorch hub:

import torch
torch.cuda.is_available()

# Load the pre-trained WMT19 En-De single-model Transformer from the fairseq hub.
en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model')
en2de.translate('hello world')  # [out]: 'Hallo Welt'

type(en2de.models[0])   # [out]: fairseq.models.transformer.TransformerModel

The quantization seems to be successful:

# Dynamically quantize the nn.Linear submodules of the first ensemble model to int8.
en2de_q0 = torch.quantization.quantize_dynamic(
    en2de.models[0], {torch.nn.Linear}, dtype=torch.qint8
)

type(en2de_q0) # [out]: fairseq.models.transformer.TransformerModel
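The unchanged root class proves little, since quantize_dynamic only swaps matching submodules. A minimal sketch to confirm which modules were actually replaced (assuming, as in PyTorch's implementation, that the replacements live in a "quantized" module namespace):

# Print submodules whose class comes from a quantized namespace; after
# quantize_dynamic these should be dynamic quantized Linear modules.
for name, module in en2de_q0.named_modules():
    if 'quantized' in type(module).__module__:
        print(name, type(module).__name__)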

But after overwriting the original model with the quantized one, the translate call fails:

en2de.models[0] = en2de_q0  # overwrite the fp32 model with the quantized copy
en2de.translate('hello world')

[out]:


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-adf73479cb54> in <module>
----> 1 en2de.translate('hello world')

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/hub_utils.py in translate(self, sentences, beam, verbose, **kwargs)
    120 
    121     def translate(self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs) -> List[str]:
--> 122         return self.sample(sentences, beam, verbose, **kwargs)
    123 
    124     def sample(self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs) -> List[str]:

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/hub_utils.py in sample(self, sentences, beam, verbose, **kwargs)
    124     def sample(self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs) -> List[str]:
    125         if isinstance(sentences, str):
--> 126             return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
    127         tokenized_sentences = [self.encode(sentence) for sentence in sentences]
    128         batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/hub_utils.py in sample(self, sentences, beam, verbose, **kwargs)
    126             return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
    127         tokenized_sentences = [self.encode(sentence) for sentence in sentences]
--> 128         batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
    129         return [self.decode(hypos[0]['tokens']) for hypos in batched_hypos]
    130 

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/hub_utils.py in generate(self, tokenized_sentences, beam, verbose, skip_invalid_size_inputs, **kwargs)
    159         for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
    160             batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
--> 161             translations = self.task.inference_step(generator, self.models, batch)
    162             for id, hypos in zip(batch["id"].tolist(), translations):
    163                 results.append((id, hypos))

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/tasks/fairseq_task.py in inference_step(self, generator, models, sample, prefix_tokens)
    304     def inference_step(self, generator, models, sample, prefix_tokens=None):
    305         with torch.no_grad():
--> 306             return generator.generate(models, sample, prefix_tokens=prefix_tokens)
    307 
    308     def update_step(self, num_updates):

/usr/local/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_no_grad(*args, **kwargs)
     47         def decorate_no_grad(*args, **kwargs):
     48             with self:
---> 49                 return func(*args, **kwargs)
     50         return decorate_no_grad
     51 

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/sequence_generator.py in generate(self, models, sample, **kwargs)
     90         """
     91         model = EnsembleModel(models)
---> 92         return self._generate(model, sample, **kwargs)
     93 
     94     @torch.no_grad()

/usr/local/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_no_grad(*args, **kwargs)
     47         def decorate_no_grad(*args, **kwargs):
     48             with self:
---> 49                 return func(*args, **kwargs)
     50         return decorate_no_grad
     51 

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/sequence_generator.py in _generate(self, model, sample, prefix_tokens, bos_token, **kwargs)
    130 
    131         # compute the encoder output for each beam
--> 132         encoder_outs = model.forward_encoder(encoder_input)
    133         new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
    134         new_order = new_order.to(src_tokens.device).long()

/usr/local/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_no_grad(*args, **kwargs)
     47         def decorate_no_grad(*args, **kwargs):
     48             with self:
---> 49                 return func(*args, **kwargs)
     50         return decorate_no_grad
     51 

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/sequence_generator.py in forward_encoder(self, encoder_input)
    520         if not self.has_encoder():
    521             return None
--> 522         return [model.encoder(**encoder_input) for model in self.models]
    523 
    524     @torch.no_grad()

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/sequence_generator.py in <listcomp>(.0)
    520         if not self.has_encoder():
    521             return None
--> 522         return [model.encoder(**encoder_input) for model in self.models]
    523 
    524     @torch.no_grad()

/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/models/transformer.py in forward(self, src_tokens, src_lengths, cls_input, return_all_hiddens, **unused)
    404             dropout_probability = random.uniform(0, 1)
    405             if not self.training or (dropout_probability > self.encoder_layerdrop):
--> 406                 x = layer(x, encoder_padding_mask)
    407                 if return_all_hiddens:
    408                     encoder_states.append(x)

/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/modules/transformer_layer.py in forward(self, x, encoder_padding_mask, attn_mask)
     93         # TODO: to formally solve this problem, we need to change fairseq's
     94         # MultiheadAttention. We will do this later on.
---> 95         x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask)
     96         x = F.dropout(x, p=self.dropout, training=self.training)
     97         x = residual + x

/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    539             result = self._slow_forward(*input, **kwargs)
    540         else:
--> 541             result = self.forward(*input, **kwargs)
    542         for hook in self._forward_hooks.values():
    543             hook_result = hook(self, input, result)

~/.cache/torch/hub/pytorch_fairseq_master/fairseq/modules/multihead_attention.py in forward(self, query, key, value, key_padding_mask, incremental_state, need_weights, static_kv, attn_mask, before_softmax, need_head_weights)
    126                                                   self.embed_dim, self.num_heads,
    127                                                   torch.empty([0]),
--> 128                                                   torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
    129                                                   self.bias_k, self.bias_v,
    130                                                   self.add_zero_attn, self.dropout,

TypeError: expected Tensor as element 0 in argument 0, but got method
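The failing frame concatenates self.q_proj.bias directly. On a dynamically quantized Linear, bias is exposed as an accessor method rather than an nn.Parameter, which matches the TypeError above. A minimal sketch of the suspected cause, assuming PyTorch's dynamic quantized Linear API:

import torch

# A toy container: quantize_dynamic swaps the child nn.Linear for a
# dynamic quantized Linear whose `bias` is a method, not a Parameter.
m = torch.nn.Sequential(torch.nn.Linear(4, 4))
mq = torch.quantization.quantize_dynamic(m, {torch.nn.Linear}, dtype=torch.qint8)

print(type(mq[0]))           # a dynamic quantized Linear, not nn.Linear
print(callable(mq[0].bias))  # True: bias is now a bound method
torch.cat((mq[0].bias, mq[0].bias))
# TypeError: expected Tensor as element 0 in argument 0, but got method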

Does anyone know how to resolve this, i.e. quantize the model and still have it translate?

Looking at the parameters, they don't look any different, and I don't see the quantized parameters:

list(en2de_q0.parameters())

[out]:

[Parameter containing:
 tensor([[ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0251],
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [-0.0906,  0.0405, -0.0281,  ..., -0.1006, -0.0363, -0.0373],
         ...,
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [ 0.0110,  0.0019,  0.0025,  ...,  0.0171, -0.0069, -0.0248]],
        requires_grad=True), Parameter containing:
 tensor([1.6650, 2.0582, 2.2051,  ..., 0.3772, 0.4279, 0.7772],
        requires_grad=True), Parameter containing:
 tensor([-0.0429,  0.4521, -0.2979,  ...,  0.2960,  0.0998,  0.5742],
        requires_grad=True), Parameter containing:
 tensor([0.3566, 0.3331, 0.4156,  ..., 0.3027, 0.3934, 0.4597],
        requires_grad=True), Parameter containing:
 tensor([ 0.0039, -0.0139,  0.0799,  ..., -0.0354,  0.0002, -0.0507],
        requires_grad=True), Parameter containing:
 tensor([0.7108, 0.6487, 0.8303,  ..., 0.5031, 0.6099, 0.8633],
        requires_grad=True), Parameter containing:
 tensor([-0.3873,  0.0271, -0.2536,  ..., -0.1306, -0.0470,  0.1739],
        requires_grad=True), Parameter containing:
 tensor([0.4754, 0.4790, 0.5809,  ..., 0.3982, 0.4633, 0.5181],
        requires_grad=True), Parameter containing:
 tensor([ 0.0498,  0.0161,  0.0720,  ...,  0.0221,  0.0262, -0.0039],
        requires_grad=True), Parameter containing:
 tensor([0.7605, 0.5692, 0.7406,  ..., 0.6080, 0.7313, 0.9108],
        requires_grad=True), Parameter containing:
 tensor([-0.2919,  0.0165, -0.1515,  ...,  0.0878,  0.1131, -0.0286],
        requires_grad=True), Parameter containing:
 tensor([0.4510, 0.4098, 0.5094,  ..., 0.4080, 0.4713, 0.5315],
        requires_grad=True), Parameter containing:
 tensor([0.0489, 0.0145, 0.0468,  ..., 0.0118, 0.0073, 0.0143],
        requires_grad=True), Parameter containing:
 tensor([0.7937, 0.6190, 0.7745,  ..., 0.6243, 0.7671, 0.9124],
        requires_grad=True), Parameter containing:
 tensor([-0.1167, -0.0649, -0.1412,  ...,  0.2262,  0.1255, -0.0199],
        requires_grad=True), Parameter containing:
 tensor([0.4677, 0.4157, 0.4952,  ..., 0.4077, 0.4837, 0.5452],
        requires_grad=True), Parameter containing:
 tensor([ 0.0330,  0.0138,  0.0262,  ..., -0.0178, -0.0088,  0.0108],
        requires_grad=True), Parameter containing:
 tensor([0.7569, 0.6327, 0.7345,  ..., 0.5937, 0.7591, 0.8758],
        requires_grad=True), Parameter containing:
 tensor([ 0.0460, -0.0602, -0.1474,  ..., -0.0467,  0.1447, -0.1098],
        requires_grad=True), Parameter containing:
 tensor([0.5098, 0.4489, 0.5239,  ..., 0.4316, 0.5104, 0.5754],
        requires_grad=True), Parameter containing:
 tensor([-0.0279, -0.0034,  0.0159,  ..., -0.0088, -0.0531, -0.0025],
        requires_grad=True), Parameter containing:
 tensor([0.6944, 0.6200, 0.6646,  ..., 0.6369, 0.6630, 0.7936],
        requires_grad=True), Parameter containing:
 tensor([ 0.0425, -0.1008, -0.0308,  ...,  0.0353, -0.0847, -0.1735],
        requires_grad=True), Parameter containing:
 tensor([0.2035, 0.1787, 0.1948,  ..., 0.1657, 0.2053, 0.2193],
        requires_grad=True), Parameter containing:
 tensor([-0.0103,  0.0007, -0.0053,  ..., -0.0091, -0.0024,  0.0073],
        requires_grad=True), Parameter containing:
 tensor([0.7927, 0.8630, 0.9708,  ..., 0.9663, 0.8602, 0.8438],
        requires_grad=True), Parameter containing:
 tensor([-0.5716,  0.2891,  0.2969,  ...,  0.0658,  0.3705,  0.4201],
        requires_grad=True), Parameter containing:
 tensor([0.9630, 0.8330, 0.8355,  ..., 0.9208, 0.8964, 0.9021],
        requires_grad=True), Parameter containing:
 tensor([-0.5408,  0.1112, -0.0177,  ..., -0.1004,  0.0218,  0.1819],
        requires_grad=True), Parameter containing:
 tensor([0.2730, 0.3580, 0.3483,  ..., 0.4263, 0.4327, 0.4214],
        requires_grad=True), Parameter containing:
 tensor([ 0.0389, -0.0258, -0.0110,  ..., -0.0194, -0.0296, -0.0428],
        requires_grad=True), Parameter containing:
 tensor([0.9382, 0.8844, 0.8857,  ..., 1.0070, 0.9959, 1.0378],
        requires_grad=True), Parameter containing:
 tensor([-0.1183, -0.1011,  0.1333,  ..., -0.1546,  0.1757,  0.0622],
        requires_grad=True), Parameter containing:
 tensor([0.9625, 0.8622, 0.9251,  ..., 0.8971, 0.9141, 0.9558],
        requires_grad=True), Parameter containing:
 tensor([-0.1533,  0.0882,  0.0403,  ..., -0.0873,  0.1071,  0.1290],
        requires_grad=True), Parameter containing:
 tensor([0.4511, 0.4763, 0.5090,  ..., 0.5148, 0.5195, 0.5114],
        requires_grad=True), Parameter containing:
 tensor([ 0.0111, -0.0262, -0.0249,  ..., -0.0013, -0.0405, -0.0426],
        requires_grad=True), Parameter containing:
 tensor([1.0736, 1.0671, 1.0402,  ..., 1.1028, 1.0349, 1.1180],
        requires_grad=True), Parameter containing:
 tensor([-0.0826,  0.1648,  0.0594,  ...,  0.0423,  0.0210,  0.1997],
        requires_grad=True), Parameter containing:
 tensor([0.9743, 0.8883, 0.8962,  ..., 0.9011, 0.8924, 0.9486],
        requires_grad=True), Parameter containing:
 tensor([-0.0676,  0.1354,  0.0881,  ..., -0.0470,  0.0819,  0.1521],
        requires_grad=True), Parameter containing:
 tensor([0.5064, 0.5148, 0.5453,  ..., 0.5547, 0.5714, 0.5448],
        requires_grad=True), Parameter containing:
 tensor([-0.0062, -0.0382, -0.0277,  ..., -0.0148, -0.0280, -0.0436],
        requires_grad=True), Parameter containing:
 tensor([1.1094, 1.1115, 1.0514,  ..., 1.1308, 1.0555, 1.1376],
        requires_grad=True), Parameter containing:
 tensor([-0.1110,  0.0899,  0.1120,  ..., -0.1669, -0.0578,  0.0723],
        requires_grad=True), Parameter containing:
 tensor([0.9829, 0.8915, 0.8747,  ..., 0.9197, 0.9132, 0.9313],
        requires_grad=True), Parameter containing:
 tensor([-0.2108,  0.0420,  0.0976,  ..., -0.0915,  0.0075, -0.0368],
        requires_grad=True), Parameter containing:
 tensor([0.5855, 0.6197, 0.6315,  ..., 0.6086, 0.6402, 0.6502],
        requires_grad=True), Parameter containing:
 tensor([ 0.0119, -0.0300, -0.0460,  ...,  0.0003, -0.0203, -0.0224],
        requires_grad=True), Parameter containing:
 tensor([1.1550, 1.1142, 1.1391,  ..., 1.1255, 1.0924, 1.1411],
        requires_grad=True), Parameter containing:
 tensor([-0.0424,  0.1162, -0.0843,  ..., -0.0932, -0.0413,  0.1667],
        requires_grad=True), Parameter containing:
 tensor([0.9111, 0.8723, 0.7811,  ..., 0.9060, 0.9096, 0.9097],
        requires_grad=True), Parameter containing:
 tensor([-0.2315,  0.0282, -0.2177,  ...,  0.0301, -0.0341, -0.2896],
        requires_grad=True), Parameter containing:
 tensor([0.6610, 0.6736, 0.6347,  ..., 0.6703, 0.6840, 0.7031],
        requires_grad=True), Parameter containing:
 tensor([-0.0064, -0.0492, -0.0049,  ..., -0.0257, -0.0278,  0.0006],
        requires_grad=True), Parameter containing:
 tensor([1.1620, 1.0839, 1.1276,  ..., 1.1419, 1.1669, 1.1023],
        requires_grad=True), Parameter containing:
 tensor([ 0.0491, -0.0879,  0.1410,  ...,  0.0672,  0.0349,  0.2100],
        requires_grad=True), Parameter containing:
 tensor([0.8549, 0.8141, 0.7377,  ..., 0.7198, 0.7090, 0.7332],
        requires_grad=True), Parameter containing:
 tensor([ 0.3034, -0.0049, -0.0414,  ...,  0.2019,  0.0855, -0.2630],
        requires_grad=True), Parameter containing:
 tensor([2.0453, 2.5422, 2.4132,  ..., 0.9301, 0.8366, 0.7796],
        requires_grad=True), Parameter containing:
 tensor([0.0008, 0.0556, 0.1556,  ..., 0.0142, 0.0115, 0.0733],
        requires_grad=True)]
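This is actually expected if quantization took effect: the int8 weights of a dynamic quantized Linear are stored in packed form, not as nn.Parameters, so list(model.parameters()) only shows the layers left in fp32 (embeddings, LayerNorms, etc.). A sketch for finding them, assuming the packed weights appear in the state_dict under keys containing _packed_params, as in PyTorch's quantized Linear implementation:

# Packed int8 weights show up in the state_dict, not in .parameters().
for key in en2de_q0.state_dict():
    if '_packed_params' in key:
        print(key)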

And the original model's parameters:

list(en2de.models[0].parameters())

[out]:

[Parameter containing:
 tensor([[ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0251],
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [-0.0906,  0.0405, -0.0281,  ..., -0.1006, -0.0363, -0.0373],
         ...,
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [ 0.0109,  0.0018,  0.0024,  ...,  0.0170, -0.0071, -0.0250],
         [ 0.0110,  0.0019,  0.0025,  ...,  0.0171, -0.0069, -0.0248]],
        requires_grad=True), Parameter containing:
 tensor([1.6650, 2.0582, 2.2051,  ..., 0.3772, 0.4279, 0.7772],
        requires_grad=True), Parameter containing:
 tensor([-0.0429,  0.4521, -0.2979,  ...,  0.2960,  0.0998,  0.5742],
        requires_grad=True), Parameter containing:
 tensor([0.3566, 0.3331, 0.4156,  ..., 0.3027, 0.3934, 0.4597],
        requires_grad=True), Parameter containing:
 tensor([ 0.0039, -0.0139,  0.0799,  ..., -0.0354,  0.0002, -0.0507],
        requires_grad=True), Parameter containing:
 tensor([0.7108, 0.6487, 0.8303,  ..., 0.5031, 0.6099, 0.8633],
        requires_grad=True), Parameter containing:
 tensor([-0.3873,  0.0271, -0.2536,  ..., -0.1306, -0.0470,  0.1739],
        requires_grad=True), Parameter containing:
 tensor([0.4754, 0.4790, 0.5809,  ..., 0.3982, 0.4633, 0.5181],
        requires_grad=True), Parameter containing:
 tensor([ 0.0498,  0.0161,  0.0720,  ...,  0.0221,  0.0262, -0.0039],
        requires_grad=True), Parameter containing:
 tensor([0.7605, 0.5692, 0.7406,  ..., 0.6080, 0.7313, 0.9108],
        requires_grad=True), Parameter containing:
 tensor([-0.2919,  0.0165, -0.1515,  ...,  0.0878,  0.1131, -0.0286],
        requires_grad=True), Parameter containing:
 tensor([0.4510, 0.4098, 0.5094,  ..., 0.4080, 0.4713, 0.5315],
        requires_grad=True), Parameter containing:
 tensor([0.0489, 0.0145, 0.0468,  ..., 0.0118, 0.0073, 0.0143],
        requires_grad=True), Parameter containing:
 tensor([0.7937, 0.6190, 0.7745,  ..., 0.6243, 0.7671, 0.9124],
        requires_grad=True), Parameter containing:
 tensor([-0.1167, -0.0649, -0.1412,  ...,  0.2262,  0.1255, -0.0199],
        requires_grad=True), Parameter containing:
 tensor([0.4677, 0.4157, 0.4952,  ..., 0.4077, 0.4837, 0.5452],
        requires_grad=True), Parameter containing:
 tensor([ 0.0330,  0.0138,  0.0262,  ..., -0.0178, -0.0088,  0.0108],
        requires_grad=True), Parameter containing:
 tensor([0.7569, 0.6327, 0.7345,  ..., 0.5937, 0.7591, 0.8758],
        requires_grad=True), Parameter containing:
 tensor([ 0.0460, -0.0602, -0.1474,  ..., -0.0467,  0.1447, -0.1098],
        requires_grad=True), Parameter containing:
 tensor([0.5098, 0.4489, 0.5239,  ..., 0.4316, 0.5104, 0.5754],
        requires_grad=True), Parameter containing:
 tensor([-0.0279, -0.0034,  0.0159,  ..., -0.0088, -0.0531, -0.0025],
        requires_grad=True), Parameter containing:
 tensor([0.6944, 0.6200, 0.6646,  ..., 0.6369, 0.6630, 0.7936],
        requires_grad=True), Parameter containing:
 tensor([ 0.0425, -0.1008, -0.0308,  ...,  0.0353, -0.0847, -0.1735],
        requires_grad=True), Parameter containing:
 tensor([0.2035, 0.1787, 0.1948,  ..., 0.1657, 0.2053, 0.2193],
        requires_grad=True), Parameter containing:
 tensor([-0.0103,  0.0007, -0.0053,  ..., -0.0091, -0.0024,  0.0073],
        requires_grad=True), Parameter containing:
 tensor([0.7927, 0.8630, 0.9708,  ..., 0.9663, 0.8602, 0.8438],
        requires_grad=True), Parameter containing:
 tensor([-0.5716,  0.2891,  0.2969,  ...,  0.0658,  0.3705,  0.4201],
        requires_grad=True), Parameter containing:
 tensor([0.9630, 0.8330, 0.8355,  ..., 0.9208, 0.8964, 0.9021],
        requires_grad=True), Parameter containing:
 tensor([-0.5408,  0.1112, -0.0177,  ..., -0.1004,  0.0218,  0.1819],
        requires_grad=True), Parameter containing:
 tensor([0.2730, 0.3580, 0.3483,  ..., 0.4263, 0.4327, 0.4214],
        requires_grad=True), Parameter containing:
 tensor([ 0.0389, -0.0258, -0.0110,  ..., -0.0194, -0.0296, -0.0428],
        requires_grad=True), Parameter containing:
 tensor([0.9382, 0.8844, 0.8857,  ..., 1.0070, 0.9959, 1.0378],
        requires_grad=True), Parameter containing:
 tensor([-0.1183, -0.1011,  0.1333,  ..., -0.1546,  0.1757,  0.0622],
        requires_grad=True), Parameter containing:
 tensor([0.9625, 0.8622, 0.9251,  ..., 0.8971, 0.9141, 0.9558],
        requires_grad=True), Parameter containing:
 tensor([-0.1533,  0.0882,  0.0403,  ..., -0.0873,  0.1071,  0.1290],
        requires_grad=True), Parameter containing:
 tensor([0.4511, 0.4763, 0.5090,  ..., 0.5148, 0.5195, 0.5114],
        requires_grad=True), Parameter containing:
 tensor([ 0.0111, -0.0262, -0.0249,  ..., -0.0013, -0.0405, -0.0426],
        requires_grad=True), Parameter containing:
 tensor([1.0736, 1.0671, 1.0402,  ..., 1.1028, 1.0349, 1.1180],
        requires_grad=True), Parameter containing:
 tensor([-0.0826,  0.1648,  0.0594,  ...,  0.0423,  0.0210,  0.1997],
        requires_grad=True), Parameter containing:
 tensor([0.9743, 0.8883, 0.8962,  ..., 0.9011, 0.8924, 0.9486],
        requires_grad=True), Parameter containing:
 tensor([-0.0676,  0.1354,  0.0881,  ..., -0.0470,  0.0819,  0.1521],
        requires_grad=True), Parameter containing:
 tensor([0.5064, 0.5148, 0.5453,  ..., 0.5547, 0.5714, 0.5448],
        requires_grad=True), Parameter containing:
 tensor([-0.0062, -0.0382, -0.0277,  ..., -0.0148, -0.0280, -0.0436],
        requires_grad=True), Parameter containing:
 tensor([1.1094, 1.1115, 1.0514,  ..., 1.1308, 1.0555, 1.1376],
        requires_grad=True), Parameter containing:
 tensor([-0.1110,  0.0899,  0.1120,  ..., -0.1669, -0.0578,  0.0723],
        requires_grad=True), Parameter containing:
 tensor([0.9829, 0.8915, 0.8747,  ..., 0.9197, 0.9132, 0.9313],
        requires_grad=True), Parameter containing:
 tensor([-0.2108,  0.0420,  0.0976,  ..., -0.0915,  0.0075, -0.0368],
        requires_grad=True), Parameter containing:
 tensor([0.5855, 0.6197, 0.6315,  ..., 0.6086, 0.6402, 0.6502],
        requires_grad=True), Parameter containing:
 tensor([ 0.0119, -0.0300, -0.0460,  ...,  0.0003, -0.0203, -0.0224],
        requires_grad=True), Parameter containing:
 tensor([1.1550, 1.1142, 1.1391,  ..., 1.1255, 1.0924, 1.1411],
        requires_grad=True), Parameter containing:
 tensor([-0.0424,  0.1162, -0.0843,  ..., -0.0932, -0.0413,  0.1667],
        requires_grad=True), Parameter containing:
 tensor([0.9111, 0.8723, 0.7811,  ..., 0.9060, 0.9096, 0.9097],
        requires_grad=True), Parameter containing:
 tensor([-0.2315,  0.0282, -0.2177,  ...,  0.0301, -0.0341, -0.2896],
        requires_grad=True), Parameter containing:
 tensor([0.6610, 0.6736, 0.6347,  ..., 0.6703, 0.6840, 0.7031],
        requires_grad=True), Parameter containing:
 tensor([-0.0064, -0.0492, -0.0049,  ..., -0.0257, -0.0278,  0.0006],
        requires_grad=True), Parameter containing:
 tensor([1.1620, 1.0839, 1.1276,  ..., 1.1419, 1.1669, 1.1023],
        requires_grad=True), Parameter containing:
 tensor([ 0.0491, -0.0879,  0.1410,  ...,  0.0672,  0.0349,  0.2100],
        requires_grad=True), Parameter containing:
 tensor([0.8549, 0.8141, 0.7377,  ..., 0.7198, 0.7090, 0.7332],
        requires_grad=True), Parameter containing:
 tensor([ 0.3034, -0.0049, -0.0414,  ...,  0.2019,  0.0855, -0.2630],
        requires_grad=True), Parameter containing:
 tensor([2.0453, 2.5422, 2.4132,  ..., 0.9301, 0.8366, 0.7796],
        requires_grad=True), Parameter containing:
 tensor([0.0008, 0.0556, 0.1556,  ..., 0.0142, 0.0115, 0.0733],
        requires_grad=True)]

Put quantize_dynamic into fairseq-generate's code and you will observe the change.
ref :
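One way to observe the change is to compare the serialized size of the state_dict before and after quantization. A minimal sketch (the helper name is illustrative, not fairseq API; it assumes the fp32 model is still available, i.e. before the overwrite above):

import os, tempfile
import torch

def state_dict_size_mb(model):
    # Serialize the state_dict to a temporary file and report its size in MB.
    with tempfile.NamedTemporaryFile(suffix='.pt', delete=False) as f:
        path = f.name
    torch.save(model.state_dict(), path)
    size = os.path.getsize(path) / 1e6
    os.remove(path)
    return size

print(state_dict_size_mb(en2de.models[0]))  # fp32 baseline (before the overwrite)
print(state_dict_size_mb(en2de_q0))         # int8 dynamically quantized copy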