Commit f5a9860

disable quantization for the MTP fc projection to match FP8 model configs (#4572)
1 parent: 34a1ef6

3 files changed: 9 additions & 5 deletions


lmdeploy/pytorch/config.py

Lines changed: 4 additions & 0 deletions
@@ -389,6 +389,7 @@ def from_pretrained(
             dtype (str): user specified data type for model weights and
                 activations. Refer to `PyTorchEngineConfig` for details
             hf_overrides (dict[str, Any]): overrides for the HF config.
+            model_format (str): the quantization format of the model.
         """
         from transformers import AutoConfig

@@ -567,6 +568,7 @@ def from_config(
                     target_model: str = None,
                     dtype: str = 'auto',
                     trust_remote_code: bool = False,
+                    model_format: str = None,
                     hf_overrides: dict[str, Any] = None,
                     ):
         model = model or target_model
@@ -576,6 +578,7 @@
             is_draft_model=True,
             spec_method=method,
             block_size=target_cache_cfg.block_size,
+            model_format=model_format,
             hf_overrides=hf_overrides,
         )
         cache_config = None
@@ -590,6 +593,7 @@
                 cache_max_entry_count=target_cache_cfg.cache_max_entry_count,
                 max_prefill_token_num=target_cache_cfg.max_prefill_token_num,
                 device_type=target_cache_cfg.device_type,
+                quant_policy=target_cache_cfg.quant_policy,
                 migration_backend=target_cache_cfg.migration_backend)
         obj = cls(
             model=model,

lmdeploy/pytorch/engine/config_builder.py

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ def build_specdecode_config(target_model, speculative_config: SpeculativeConfig,
         target_cache_cfg=cache_config,
         dtype=engine_config.dtype,
         trust_remote_code=trust_remote_code,
+        model_format=engine_config.model_format,
         hf_overrides=engine_config.hf_overrides,
     )
     return specdecode_config
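
To make the intent of the first two files concrete: the model_format set on the engine config is now threaded through build_specdecode_config and into the draft-model config, so an FP8 target model also yields an FP8-aware draft (MTP) model config; the fourth hunk in config.py likewise forwards the target cache's quant_policy into the draft cache config. Below is a minimal, self-contained sketch of that plumbing pattern; EngineConfig, DraftModelConfig, and build_draft_config are simplified stand-ins for illustration, not the actual lmdeploy API.

from dataclasses import dataclass
from typing import Optional


@dataclass
class EngineConfig:
    """Simplified stand-in for the engine-level config."""
    dtype: str = 'auto'
    model_format: Optional[str] = None  # e.g. 'fp8'; None means unquantized weights


@dataclass
class DraftModelConfig:
    """Simplified stand-in for the config built for the speculative (MTP) draft model."""
    model: str
    dtype: str
    model_format: Optional[str] = None


def build_draft_config(model: str, engine_config: EngineConfig) -> DraftModelConfig:
    # Forward the quantization format so the draft model is described with the
    # same weight format (e.g. FP8) as the target model.
    return DraftModelConfig(model=model,
                            dtype=engine_config.dtype,
                            model_format=engine_config.model_format)


if __name__ == '__main__':
    cfg = build_draft_config('Qwen/Qwen3.5-27B-FP8', EngineConfig(model_format='fp8'))
    print(cfg)

Under this sketch, the draft-model loader can branch on cfg.model_format the same way the target loader does, which mirrors how the real change keeps the two checkpoints' weight formats in agreement.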

lmdeploy/pytorch/models/qwen3_5_mtp.py

Lines changed: 4 additions & 5 deletions
@@ -94,24 +94,23 @@ def __init__(
             for idx in range(self.num_mtp_layers)
         })

-        quantization_config = getattr(config, 'quantization_config', None)
-
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=dtype, device=device)
         self.pre_fc_norm_hidden = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=dtype, device=device)
         self.pre_fc_norm_embedding = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=dtype, device=device)

         # shared with target model
         self.embed_tokens = None
-
+        # do not quant fc as in https://huggingface.co/Qwen/Qwen3.5-27B-FP8/blob/main/config.json#L403
+        # and https://huggingface.co/Qwen/Qwen3.5-35B-A3B-FP8/blob/main/config.json#L409
         self.fc = build_colwise_linear(
             config.hidden_size * 2,
             config.hidden_size,
             bias=False,
             dtype=dtype,
             device=device,
             is_tp=False,
-            quant_config=quantization_config,
             dp_disable_tp=True,
+            prefix=add_prefix('fc', prefix=prefix),
         )

         # build rotary embedding
@@ -200,7 +199,7 @@ def __init__(self,
         self.model = Qwen3_5MultiTokenPredictor(config.text_config,
                                                 dtype=dtype,
                                                 device=device,
-                                                prefix=add_prefix('model', prefix=prefix))
+                                                prefix=add_prefix('mtp', prefix=prefix))

         self.num_experts = getattr(config.text_config, 'num_experts', None)
         self.enable_sci_mtp = getattr(config, 'enable_sci_mtp', False)
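
The qwen3_5_mtp.py change means the fc projection, which fuses the previous hidden state with the current token embedding (hence the 2 * hidden_size input), is no longer wrapped with the model's quantization_config: as the linked config.json entries note, the released Qwen3.5 FP8 checkpoints exclude this module from quantization, so it is now built as a plain high-precision linear. A rough PyTorch sketch of the resulting layer follows; build_mtp_fc is a hypothetical helper for illustration, not lmdeploy's build_colwise_linear.

import torch
from torch import nn


def build_mtp_fc(hidden_size: int, dtype: torch.dtype = torch.bfloat16) -> nn.Linear:
    # The MTP fc projection maps the concatenated [hidden_states, embeddings]
    # vector (2 * hidden_size) back to hidden_size. No quantization config is
    # attached, so the layer stays in the high-precision compute dtype even
    # when the surrounding model weights are FP8.
    return nn.Linear(2 * hidden_size, hidden_size, bias=False, dtype=dtype)


if __name__ == '__main__':
    fc = build_mtp_fc(hidden_size=4096)
    print(fc.weight.dtype)  # torch.bfloat16

The prefix change from 'model' to 'mtp' in the second hunk presumably aligns the predictor's parameter names with the weight naming used by those checkpoints.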
