
Commit eed95d8

[nemotron_h] respect _no_reinit flag on dt_bias and out_proj.weight (#45591)
* [nemotron_h] respect _no_reinit flag on dt_bias and out_proj.weight

_init_weights() on `NemotronHPreTrainedModel` unconditionally overwrites `dt_bias` (random `inv_softplus(dt)`) and `out_proj.weight` (kaiming_uniform scaled by 1/sqrt(n_layer)) every time it is invoked on a Mamba block. It sets `module.dt_bias._no_reinit = True` after the copy, but the flag is never checked by either code path (only the Linear-bias branch reads it).

On transformers>=5.0, `_init_weights` is triggered a second time after `from_pretrained()` has loaded the checkpoint (the post-load safety pass that initializes tensors still left on `meta`). For `NemotronHForCausalLM` that silently overwrites the checkpoint values of `dt_bias` and `out_proj.weight` with fresh random draws. The model then emits repetitive stop-word streams like ` and and and and ,` for any input.

Minimal repro with any Nemotron-H checkpoint:

    from transformers import AutoConfig, AutoModelForCausalLM
    from safetensors.torch import load_file
    import json, pathlib

    path = ".../NVIDIA-Nemotron-Cascade-2-30B-A3B-BF16"  # or Nano
    cfg = AutoConfig.from_pretrained(path); cfg._attn_implementation = 'eager'
    m = AutoModelForCausalLM.from_pretrained(path, config=cfg, torch_dtype='bfloat16')
    idx = json.loads((pathlib.Path(path) / 'model.safetensors.index.json').read_text())['weight_map']
    k = 'backbone.layers.0.mixer.dt_bias'
    on_disk = load_file(f'{path}/{idx[k]}')[k]
    in_mem = m.backbone.layers[0].mixer.dt_bias
    print((on_disk.float() - in_mem.float().cpu()).abs().max())  # ~26.8

This patch makes `_init_weights` honour `_no_reinit` on both `dt_bias` and `out_proj.weight` (the only two params that are re-initialised unconditionally), and sets `_no_reinit = True` on `out_proj.weight` after the initial kaiming scale so a second pass is a no-op. Ordinary fresh-init training is unaffected; only the second invocation becomes idempotent.

Signed-off-by: Min Zhou <minzhou@virtueai.com>

* Switch to canonical _is_hf_initialized flag per review

Per @Rocketknight1's review: replace the ad-hoc `_no_reinit` flag with the existing `_is_hf_initialized` flag that `from_pretrained` already sets on checkpoint-loaded parameters. Guard each Mamba2 init target (A_log / D / dt_bias) and the residual-scaled `out_proj.weight` independently, so parameters restored from a checkpoint survive any subsequent `_init_weights` pass.

* Use _is_hf_initialized for nn.Linear.bias check too

---------

Signed-off-by: Min Zhou <minzhou@virtueai.com>
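A minimal, self-contained sketch of the guard semantics the final fix relies on (the parameter name and values below are made up for illustration; only the `_is_hf_initialized` attribute and the `getattr` guard mirror the patched code):

    import torch
    import torch.nn as nn

    # Pretend this parameter was just loaded from a checkpoint;
    # from_pretrained marks such parameters with `_is_hf_initialized = True`.
    dt_bias = nn.Parameter(torch.tensor([0.5, 1.0, 1.5]))
    dt_bias._is_hf_initialized = True

    def guarded_init(param: nn.Parameter) -> None:
        # Same guard pattern as the patched _init_weights:
        # only touch params that were NOT restored from a checkpoint.
        if not getattr(param, "_is_hf_initialized", False):
            with torch.no_grad():
                param.copy_(torch.randn_like(param))

    before = dt_bias.detach().clone()
    guarded_init(dt_bias)   # the post-load "safety pass" is now a no-op
    assert torch.equal(before, dt_bias)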
1 parent 807d9d7 commit eed95d8

2 files changed: 52 additions & 38 deletions

src/transformers/models/nemotron_h/modeling_nemotron_h.py

Lines changed: 26 additions & 19 deletions
@@ -974,22 +974,27 @@ def _init_weights(self, module):
         """Initialize the weights."""
         super()._init_weights(module)
         if isinstance(module, NemotronHMamba2Mixer):
-            # Initialize A_log and D parameters
-            A = torch.arange(1, self.config.mamba_num_heads + 1)
-            init.copy_(module.A_log, torch.log(A))
-            init.ones_(module.D)
-
-            dt = torch.exp(
-                torch.rand(self.config.mamba_num_heads)
-                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
-                + math.log(self.config.time_step_min)
-            ).clamp(min=self.config.time_step_floor)
-
-            # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
-            inv_dt = dt + torch.log(-torch.expm1(-dt))
-            with torch.no_grad():
-                init.copy_(module.dt_bias, inv_dt)
-            module.dt_bias._no_reinit = True
+            # Only re-initialise params that were NOT loaded from a checkpoint.
+            # `_is_hf_initialized` is set by `from_pretrained` on each loaded
+            # parameter; without this guard a post-load safety pass of
+            # `_init_weights` would overwrite checkpoint values of
+            # A_log / D / dt_bias with fresh random draws.
+            if not getattr(module.A_log, "_is_hf_initialized", False):
+                A = torch.arange(1, self.config.mamba_num_heads + 1)
+                init.copy_(module.A_log, torch.log(A))
+            if not getattr(module.D, "_is_hf_initialized", False):
+                init.ones_(module.D)
+            if not getattr(module.dt_bias, "_is_hf_initialized", False):
+                dt = torch.exp(
+                    torch.rand(self.config.mamba_num_heads)
+                    * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                    + math.log(self.config.time_step_min)
+                ).clamp(min=self.config.time_step_floor)
+
+                # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+                inv_dt = dt + torch.log(-torch.expm1(-dt))
+                with torch.no_grad():
+                    init.copy_(module.dt_bias, inv_dt)
         elif isinstance(module, NemotronHTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             init.zeros_(module.e_score_correction_bias)
@@ -1000,7 +1005,7 @@ def _init_weights(self, module):

         if isinstance(module, nn.Linear):
             if module.bias is not None:
-                if not getattr(module.bias, "_no_reinit", False):
+                if not getattr(module.bias, "_is_hf_initialized", False):
                     init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight, std=self.config.initializer_range)
@@ -1014,10 +1019,12 @@ def _init_weights(self, module):
             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
             for name, p in module.named_parameters():
                 if name == "out_proj.weight":
+                    # Skip checkpoint-loaded weights so a post-load safety
+                    # pass of `_init_weights` doesn't silently overwrite them.
+                    if getattr(p, "_is_hf_initialized", False):
+                        continue
                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-                    # We need to reinit p since this code could be called multiple times
-                    # Having just p *= scale would repeatedly scale it down
                     init.kaiming_uniform_(p, a=math.sqrt(5))
                     with torch.no_grad():
                         p_new = p / math.sqrt(self.config.num_hidden_layers)

src/transformers/models/nemotron_h/modular_nemotron_h.py

Lines changed: 26 additions & 19 deletions
@@ -327,22 +327,27 @@ def _init_weights(self, module):
         """Initialize the weights."""
         super()._init_weights(module)
         if isinstance(module, NemotronHMamba2Mixer):
-            # Initialize A_log and D parameters
-            A = torch.arange(1, self.config.mamba_num_heads + 1)
-            init.copy_(module.A_log, torch.log(A))
-            init.ones_(module.D)
-
-            dt = torch.exp(
-                torch.rand(self.config.mamba_num_heads)
-                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
-                + math.log(self.config.time_step_min)
-            ).clamp(min=self.config.time_step_floor)
-
-            # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
-            inv_dt = dt + torch.log(-torch.expm1(-dt))
-            with torch.no_grad():
-                init.copy_(module.dt_bias, inv_dt)
-            module.dt_bias._no_reinit = True
+            # Only re-initialise params that were NOT loaded from a checkpoint.
+            # `_is_hf_initialized` is set by `from_pretrained` on each loaded
+            # parameter; without this guard a post-load safety pass of
+            # `_init_weights` would overwrite checkpoint values of
+            # A_log / D / dt_bias with fresh random draws.
+            if not getattr(module.A_log, "_is_hf_initialized", False):
+                A = torch.arange(1, self.config.mamba_num_heads + 1)
+                init.copy_(module.A_log, torch.log(A))
+            if not getattr(module.D, "_is_hf_initialized", False):
+                init.ones_(module.D)
+            if not getattr(module.dt_bias, "_is_hf_initialized", False):
+                dt = torch.exp(
+                    torch.rand(self.config.mamba_num_heads)
+                    * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                    + math.log(self.config.time_step_min)
+                ).clamp(min=self.config.time_step_floor)
+
+                # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+                inv_dt = dt + torch.log(-torch.expm1(-dt))
+                with torch.no_grad():
+                    init.copy_(module.dt_bias, inv_dt)
         elif isinstance(module, NemotronHTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             init.zeros_(module.e_score_correction_bias)
@@ -353,7 +358,7 @@ def _init_weights(self, module):

         if isinstance(module, nn.Linear):
             if module.bias is not None:
-                if not getattr(module.bias, "_no_reinit", False):
+                if not getattr(module.bias, "_is_hf_initialized", False):
                     init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight, std=self.config.initializer_range)
@@ -367,10 +372,12 @@ def _init_weights(self, module):
             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
             for name, p in module.named_parameters():
                 if name == "out_proj.weight":
+                    # Skip checkpoint-loaded weights so a post-load safety
+                    # pass of `_init_weights` doesn't silently overwrite them.
+                    if getattr(p, "_is_hf_initialized", False):
+                        continue
                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-                    # We need to reinit p since this code could be called multiple times
-                    # Having just p *= scale would repeatedly scale it down
                     init.kaiming_uniform_(p, a=math.sqrt(5))
                     with torch.no_grad():
                         p_new = p / math.sqrt(self.config.num_hidden_layers)
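Aside on the `inv_dt = dt + torch.log(-torch.expm1(-dt))` line kept in both files: it is the closed-form inverse of softplus, i.e. inv_dt = log(exp(dt) - 1), so applying softplus to the stored bias recovers the sampled dt. A standalone sanity check (the 0.001 / 0.1 bounds below are placeholder values standing in for `config.time_step_min` / `config.time_step_max`, not values taken from this commit):

    import math
    import torch
    import torch.nn.functional as F

    # Sample dt the same way _init_weights does, with placeholder bounds.
    dt = torch.exp(
        torch.rand(8) * (math.log(0.1) - math.log(0.001)) + math.log(0.001)
    ).clamp(min=1e-4)

    inv_dt = dt + torch.log(-torch.expm1(-dt))   # == log(exp(dt) - 1)
    assert torch.allclose(F.softplus(inv_dt), dt, atol=1e-6)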
