nvidia
/

NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16

Text Generation

Model card Files and versions

Update modeling_nemotron_h.py

#2

by jaeminh - opened 16 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

Files changed (1) hide show

modeling_nemotron_h.py +2 -1

modeling_nemotron_h.py CHANGED Viewed

@@ -852,7 +852,8 @@ class NemotronHMOE(nn.Module):
                 final_hidden_states.index_add_(0, token_indices, weighted_output)
             else:
                 # Local empty expert: no-op compute that still marks params as used.
-                dummy_out = expert(torch.zeros_like(hidden_states[0]).unsqueeze(0).to(final_hidden_states.dtype))
+                expert_dtype = expert.down_proj.weight.dtype
+                dummy_out = expert(torch.zeros_like(hidden_states[0]).unsqueeze(0).to(expert_dtype))
                 final_hidden_states = final_hidden_states + dummy_out
         # in original deepseek, the output of the experts are gathered once we leave this module