1919
2020import json
2121import math
22+ import warnings
23+ from contextlib import nullcontext
2224from dataclasses import asdict , dataclass
23- from typing import Optional
25+ from typing import ContextManager , Optional
2426
2527import torch
2628import torch .nn as nn
29+ import transformer_engine .common .recipe
2730import transformer_engine .pytorch
2831from torch .nn import CrossEntropyLoss
2932from transformer_engine .pytorch .attention .rope import RotaryPositionEmbedding
@@ -50,6 +53,9 @@ class CodonFMConfig:
5053 # TE-specific options
5154 qkv_weight_interleaved : bool = True
5255 fuse_qkv_params : bool = True
56+ # Layer-wise precision options
57+ layer_precision : list [str | None ] | None = None
58+ use_quantized_model_init : bool = False
5359
5460 def __post_init__ (self ):
5561 """Validate configuration."""
@@ -60,6 +66,15 @@ def __post_init__(self):
6066 )
6167 if self .hidden_act not in ("gelu" , "relu" , "silu" ):
6268 raise ValueError (f"hidden_act must be one of: gelu, relu, silu, got { self .hidden_act } " )
69+ if self .layer_precision is not None :
70+ if len (self .layer_precision ) != self .num_hidden_layers :
71+ raise ValueError (
72+ f"layer_precision must be a list of length { self .num_hidden_layers } , "
73+ f"got { len (self .layer_precision )} "
74+ )
75+ for precision in self .layer_precision :
76+ if precision not in {"fp8" , "fp4" , None }:
77+ raise ValueError (f'layer_precision element must be "fp8", "fp4", or None, got { precision !r} ' )
6378
6479 def save_json (self , path : str ):
6580 """Save config as JSON."""
@@ -142,44 +157,111 @@ def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
142157class CodonFMEncoder (nn .Module ):
143158 """CodonFM encoder using standard TransformerEngine TransformerLayer."""
144159
def __init__(
    self,
    config: CodonFMConfig,
    fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
    fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
):
    """Build the encoder stack of TE TransformerLayers.

    Args:
        config: Model configuration.
        fp8_recipe: The FP8 recipe for the encoder.
        fp4_recipe: The FP4 recipe for the encoder.

    Raises:
        RuntimeError: If the provided recipes are inconsistent with
            ``config.layer_precision``.
    """
    super().__init__()
    self.config = config
    self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = fp8_recipe
    self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = fp4_recipe

    if self.config.layer_precision is None:
        # No explicit per-layer precision: derive a default from whichever
        # recipe was handed in, or reject ambiguous combinations.
        if fp8_recipe is not None and fp4_recipe is not None:
            raise RuntimeError("Both FP8 and FP4 recipes provided, but no layer precision provided.")
        if fp8_recipe is not None:
            warnings.warn("No layer precision provided, using FP8 recipe for all layers.", UserWarning)
            self.config.layer_precision = ["fp8"] * self.config.num_hidden_layers
        elif fp4_recipe is not None:
            raise RuntimeError(
                "FP4 recipe provided but no layer_precision configured. "
                "Set layer_precision explicitly when using FP4."
            )

    # Any layer tagged "fp4" requires an FP4 recipe up front; there is no
    # default-recipe fallback for FP4 (unlike FP8).
    if self.config.layer_precision is not None and "fp4" in self.config.layer_precision and fp4_recipe is None:
        raise RuntimeError("layer_precision contains 'fp4' entries but no fp4_recipe was provided.")

    # Honor a caller-set meta default device (deferred init); otherwise CPU.
    device = "meta" if torch.get_default_device() == torch.device("meta") else "cpu"

    stack: list[transformer_engine.pytorch.TransformerLayer] = []
    for layer_idx in range(config.num_hidden_layers):
        # Optionally create this layer's weights directly in quantized form.
        with self.get_autocast_context(layer_idx, init=True):
            stack.append(
                transformer_engine.pytorch.TransformerLayer(
                    hidden_size=config.hidden_size,
                    ffn_hidden_size=config.intermediate_size,
                    num_attention_heads=config.num_attention_heads,
                    layernorm_epsilon=config.layer_norm_eps,
                    hidden_dropout=config.hidden_dropout_prob,
                    attention_dropout=config.attention_probs_dropout_prob,
                    qkv_weight_interleaved=config.qkv_weight_interleaved,
                    layer_number=layer_idx + 1,
                    layer_type="encoder",
                    self_attn_mask_type="padding",
                    activation=config.hidden_act,
                    attn_input_format=config.attn_input_format,
                    seq_length=config.max_position_embeddings,
                    num_gqa_groups=config.num_attention_heads,
                    fuse_qkv_params=config.fuse_qkv_params,
                    window_size=(-1, -1),
                    device=device,
                )
            )

    self.layers = nn.ModuleList(stack)
    self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
182222
223+ def get_autocast_context (
224+ self , layer_number : int | None , init : bool = False , outer : bool = False
225+ ) -> ContextManager :
226+ """Return the appropriate TE autocast context manager for a given layer.
227+
228+ Handles both the quantized_model_init during layer creation and the te.autocast() during forward.
229+
230+ Args:
231+ layer_number: The 0-indexed layer number.
232+ init: Whether to return a ``quantized_model_init`` context for layer initialization.
233+ outer: Whether to return a global te.autocast() context to wrap the entire encoder stack.
234+ """
235+ if self .config .layer_precision is None :
236+ return nullcontext ()
237+
238+ if outer :
239+ if "fp8" not in self .config .layer_precision :
240+ return nullcontext ()
241+ if self ._fp8_recipe is None :
242+ warnings .warn ("No FP8 recipe provided, using default recipe." , UserWarning )
243+ return transformer_engine .pytorch .autocast (enabled = True , recipe = self ._fp8_recipe )
244+
245+ precision = self .config .layer_precision [layer_number ]
246+ recipe = {"fp8" : self ._fp8_recipe , "fp4" : self ._fp4_recipe }.get (precision )
247+
248+ if init and self .config .use_quantized_model_init :
249+ if precision == "fp4" and recipe is None :
250+ raise RuntimeError ("No FP4 recipe provided, but layer precision is set to FP4." )
251+ if precision in ("fp8" , "fp4" ):
252+ return transformer_engine .pytorch .quantized_model_init (recipe = recipe )
253+ return nullcontext ()
254+
255+ if precision == "fp8" :
256+ if recipe is None :
257+ warnings .warn ("No FP8 recipe provided, using default recipe." , UserWarning )
258+ return transformer_engine .pytorch .autocast (enabled = True , recipe = recipe )
259+ if precision == "fp4" :
260+ if recipe is None :
261+ raise RuntimeError ("No FP4 recipe provided, but layer precision is set to FP4." )
262+ return transformer_engine .pytorch .autocast (enabled = True , recipe = recipe )
263+ return transformer_engine .pytorch .autocast (enabled = False )
264+
183265 def forward (
184266 self ,
185267 hidden_states : torch .Tensor ,
@@ -203,23 +285,25 @@ def forward(
203285 te_rope_emb = self .rotary_embeddings (max_seq_len = self .config .max_position_embeddings )
204286 te_rope_emb = te_rope_emb .to (hidden_states .device , non_blocking = True )
205287
206- for layer_module in self .layers :
207- if self .config .attn_input_format == "bshd" :
208- hidden_states = layer_module (
209- hidden_states ,
210- attention_mask = attention_mask ,
211- rotary_pos_emb = te_rope_emb ,
212- )
213- else :
214- hidden_states = layer_module (
215- hidden_states ,
216- attention_mask = None ,
217- rotary_pos_emb = te_rope_emb ,
218- cu_seqlens_q = kwargs .get ("cu_seq_lens_q" ),
219- cu_seqlens_kv = kwargs .get ("cu_seq_lens_k" ),
220- max_seqlen_q = kwargs .get ("max_length_q" ),
221- max_seqlen_kv = kwargs .get ("max_length_k" ),
222- )
288+ with self .get_autocast_context (None , outer = True ):
289+ for layer_idx , layer_module in enumerate (self .layers ):
290+ with self .get_autocast_context (layer_idx ):
291+ if self .config .attn_input_format == "bshd" :
292+ hidden_states = layer_module (
293+ hidden_states ,
294+ attention_mask = attention_mask ,
295+ rotary_pos_emb = te_rope_emb ,
296+ )
297+ else :
298+ hidden_states = layer_module (
299+ hidden_states ,
300+ attention_mask = None ,
301+ rotary_pos_emb = te_rope_emb ,
302+ cu_seqlens_q = kwargs .get ("cu_seq_lens_q" ),
303+ cu_seqlens_kv = kwargs .get ("cu_seq_lens_k" ),
304+ max_seqlen_q = kwargs .get ("max_length_q" ),
305+ max_seqlen_kv = kwargs .get ("max_length_k" ),
306+ )
223307
224308 return hidden_states
225309
@@ -236,18 +320,20 @@ def __init__(self, config: CodonFMConfig):
236320 super ().__init__ ()
237321 device = "meta" if torch .get_default_device () == torch .device ("meta" ) else "cpu"
238322
239- self .dense = transformer_engine .pytorch .Linear (
240- config .hidden_size ,
241- config .hidden_size ,
242- device = device ,
243- )
244- self .layer_norm_linear = transformer_engine .pytorch .LayerNormLinear (
245- config .hidden_size ,
246- config .vocab_size ,
247- bias = True ,
248- eps = config .layer_norm_eps ,
249- device = device ,
250- )
323+ # Disable quantization for the LM head to avoid numerical instability.
324+ with transformer_engine .pytorch .quantized_model_init (enabled = False ):
325+ self .dense = transformer_engine .pytorch .Linear (
326+ config .hidden_size ,
327+ config .hidden_size ,
328+ device = device ,
329+ )
330+ self .layer_norm_linear = transformer_engine .pytorch .LayerNormLinear (
331+ config .hidden_size ,
332+ config .vocab_size ,
333+ bias = True ,
334+ eps = config .layer_norm_eps ,
335+ device = device ,
336+ )
251337
252338 def forward (self , hidden_states : torch .Tensor ) -> torch .Tensor :
253339 """Forward pass.
@@ -258,25 +344,34 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
258344 Returns:
259345 Logits of shape [..., vocab_size].
260346 """
261- x = self .dense (hidden_states )
262- x = torch .nn .functional .gelu (x )
263- x = self .layer_norm_linear (x )
347+ # Keep the LM head in higher precision to avoid numerical instability.
348+ with transformer_engine .pytorch .autocast (enabled = False ):
349+ x = self .dense (hidden_states )
350+ x = torch .nn .functional .gelu (x )
351+ x = self .layer_norm_linear (x )
264352 return x
265353
266354
267355class CodonFMForMaskedLM (nn .Module ):
268356 """CodonFM model for masked language modeling with TransformerEngine layers."""
269357
def __init__(
    self,
    config: CodonFMConfig,
    fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
    fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
):
    """Initialize the model.

    Args:
        config: Model configuration.
        fp8_recipe: The FP8 recipe for the encoder.
        fp4_recipe: The FP4 recipe for the encoder.
    """
    super().__init__()
    self.config = config
    # Embeddings -> encoder -> LM head; the recipes affect only the encoder.
    self.embeddings = CodonEmbedding(config)
    self.encoder = CodonFMEncoder(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
    self.lm_head = CodonFMLMHead(config)
    self._init_weights()
0 commit comments