@@ -22,6 +22,9 @@ The Mixtral implementation natively supports the following TransformerEngine-pro
 
 ### Quick start: convert and run
 
+> **Note:** The snippets below use bare imports (e.g., `from convert import ...`). Run them from the
+> `bionemo-recipes/models/mixtral` directory, or install dependencies first with `pip install -r requirements.txt`.
+
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -44,7 +47,7 @@ inputs = tokenizer("The quick brown fox", return_tensors="pt")
 inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
 with torch.no_grad():
-    output_ids = model_te.generate(**inputs, max_new_tokens=16, use_cache=False)
+    output_ids = model_te.generate(**inputs, max_new_tokens=16)
 
 print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
 ```
@@ -57,6 +60,9 @@ inference, and back to Hugging Face Transformers format for sharing and deployme
 
 ### Converting from HF Transformers to TE
 
+> **Note:** Run from the `bionemo-recipes/models/mixtral` directory, or install dependencies first with
+> `pip install -r requirements.txt`.
+
 ```python
 from transformers import AutoModelForCausalLM
 
@@ -69,6 +75,9 @@ model_te.save_pretrained("/path/to/te_checkpoint")
 
 ### Converting from TE back to HF Transformers
 
+> **Note:** Run from the `bionemo-recipes/models/mixtral` directory, or install dependencies first with
+> `pip install -r requirements.txt`.
+
 ```python
 from convert import convert_mixtral_te_to_hf
 from modeling_mixtral_te import NVMixtralForCausalLM
@@ -80,9 +89,18 @@ model_hf.save_pretrained("/path/to/hf_checkpoint")
 
 ### Validating Converted Models
 
-To validate the converted models, refer to the commands in [Inference Examples](#inference-examples) above to load and
-test both the original and converted models to ensure loss and logit values are similar. Additionally, refer to the
-golden value tests in [test_modeling_mixtral.py](tests/test_modeling_mixtral.py).
+The golden value tests in [test_modeling_mixtral.py](tests/test_modeling_mixtral.py) verify that the converted TE model
+produces numerically equivalent outputs to the original Hugging Face model. Specifically:
+
+- `test_golden_values_bshd`: loads both models, runs a forward pass on the same input, and asserts that logits and
+  loss match within tolerance.
+- `test_round_trip_conversion`: converts HF → TE → HF and verifies the round-tripped model produces identical outputs.
+
+To run these tests locally:
+
+```bash
+./ci/scripts/recipes_local_test.py bionemo-recipes/models/mixtral/
+```
 
 ## Developer Guide
 
@@ -94,6 +112,18 @@ To run tests locally, run `recipes_local_test.py` from the repository root with
 ./ci/scripts/recipes_local_test.py bionemo-recipes/models/mixtral/
 ```
 
+### Exporting to Hugging Face Hub
+
+The model directory includes an `export.py` script that bundles all files needed for Hugging Face Hub distribution. To
+create the export bundle, run from the model directory:
+
+```bash
+python export.py
+```
+
+Before publishing, validate the export by running the local test suite via
+[recipes_local_test.py](../../ci/scripts/recipes_local_test.py).
+
 ### Development container
 
 To use the provided devcontainer, use "Dev Containers: Reopen in Container" from the VSCode menu, and choose the