Skip to content

Commit 1a6f2b4

Browse files
committed
Add weight tying logic to LM head, since Lingua does not tie weights.
Signed-off-by: Cory Ye <cye@nvidia.com>
1 parent 45649e3 commit 1a6f2b4

File tree

4 files changed

+31
-13
lines changed

4 files changed

+31
-13
lines changed

bionemo-recipes/models/llama3/modeling_llama_te.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -465,10 +465,16 @@ def __init__(
465465
tp_size=config.tp_size,
466466
)
467467
if config.tensor_parallel:
468-
# If using tensor parallelism, the head weights have already been tied
469-
# to the embedding weights. Just set the tensor parallel group for TE.
470-
# No parameter quantization either, so no need for weight_mesh.
471-
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
468+
if config.tie_word_embeddings:
469+
# Head weights have already been tied to the embedding weights.
470+
# Just set the tensor parallel group for TE.
471+
# No parameter quantization either, so no need for weight_mesh.
472+
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
473+
else:
474+
# Head weights are not tied to the embedding weights. Need to
475+
# wrap the LM head weight as a DTensor with TE.
476+
# No parameter quantization either, so no need for weight_mesh.
477+
self.lm_head.set_device_mesh(tp_mesh=self.tp_mesh)
472478

473479
# Initialize weights and apply final processing. Ties weights.
474480
self.post_init()

bionemo-recipes/recipes/llama3_native_te/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ bionemo-framework repository. You can download a zipped directory of this folder
1818

1919
| Model | BF16 | FP8<sup>[1]</sup> | THD Input Format | FP8 with THD Input Format | MXFP8<sup>[2]</sup> | Context Parallelism | Tensor Parallelism |
2020
| ---------------------------------------- | ---- | ----------------- | ---------------- | ------------------------- | ------------------- | ------------------- | ------------------ |
21-
| [Llama 3](../../models/llama3/README.md) ||||||| 🚧 |
21+
| [Llama 3](../../models/llama3/README.md) ||||||| |
2222

2323
✅: Supported <br/>
2424
🚧: Under development <br/>

bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,16 @@ def __init__(
471471
tp_size=config.tp_size,
472472
)
473473
if config.tensor_parallel:
474-
# If using tensor parallelism, the head weights have already been tied
475-
# to the embedding weights. Just set the tensor parallel group for TE.
476-
# No parameter quantization either, so no need for weight_mesh.
477-
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
474+
if config.tie_word_embeddings:
475+
# Head weights have already been tied to the embedding weights.
476+
# Just set the tensor parallel group for TE.
477+
# No parameter quantization either, so no need for weight_mesh.
478+
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
479+
else:
480+
# Head weights are not tied to the embedding weights. Need to
481+
# wrap the LM head weight as a DTensor with TE.
482+
# No parameter quantization either, so no need for weight_mesh.
483+
self.lm_head.set_device_mesh(tp_mesh=self.tp_mesh)
478484

479485
# Initialize weights and apply final processing. Ties weights.
480486
self.post_init()

bionemo-recipes/recipes/opengenome2_llama_native_te/modeling_llama_te.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,16 @@ def __init__(
471471
tp_size=config.tp_size,
472472
)
473473
if config.tensor_parallel:
474-
# If using tensor parallelism, the head weights have already been tied
475-
# to the embedding weights. Just set the tensor parallel group for TE.
476-
# No parameter quantization either, so no need for weight_mesh.
477-
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
474+
if config.tie_word_embeddings:
475+
# Head weights have already been tied to the embedding weights.
476+
# Just set the tensor parallel group for TE.
477+
# No parameter quantization either, so no need for weight_mesh.
478+
self.lm_head.set_tensor_parallel_group(self.tp_mesh.get_group())
479+
else:
480+
# Head weights are not tied to the embedding weights. Need to
481+
# wrap the LM head weight as a DTensor with TE.
482+
# No parameter quantization either, so no need for weight_mesh.
483+
self.lm_head.set_device_mesh(tp_mesh=self.tp_mesh)
478484

479485
# Initialize weights and apply final processing. Ties weights.
480486
self.post_init()

0 commit comments

Comments
 (0)