Skip to content

Commit 70b55b0

Browse files
authored
finetuning batch split (#994)
Signed-off-by: Ananth Subramaniam <ansubramania@nvidia.com>
1 parent 761dc73 commit 70b55b0

File tree

4 files changed

+352
-72
lines changed

4 files changed

+352
-72
lines changed
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Finetuning-specific data handling utilities."""
16+
17+
from typing import Any, Iterator
18+
19+
import torch
20+
21+
22+
def split_batch_into_microbatches(
    batch: dict[str, Any], num_microbatches: int, enforce_divisible: bool = True
) -> list[dict[str, Any]]:
    """Split a batch dictionary into microbatches.

    Takes a global batch (e.g., [16, 240] for tokens) and splits it into
    num_microbatches smaller batches (e.g., 4 batches of [4, 240]).

    Args:
        batch: Dictionary containing tensors with batch_size = num_microbatches * micro_batch_size
        num_microbatches: Number of microbatches to split into
        enforce_divisible: Whether to enforce batch_size % num_microbatches == 0

    Returns:
        List of microbatch dictionaries, each containing the same keys as the input batch

    Raises:
        ValueError: If the batch contains no tensors, or if ``enforce_divisible``
            is True and the batch size is not divisible by ``num_microbatches``.

    Example:
        >>> batch = {'tokens': torch.rand(16, 240), 'labels': torch.rand(16, 240)}
        >>> microbatches = split_batch_into_microbatches(batch, num_microbatches=4)
        >>> len(microbatches)  # 4
        >>> microbatches[0]['tokens'].shape  # torch.Size([4, 240])
    """
    # Identify tensor items vs other items (like metadata)
    tensor_items = {k: v for k, v in batch.items() if isinstance(v, torch.Tensor)}
    other_items = {k: v for k, v in batch.items() if not isinstance(v, torch.Tensor)}

    if len(tensor_items) == 0:
        raise ValueError("Batch must contain at least one tensor")

    # Get batch size from first tensor
    first_key = next(iter(tensor_items.keys()))
    batch_size = tensor_items[first_key].shape[0]

    if enforce_divisible and batch_size % num_microbatches != 0:
        raise ValueError(
            f"Batch size {batch_size} is not divisible by num_microbatches {num_microbatches}. "
            f"Cannot split evenly into microbatches."
        )

    # Split all tensors along batch dimension (dim=0).
    split_tensors = {
        key: torch.tensor_split(tensor, num_microbatches, dim=0)
        for key, tensor in tensor_items.items()
    }

    # BUGFIX: torch.tensor_split produces *uneven* chunks when batch_size is not
    # divisible by num_microbatches (the first batch_size % num_microbatches chunks
    # get one extra row). The previous code sliced list-valued metadata with a
    # uniform batch_size // num_microbatches stride, silently misaligning metadata
    # with its tensor rows in the enforce_divisible=False case. Derive the slice
    # boundaries from the actual tensor chunk sizes so both stay in lockstep.
    reference_chunks = split_tensors[first_key]
    boundaries = [0]
    for chunk in reference_chunks:
        boundaries.append(boundaries[-1] + chunk.shape[0])

    # Create microbatch dictionaries
    microbatches = []
    for i in range(num_microbatches):
        # Add split tensors
        microbatch = {key: splits[i] for key, splits in split_tensors.items()}

        start_idx, end_idx = boundaries[i], boundaries[i + 1]

        # Handle non-tensor items (metadata, etc.)
        for key, value in other_items.items():
            if isinstance(value, list) and len(value) == batch_size:
                # If it's a list with length matching batch size, split it too
                microbatch[key] = value[start_idx:end_idx]
            else:
                # Otherwise copy as-is (e.g., global metadata)
                microbatch[key] = value

        microbatches.append(microbatch)

    return microbatches
90+
91+
92+
def prepare_finetuning_batch(
    data_iterator: Iterator,
    num_microbatches: int,
    default_seq_length: int,
    seq_key: str = "tokens",
) -> tuple[Iterator, int]:
    """Fetch one global batch and expose it as an iterator of microbatches.

    Finetuning-specific data flow: pull the entire global batch from the
    dataloader iterator, read the (possibly dynamic) sequence length off the
    ``seq_key`` tensor, then split the batch into ``num_microbatches`` pieces
    that all share that sequence length.

    Args:
        data_iterator: Iterator that yields global batches (e.g., from DataLoader with batch sampler)
        num_microbatches: Number of microbatches to split each global batch into
        default_seq_length: Fallback sequence length if it cannot be extracted from batch
        seq_key: Key in batch dict containing the sequence tensor (default: 'tokens')

    Returns:
        Tuple of:
        - Iterator over microbatches (each microbatch is a dict with same keys as global batch)
        - Sequence length extracted from the global batch (or default_seq_length if not found)

    Example:
        >>> # DataLoader yields global batch of shape [16, 240]
        >>> microbatch_iter, seq_len = prepare_finetuning_batch(
        ...     data_iterator=iter(dataloader),
        ...     num_microbatches=4,
        ...     default_seq_length=2048
        ... )
        >>> seq_len  # 240 (extracted from batch)
        >>> batch1 = next(microbatch_iter)
        >>> batch1['tokens'].shape  # torch.Size([4, 240])
    """
    # Pull the whole global batch off the dataloader in one step.
    global_batch = next(data_iterator)

    # Prefer the sequence length encoded in the batch itself; fall back to the
    # configured default when the key is absent or not a tensor.
    seq_source = global_batch[seq_key] if seq_key in global_batch else None
    if isinstance(seq_source, torch.Tensor):
        seq_length = seq_source.size(1)
    else:
        seq_length = default_seq_length

    # Split and hand back an iterator so megatron-core can consume one
    # microbatch per next() call, along with the extracted sequence length.
    microbatches = split_batch_into_microbatches(global_batch, num_microbatches)
    return iter(microbatches), seq_length

src/megatron/bridge/data/samplers.py

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -250,29 +250,32 @@ def __init__(
250250
self._global_batch_size_on_this_data_parallel_rank = self._num_micro_batches * self.micro_batch_size
251251

252252
def __len__(self) -> int:
253-
"""Return the number of microbatches this sampler will yield.
253+
"""Return the number of batches this sampler will yield.
254254
255-
Since we yield one microbatch per global batch × num_micro_batches,
256-
multiply by num_micro_batches to get the total number of yields.
255+
Since we now yield the full global batch at once (not split into microbatches),
256+
this returns the number of global batches.
257257
"""
258258
num_available_samples = self.total_samples - self.consumed_samples % self.total_samples
259259
if self.drop_last:
260260
num_global_batches = num_available_samples // self._global_batch_size
261261
else:
262262
num_global_batches = (num_available_samples + self._global_batch_size - 1) // self._global_batch_size
263263

264-
# Each global batch yields num_micro_batches microbatches
265-
return num_global_batches * self._num_micro_batches
264+
# Each call to __iter__ yields one global batch
265+
return num_global_batches
266266

267267
def __iter__(self) -> Iterator[list[int]]:
268-
"""Yields lists of indices for each microbatch assigned to this rank.
268+
"""Yields lists of indices for the full global batch assigned to this rank.
269269
270270
Accumulates a full global batch, then distributes indices in interleaved fashion
271-
to data parallel ranks, yielding one microbatch at a time for megatron-core compatibility.
272-
273-
This ensures all samples in a global batch can be padded to the same max length
274-
(important for variable-length finetuning) while being compatible with megatron-core's
275-
microbatch loop that calls next() multiple times per training step.
271+
to data parallel ranks, yielding ALL indices for this rank at once. This allows
272+
the DataLoader's collate_fn to receive the full global batch and determine optimal
273+
padding across all samples before the training loop splits into microbatches.
274+
275+
This is essential for variable-length finetuning where we need to:
276+
1. Compute max_length across the entire global batch
277+
2. Pad all samples to the same length
278+
3. Then split into microbatches with consistent sequence length
276279
"""
277280
batch = []
278281
# Last batch will be dropped if drop_last is True
@@ -290,11 +293,9 @@ def __iter__(self) -> Iterator[list[int]]:
290293
]
291294
assert len(all_indices) == self._global_batch_size_on_this_data_parallel_rank
292295

293-
# Yield one microbatch at a time
294-
for microbatch_idx in range(self._num_micro_batches):
295-
start = microbatch_idx * self.micro_batch_size
296-
end = start + self.micro_batch_size
297-
yield all_indices[start:end]
296+
# Yield ALL indices at once (not split into microbatches)
297+
# The training loop will handle splitting after collation
298+
yield all_indices
298299

299300
batch = []
300301

@@ -306,12 +307,8 @@ def __iter__(self) -> Iterator[list[int]]:
306307
num_pad = self._global_batch_size // self.data_parallel_size - len(all_indices)
307308
all_indices = all_indices + [-1] * num_pad
308309

309-
# Yield one microbatch at a time
310-
for microbatch_idx in range(self._num_micro_batches):
311-
start = microbatch_idx * self.micro_batch_size
312-
end = start + self.micro_batch_size
313-
if start < len(all_indices):
314-
yield all_indices[start:end]
310+
# Yield ALL indices at once
311+
yield all_indices
315312

316313

317314
class RandomSeedDataset(Dataset):

src/megatron/bridge/training/train.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -522,16 +522,31 @@ def train_step(
522522
overlap_param_gather=cfg.ddp.overlap_param_gather,
523523
)
524524

525+
# Handle finetuning vs pretraining data consumption
526+
seq_length = model_config.seq_length # Default for pretraining
527+
forward_backward_data_iterator = data_iterator # Default for pretraining
528+
529+
if cfg.dataset.dataloader_type == "batch":
530+
# Finetuning path to support variable-length sequences
531+
from megatron.bridge.data.finetuning import prepare_finetuning_batch
532+
533+
forward_backward_data_iterator, seq_length = prepare_finetuning_batch(
534+
data_iterator=data_iterator,
535+
num_microbatches=get_num_microbatches(),
536+
default_seq_length=model_config.seq_length,
537+
seq_key="tokens",
538+
)
539+
525540
# Forward pass.
526541
forward_backward_func = get_forward_backward_func()
527542
losses_reduced = forward_backward_func(
528543
forward_step_func=forward_step_func,
529-
data_iterator=data_iterator,
544+
data_iterator=forward_backward_data_iterator,
530545
model=model,
531546
num_microbatches=get_num_microbatches(),
532-
seq_length=model_config.seq_length,
547+
seq_length=seq_length,
533548
micro_batch_size=train_config.micro_batch_size,
534-
decoder_seq_length=model_config.seq_length,
549+
decoder_seq_length=seq_length,
535550
forward_only=False,
536551
)
537552
should_checkpoint, should_exit, exit_code = rerun_state_machine.should_checkpoint_and_exit()
@@ -1074,14 +1089,20 @@ def _dummy_train_step(
10741089
global_state: Global state containing configuration
10751090
train_data_iterator: Iterator over training data
10761091
"""
1092+
cfg = global_state.cfg
10771093
num_microbatches = get_num_microbatches()
10781094
rerun_state_machine = get_rerun_state_machine()
10791095

10801096
while rerun_state_machine.should_run_forward_backward(train_data_iterator):
1081-
for _ in range(num_microbatches):
1082-
if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage():
1083-
if train_data_iterator is not None:
1097+
if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage():
1098+
if train_data_iterator is not None:
1099+
if cfg.dataset.dataloader_type == "batch":
1100+
# Finetuning: Consume global batch once
10841101
_ = next(train_data_iterator)
1102+
else:
1103+
# Pretrain: Consume microbatches one at a time
1104+
for _ in range(num_microbatches):
1105+
_ = next(train_data_iterator)
10851106

10861107

10871108
def _handle_mxfp8_param_buffer_copy(

0 commit comments

Comments
 (0)