Trigger unit tests for docker images upload workflow

xibinliu · xibinliu · commit 0e378d20f78f · 2026-01-22T23:24:26.000Z
diff --git a/.github/workflows/UploadDockerImages.yml b/.github/workflows/UploadDockerImages.yml
@@ -15,7 +15,7 @@
 # This workflow builds and pushes MaxText images for both TPU and GPU devices.
 # It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
 
-name: Build Images
+name: Build and Test Images
 
 on:
   schedule:
@@ -128,3 +128,133 @@ jobs:
       dockerfile: ${{ matrix.dockerfile }}
       maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
       image_date: ${{ needs.setup.outputs.image_date }}
+
+  pre-training-images-tpu-unit-tests:  # non-integration TPU tests, run once per pre-training image
+    needs: [setup, tpu-pre-training]
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_jax_stable, maxtext_jax_nightly]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    with:
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      maxtext_installed: true
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      device_type: tpu
+      device_name: v6e-4
+      cloud_runner: linux-x86-ct6e-180-4tpu
+      container_resource_option: '--privileged'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      pytest_marker: "not cpu_only and not gpu_only and not integration_test"
+
+  pre-training-images-tpu-integration-tests:  # integration_test-marked TPU tests, run once per pre-training image
+    needs: [setup, tpu-pre-training]
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_jax_stable, maxtext_jax_nightly]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    with:
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      maxtext_installed: true
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      device_type: tpu
+      device_name: v6e-4
+      cloud_runner: linux-x86-ct6e-180-4tpu
+      container_resource_option: '--privileged'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      pytest_marker: "not cpu_only and not gpu_only and integration_test"
+
+  post-training-images-tpu-unit-tests:  # non-integration TPU tests, run once per post-training image
+    needs: [setup, tpu-post-training]
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_post_training_stable, maxtext_post_training_nightly]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    with:
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      maxtext_installed: true
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      device_type: tpu
+      device_name: v6e-4
+      cloud_runner: linux-x86-ct6e-180-4tpu
+      container_resource_option: '--privileged'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      pytest_marker: "not cpu_only and not gpu_only and not integration_test"
+
+  post-training-images-tpu-integration-tests:  # integration_test-marked TPU tests, run once per post-training image
+    needs: [setup, tpu-post-training]
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_post_training_stable, maxtext_post_training_nightly]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    with:
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      maxtext_installed: true
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      device_type: tpu
+      device_name: v6e-4
+      cloud_runner: linux-x86-ct6e-180-4tpu
+      container_resource_option: '--privileged'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      pytest_marker: "not cpu_only and not gpu_only and integration_test"
+
+  pre-training-images-gpu-unit-tests:  # non-integration GPU tests, run once per GPU pre-training image
+    needs: [setup, gpu-pre-training]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
+    with:
+      device_type: cuda12  # was matrix.cuda, which this matrix never defines (renders as an empty string); cuda12 is assumed - TODO confirm against build_and_test_maxtext.yml
+      device_name: a100-40gb-4
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      cloud_runner: linux-x86-a2-48-a100-4gpu
+      pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
+      xla_python_client_mem_fraction: 0.65
+      tf_force_gpu_allow_growth: true
+      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      maxtext_installed: true
+
+  pre-training-images-gpu-integration-tests:  # integration_test-marked GPU tests, run once per GPU pre-training image
+    needs: [setup, gpu-pre-training]
+    uses: ./.github/workflows/run_tests_against_package.yml
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
+    with:
+      device_type: cuda12  # was matrix.cuda, which this matrix never defines (renders as an empty string); cuda12 is assumed - TODO confirm against build_and_test_maxtext.yml
+      device_name: a100-40gb-4
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      cloud_runner: linux-x86-a2-48-a100-4gpu
+      pytest_marker: 'not cpu_only and not tpu_only and integration_test'
+      xla_python_client_mem_fraction: 0.65
+      tf_force_gpu_allow_growth: true
+      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      maxtext_installed: true
+
+  post-training-images-tpu-notebook-tests:  # runs the notebook workflow once per post-training image
+    needs: [setup, tpu-post-training]
+    strategy:
+      fail-fast: false  # a failure on one image must not cancel the run on the other
+      matrix:
+        image_name: [maxtext_post_training_stable, maxtext_post_training_nightly]
+    uses: ./.github/workflows/run_jupyter_notebooks.yml
+    with:
+      base_image: ${{ matrix.image_name }}:${{ needs.setup.outputs.image_date }}  # tag is the setup job's image_date output
+      maxtext_installed: true  # use the MaxText baked into the image; skips checkout and wheel install
+      device_type: tpu
+      device_name: v6e-4
+      cloud_runner: linux-x86-ct6e-180-4tpu
+    secrets:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
diff --git a/.github/workflows/build_and_test_maxtext.yml b/.github/workflows/build_and_test_maxtext.yml
@@ -113,7 +113,7 @@ jobs:
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       cloud_runner: linux-x86-ct6e-180-4tpu
     secrets:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -131,7 +131,7 @@ jobs:
       device_type: cpu
       device_name: X64
       cloud_runner: linux-x86-n2-16
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       pytest_marker: 'cpu_only'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
@@ -151,7 +151,7 @@ jobs:
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
       xla_python_client_mem_fraction: 0.75
@@ -170,7 +170,7 @@ jobs:
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and integration_test'
       xla_python_client_mem_fraction: 0.75
@@ -189,7 +189,7 @@ jobs:
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
       xla_python_client_mem_fraction: 0.75
@@ -208,7 +208,7 @@ jobs:
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and integration_test'
       xla_python_client_mem_fraction: 0.75
@@ -228,7 +228,7 @@ jobs:
     with:
       device_type: ${{ matrix.cuda }}
       device_name: a100-40gb-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-${{ matrix.cuda }}:${{ matrix.image_type }}
       cloud_runner: linux-x86-a2-48-a100-4gpu
       pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
       xla_python_client_mem_fraction: 0.65
@@ -248,7 +248,7 @@ jobs:
     with:
       device_type: ${{ matrix.cuda }}
       device_name: a100-40gb-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-${{ matrix.cuda }}:${{ matrix.image_type }}
       cloud_runner: linux-x86-a2-48-a100-4gpu
       pytest_marker: 'not cpu_only and not tpu_only and integration_test'
       xla_python_client_mem_fraction: 0.65
diff --git a/.github/workflows/run_jupyter_notebooks.yml b/.github/workflows/run_jupyter_notebooks.yml
@@ -25,12 +25,17 @@ on:
       device_name:
         required: true
         type: string
-      image_type:
+      base_image:  # <name>:<tag>; interpolated unconditionally into the job's container image, so it cannot be empty
-        required: false
+        required: true
         type: string
       cloud_runner:
         required: false
         type: string
+      # When true, MaxText is taken from the container image: the checkout, wheel download and install steps are skipped.
+      maxtext_installed:
+        required: false
+        type: boolean
+        default: false
     secrets:
       HF_TOKEN:
         required: true
@@ -41,14 +46,17 @@ jobs:
   run:
     runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
     container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
+      image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        if: ${{ !inputs.maxtext_installed }}  # skipped when the image already carries the code
       - name: Download the MaxText wheel
+        if: ${{ !inputs.maxtext_installed }}  # the wheel is only needed when installing into a fresh venv
         uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           name: maxtext-wheel
       - name: Install MaxText and Dependencies
+        if: ${{ !inputs.maxtext_installed }}
         shell: bash
         run: |
           python3 -m uv venv --seed
@@ -59,10 +67,6 @@ jobs:
           uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
           uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
 
-          # Install dependencies for running notebooks
-          uv pip install papermill ipykernel ipywidgets
-          .venv/bin/python3 -m ipykernel install --user --name maxtext_venv
-
           # Install Tunix for post-training notebooks
           uv pip install git+https://github.com/google/tunix
           
@@ -82,9 +86,24 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
+          if [ "${{ inputs.maxtext_installed }}" != "true" ]; then
+            # MaxText lives in the job-local virtualenv created by the install step.
+            PYTHON_EXE=".venv/bin/python3"
+            PAPERMILL_EXE=".venv/bin/papermill"
+          else
+            # Code is baked into the image under /deps (see the Dockerfile); GHA starts in an empty workspace.
+            cd /deps
+            PYTHON_EXE="python3"
+            PAPERMILL_EXE="papermill"
+          fi
+
           MAXTEXT_REPO_ROOT=$(pwd)
           MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/MaxText/examples"
 
+          # Install dependencies for running notebooks
+          $PYTHON_EXE -m pip install papermill ipykernel ipywidgets
+          $PYTHON_EXE -m ipykernel install --user --name maxtext_venv
+
           for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
             filename=$(basename "$notebook")
             output_name="${filename%.ipynb}_output.ipynb"
@@ -93,7 +112,7 @@ jobs:
             echo "Running $filename ..."
             echo "------------------------------------------------------"
 
-            .venv/bin/papermill "$notebook" "$output_name" -k maxtext_venv
+            $PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
           done
       - name: Upload Outputs
         if: always()
diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml
@@ -25,8 +25,8 @@ on:
       device_name:
         required: true
         type: string
-      image_type:
-        required: false
+      base_image:  # <name>:<tag> of the test image; the job prefixes it with gcr.io/tpu-prod-env-multipod/
+        required: true
         type: string
       pytest_marker:
         required: true
@@ -58,7 +58,7 @@ jobs:
   run:
     runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
     container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-tpu:${{ inputs.image_type != '' && inputs.image_type }}
+      image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
       env:
         XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
         TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml