Back out "Re-apply D101260086: Android unified error reporting" (#19137) #582
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CUDA performance benchmarking workflow: exports HuggingFace models with the
# ExecuTorch CUDA backend, benchmarks them on GPU runners, and uploads the
# results to S3 and the benchmark dashboard.
name: cuda-perf

on:
  push:
    branches:
      - main
      - release/*
    tags:
      - ciflow/cuda-perf/*
  pull_request:
    # Only run on PRs that touch this workflow or the benchmark tooling.
    paths:
      - .github/workflows/cuda-perf.yml
      - .ci/scripts/cuda_benchmark.py
      - .ci/scripts/cuda_perf_prompts/**
  # Manual runs may narrow the model/quantization matrix and run count.
  workflow_dispatch:
    inputs:
      models:
        description: Models to be benchmarked (comma-separated HuggingFace model IDs)
        required: false
        type: string
      quantizations:
        description: Quantization types (comma-separated)
        required: false
        type: string
      num_runs:
        description: Number of benchmark runs per model
        required: false
        type: string
        default: "50"

# One in-flight run per PR (or per sha on push); the trailing event-name term
# keeps manual dispatches in their own group so pushes do not cancel them.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
jobs:
  # Build the benchmark matrix (model x quantization) as a JSON "include"
  # object consumed by the downstream jobs via fromJson().
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      # v4: checkout@v3 runs on the deprecated Node 16 runtime.
      - uses: actions/checkout@v4
        with:
          submodules: 'false'
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4'
          ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
          NUM_RUNS: ${{ inputs.num_runs || '50' }}
          # Route user-controlled dispatch inputs through env vars instead of
          # interpolating ${{ }} directly into the script body, which would
          # allow shell script injection (GitHub Actions hardening guidance).
          INPUT_MODELS: ${{ inputs.models }}
          INPUT_QUANTIZATIONS: ${{ inputs.quantizations }}
        run: |
          set -eux
          MODELS="${INPUT_MODELS}"
          QUANTIZATIONS="${INPUT_QUANTIZATIONS}"
          # Use all models/quantizations unless overridden by workflow_dispatch
          if [ -z "$MODELS" ]; then
            MODELS="$ALL_MODELS"
          fi
          if [ -z "$QUANTIZATIONS" ]; then
            QUANTIZATIONS="$ALL_QUANTIZATIONS"
          fi
          # Split models and quantizations into arrays
          IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
          IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"
          # Generate benchmark configs (skip invalid model/quant combinations)
          CONFIGS='{"include":['
          FIRST=true
          for MODEL in "${MODEL_ARRAY[@]}"; do
            for QUANT in "${QUANT_ARRAY[@]}"; do
              # Qwen3.5 MoE only supports quantized-int4-tile-packed
              if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then
                continue
              fi
              if [ "$FIRST" = true ]; then
                FIRST=false
              else
                CONFIGS+=','
              fi
              # Sanitize model name for use in artifact paths
              MODEL_SAFE=$(echo "$MODEL" | sed 's/\//_/g')
              CONFIGS+="{\"model\":\"$MODEL\",\"quant\":\"$QUANT\",\"model_safe\":\"$MODEL_SAFE\",\"num_runs\":\"$NUM_RUNS\"}"
            done
          done
          CONFIGS+=']}'
          # Quote $GITHUB_OUTPUT so a path containing spaces cannot break the redirect.
          echo "benchmark_configs=$CONFIGS" >> "$GITHUB_OUTPUT"
          echo "Generated benchmark configs:"
          # Pretty-print (and implicitly validate) the generated JSON.
          echo "$CONFIGS" | python -m json.tool
  # Export each (model, quantization) pair to ExecuTorch CUDA artifacts
  # (model.pte + aoti_cuda_blob.ptd, plus model-specific tokenizer /
  # preprocessor files) on a GPU runner, and upload them as a per-config
  # artifact for benchmark-cuda to download.
  export-models:
    name: export-models
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      # Let the remaining matrix entries finish even if one export fails.
      fail-fast: false
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      # Qwen3.5 MoE (35B) needs an A100; everything else fits on a g5 (A10G).
      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
      # For PRs, build the PR head commit rather than the merge ref.
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux
        echo "::group::Setup ExecuTorch"
        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
        export USE_MKL=OFF
        ./install_executorch.sh
        echo "::endgroup::"
        echo "::group::Setup Huggingface"
        pip install -U "huggingface_hub[cli]<1.0" accelerate
        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        # Pin optimum-executorch to the commit recorded in the CI pin file.
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"
        echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
        OUTPUT_DIR="model_artifacts"
        mkdir -p "$OUTPUT_DIR"
        bash .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" "$OUTPUT_DIR"
        # Move artifacts to RUNNER_ARTIFACT_DIR for upload
        mv "$OUTPUT_DIR"/* "${RUNNER_ARTIFACT_DIR}/"
        ls -lah "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"
| benchmark-cuda: | |
| name: benchmark-cuda | |
| needs: | |
| - set-parameters | |
| - export-models | |
| if: always() | |
| uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main | |
| permissions: | |
| id-token: write | |
| contents: read | |
| strategy: | |
| matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} | |
| fail-fast: false | |
| with: | |
| timeout: 90 | |
| runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} | |
| gpu-arch-type: cuda | |
| gpu-arch-version: "12.6" | |
| use-custom-docker-registry: false | |
| submodules: recursive | |
| download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} | |
| upload-artifact: results-${{ matrix.model_safe }}-${{ matrix.quant }} | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| script: | | |
| set -eux | |
| echo "::group::Setup environment" | |
| ./install_requirements.sh | |
| pip list | |
| echo "::endgroup::" | |
| echo "::group::Prepare model artifacts" | |
| mkdir -p model_artifacts | |
| cp "${RUNNER_ARTIFACT_DIR}/model.pte" model_artifacts/model.pte | |
| cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" model_artifacts/aoti_cuda_blob.ptd | |
| # Copy additional files if they exist | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/whisper_preprocessor.pte" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/tekken.json" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/tekken.json" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/poem.wav" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/poem.wav" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/tokenizer.model" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/tokenizer.model" model_artifacts/ | |
| fi | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/test_audio.wav" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/test_audio.wav" model_artifacts/ | |
| fi | |
| # Copy tokenizer files | |
| for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do | |
| if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then | |
| cp "${RUNNER_ARTIFACT_DIR}/$file" model_artifacts/ | |
| fi | |
| done | |
| ls -lah model_artifacts/ | |
| echo "::endgroup::" | |
| echo "::group::Build runner" | |
| bash .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model }}" "${{ matrix.quant }}" model_artifacts | |
| echo "::endgroup::" | |
| echo "::group::Running benchmark for ${{ matrix.model }} (${{ matrix.quant }}) with ${{ matrix.num_runs }} runs" | |
| export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH | |
| # Get GPU name using nvidia-smi | |
| GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1) | |
| echo "Detected GPU: $GPU_NAME" | |
| # Get CUDA driver version | |
| CUDA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1) | |
| echo "CUDA Driver Version: $CUDA_DRIVER_VERSION" | |
| # Create results directory (separate from model artifacts) | |
| RESULTS_DIR="benchmark_results" | |
| mkdir -p "$RESULTS_DIR" | |
| # Determine model name and runner command based on model | |
| case "${{ matrix.model }}" in | |
| mistralai/Voxtral-Mini-3B-2507) | |
| RUNNER="cmake-out/examples/models/voxtral/voxtral_runner" | |
| PREPROCESSOR="model_artifacts/voxtral_preprocessor.pte" | |
| TOKENIZER="model_artifacts/tekken.json" | |
| AUDIO="model_artifacts/poem.wav" | |
| RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" | |
| MODEL_NAME="voxtral_${{ matrix.quant }}" | |
| ;; | |
| openai/whisper-*) | |
| RUNNER="cmake-out/examples/models/whisper/whisper_runner" | |
| PREPROCESSOR="model_artifacts/whisper_preprocessor.pte" | |
| AUDIO="model_artifacts/output.wav" | |
| RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --audio_path $AUDIO --processor_path $PREPROCESSOR --temperature 0" | |
| MODEL_NAME=$(echo "${{ matrix.model }}" | sed 's/openai\///')_${{ matrix.quant }} | |
| ;; | |
| google/gemma-3-4b-it) | |
| RUNNER="cmake-out/examples/models/gemma3/gemma3_e2e_runner" | |
| IMAGE="docs/source/_static/img/et-logo.png" | |
| RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0" | |
| MODEL_NAME="gemma3_${{ matrix.quant }}" | |
| ;; | |
| nvidia/parakeet-tdt) | |
| RUNNER="cmake-out/examples/models/parakeet/parakeet_runner" | |
| AUDIO="model_artifacts/test_audio.wav" | |
| TOKENIZER="model_artifacts/tokenizer.model" | |
| RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER" | |
| MODEL_NAME="parakeet_${{ matrix.quant }}" | |
| ;; | |
| SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) | |
| RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner" | |
| TOKENIZER="model_artifacts/tokenizer.json" | |
| # Use a checked-in long prompt (>1000 tokens) for benchmarking. A | |
| # static, meaningful prompt avoids the degenerate / repetitive | |
| # outputs that can result from synthetic prompts built by | |
| # repeating the same sentence. | |
| PROMPT_FILE=".ci/scripts/cuda_perf_prompts/qwen3_5_moe_long_prompt.txt" | |
| RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file $PROMPT_FILE --max_new_tokens 512 --temperature 0" | |
| MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}" | |
| ;; | |
| *) | |
| echo "Error: Unsupported model '${{ matrix.model }}'" | |
| exit 1 | |
| ;; | |
| esac | |
| # Run benchmark using cuda_benchmark.py | |
| python .ci/scripts/cuda_benchmark.py \ | |
| --runner_command "$RUNNER_CMD" \ | |
| --model_name "$MODEL_NAME" \ | |
| --num_runs "${{ matrix.num_runs }}" \ | |
| --output_json "$RESULTS_DIR/benchmark_results.json" \ | |
| --output_v3 "$RESULTS_DIR/benchmark_results_v3.json" \ | |
| --model "${{ matrix.model }}" \ | |
| --quantization "${{ matrix.quant }}" \ | |
| --git_sha "${{ github.sha }}" \ | |
| --workflow_run_id "${{ github.run_id }}" \ | |
| --workflow_run_url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ | |
| --gpu_name "$GPU_NAME" \ | |
| --cuda_driver_version "$CUDA_DRIVER_VERSION" | |
| # Save additional metadata | |
| cat > "$RESULTS_DIR/metadata.json" <<EOF | |
| { | |
| "model": "${{ matrix.model }}", | |
| "quantization": "${{ matrix.quant }}", | |
| "num_runs": ${{ matrix.num_runs }}, | |
| "runner": "$RUNNER", | |
| "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", | |
| "git_sha": "${{ github.sha }}", | |
| "workflow_run_id": "${{ github.run_id }}", | |
| "workflow_run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| } | |
| EOF | |
| # Only copy benchmark results to RUNNER_ARTIFACT_DIR for upload (not the entire model) | |
| # First, clean up the downloaded model artifacts from RUNNER_ARTIFACT_DIR | |
| rm -rf "${RUNNER_ARTIFACT_DIR}"/* | |
| # Then copy only the benchmark result JSON files | |
| cp "$RESULTS_DIR"/*.json "${RUNNER_ARTIFACT_DIR}/" | |
| echo "Benchmark results prepared for upload:" | |
| ls -lah "${RUNNER_ARTIFACT_DIR}" | |
| echo "::endgroup::" | |
| upload-benchmark-results: | |
| needs: | |
| - benchmark-cuda | |
| if: always() | |
| runs-on: ubuntu-22.04 | |
| environment: upload-benchmark-results | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - uses: actions/checkout@v3 | |
| with: | |
| submodules: false | |
| - name: Setup Python | |
| uses: actions/setup-python@v4 | |
| with: | |
| python-version: '3.10' | |
| - name: Download all benchmark results | |
| uses: actions/download-artifact@v4 | |
| with: | |
| pattern: results-* | |
| path: all_results/ | |
| - name: Process and display results | |
| shell: bash | |
| run: | | |
| set -eux | |
| echo "::group::Benchmark Results Summary" | |
| for RESULT_DIR in all_results/results-*/; do | |
| if [ -f "$RESULT_DIR/benchmark_results.json" ]; then | |
| echo "" | |
| echo "================================" | |
| echo "Results from: $(basename "$RESULT_DIR")" | |
| echo "================================" | |
| # Display benchmark results (mean performance) | |
| cat "$RESULT_DIR/benchmark_results.json" | python -m json.tool | |
| # Display metadata | |
| if [ -f "$RESULT_DIR/metadata.json" ]; then | |
| echo "" | |
| echo "--- Metadata ---" | |
| cat "$RESULT_DIR/metadata.json" | python -m json.tool | |
| fi | |
| echo "" | |
| fi | |
| done | |
| echo "::endgroup::" | |
| - name: Authenticate with AWS | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results | |
| role-duration-seconds: 18000 | |
| aws-region: us-east-1 | |
| - name: Upload to S3 | |
| shell: bash | |
| env: | |
| S3_BUCKET: gha-artifacts | |
| S3_PREFIX: executorch-cuda-perf/${{ github.run_id }}/${{ github.run_attempt }} | |
| run: | | |
| set -eux | |
| pip install awscli | |
| echo "Uploading benchmark results to S3..." | |
| aws s3 sync all_results/ "s3://${S3_BUCKET}/${S3_PREFIX}/" \ | |
| --exclude "*" \ | |
| --include "*.json" \ | |
| --include "*.log" | |
| echo "Results uploaded to: s3://${S3_BUCKET}/${S3_PREFIX}/" | |
| - name: Prepare v3 results for dashboard upload | |
| shell: bash | |
| run: | | |
| set -eux | |
| echo "::group::Prepare v3 results" | |
| mkdir -p benchmark-results/v3 | |
| # Collect all v3 results into a single directory | |
| for RESULT_DIR in all_results/results-*/; do | |
| if [ -f "$RESULT_DIR/benchmark_results_v3.json" ]; then | |
| # Generate unique filename based on directory name | |
| FILENAME=$(basename "$RESULT_DIR") | |
| cp "$RESULT_DIR/benchmark_results_v3.json" "benchmark-results/v3/${FILENAME}.json" | |
| echo "✓ Copied $FILENAME v3 results" | |
| fi | |
| done | |
| echo "V3 results prepared:" | |
| ls -lah benchmark-results/v3/ | |
| echo "::endgroup::" | |
| - name: Upload benchmark results to dashboard | |
| uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main | |
| with: | |
| benchmark-results-dir: benchmark-results/v3 | |
| dry-run: false | |
| schema-version: v3 | |
| github-token: ${{ secrets.GITHUB_TOKEN }} |