Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ ciflow_push_tags:
- ciflow/tutorials
- ciflow/rocm
- ciflow/4xh100
- ciflow/xpu
9 changes: 9 additions & 0 deletions .github/scripts/ci_test_xpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
pytest -v -s *.py
224 changes: 224 additions & 0 deletions .github/workflows/pr-test-xpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
push:
tags:
- ciflow/xpu/*

permissions:
id-token: write
contents: read

concurrency:
group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

jobs:
test:
# Don't run on forked repos or empty test matrix
# if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
timeout-minutes: 60
runs-on: linux.idc.xpu
env:
DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

- name: Checkout Torchao
uses: actions/checkout@v4

- name: Clean all stopped docker containers
if: always()
shell: bash
run: |
# Prune all stopped containers.
# If other runner is pruning on this node, will skip.
nprune=$(ps -ef | grep -c "docker container prune")
if [[ $nprune -eq 1 ]]; then
docker container prune -f
fi

- name: Runner health check system info
if: always()
shell: bash
run: |
cat /etc/os-release || true
cat /etc/apt/sources.list.d/oneAPI.list || true
cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
whoami

- name: Runner health check xpu-smi
if: always()
shell: bash
run: |
timeout 30 xpu-smi discovery || true

- name: Runner health check GPU count
if: always()
shell: bash
run: |
ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ $ngpu -eq 0 ]]; then
echo "Error: Failed to detect any GPUs on the runner"
echo "$msg"
exit 1
fi

- name: Runner diskspace health check
uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
if: always()

- name: Runner health check disconnect on failure
if: ${{ failure() }}
shell: bash
run: |
killall runsvc.sh

- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

- name: XPU set GPU_FLAG
shell: bash
run: |
# Add render group for container creation.
render_gid=`cat /etc/group | grep render | cut -d: -f3`
echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1

- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ env.DOCKER_IMAGE }}
docker-build-dir: pytorch/pytorch/.ci/docker

- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*:}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

- name: Runner health check GPU count
if: always()
shell: bash
run: |
ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ $ngpu -eq 0 ]]; then
echo "Error: Failed to detect any GPUs on the runner"
echo "$msg"
exit 1
fi

- name: Test
id: test
env:
TEST_COMMAND: .github/scripts/ci_test_xpu.sh
DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
PR_NUMBER: ${{ github.event.pull_request.number }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
timeout-minutes: 60
run: |
set -x

# detached container should get cleaned up by teardown_ec2_linux
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e GITHUB_REPOSITORY \
-e GITHUB_WORKFLOW \
-e GITHUB_JOB \
-e GITHUB_RUN_ID \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e BRANCH \
-e SHA1 \
--user $(id -u):$(id -g) \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
--privileged \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# save container name for later step
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
# jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
docker exec -t "${container_name}" sh -c "bash ${env.TEST_COMMAND}"

- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R jenkins:jenkins test"

- name: Collect backtraces from coredumps (if any)
if: always()
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

- name: Stop container before exit
if: always()
run: |
# Workaround for multiple runners on same IDC node
docker stop "${{ env.CONTAINER_NAME }}"

- name: Store Core dumps on GitHub
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: failure()
with:
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*

- name: Teardown XPU
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can reuse the action in pytorch directly

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, this is literally ported from pytorch

uses: pytorch/pytorch/.github/actions/teardown-xpu@main
Loading