---
name: PR Test ROCm 7.2 (AMD)

# Dynamic run-name so /rerun-stage commands can locate the run by URL.
# "[stage-name] sha" when pr_head_sha is supplied (fork PRs), "[stage-name]" when only a
# stage is supplied, and an empty string (GitHub's default run name) otherwise.
run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }}

on:
  schedule:
    - cron: '30 17 * * *'
  # push/pull_request triggers are currently disabled; kept for reference.
  # push:
  #   branches: [ main ]
  #   paths:
  #     - "python/**"
  #     - "scripts/ci/**"
  #     - "test/**"
  #     - "sgl-kernel/**"
  #     - ".github/workflows/pr-test-amd-rocm720.yml"
  #     - "docker/rocm.Dockerfile"
  # pull_request:
  #   branches: [ main ]
  #   paths:
  #     - "python/**"
  #     - "scripts/ci/**"
  #     - "test/**"
  #     - "sgl-kernel/**"
  #     - ".github/workflows/pr-test-amd-rocm720.yml"
  #     - "docker/rocm.Dockerfile"
  workflow_dispatch:
    inputs:
      # Dropdown selection of a single stage; the free-text target_stage below wins
      # whenever it is non-empty (see the `inputs.target_stage || inputs.target_stage_select`
      # expressions used throughout the jobs).
      target_stage_select:
        description: 'Select a stage to run from dropdown (leave empty for auto-detect)'
        required: false
        type: choice
        default: ''
        options:
          - ''
          - sgl-kernel-unit-test-amd
          - sgl-kernel-unit-test-2-gpu-amd
          - stage-a-test-1-gpu-small-amd
          - jit-kernel-unit-test-amd
          - stage-b-test-1-gpu-small-amd
          - stage-b-test-1-gpu-small-amd-nondeterministic
          - stage-b-test-1-gpu-small-amd-mi35x
          - stage-b-test-1-gpu-large-amd
          - stage-b-test-2-gpu-large-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          - stage-b-test-large-8-gpu-disaggregation-amd
          - stage-c-test-4-gpu-amd
      target_stage:
        description: 'Or type comma-separated stage names (overrides dropdown if non-empty)'
        required: false
        type: string
        default: ''
      pr_head_sha:
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        required: false
        type: string
        default: ''
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        type: string
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        type: boolean
        default: true
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        type: string
        default: ''
      run_all_tests:
        description: 'Run all tests (for releasing or testing purpose)'
        required: false
        type: boolean
        default: false
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        type: string
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        type: boolean
        default: true

env:
  # Exposed to every job; empty unless the aiter_ref input is provided.
  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}

concurrency:
  # A workflow_call with run_all_tests=true gets its own group per run id so it cannot
  # collide with direct schedule/workflow_dispatch runs. run_all_tests is used for the
  # detection (rather than github.event_name) because github.event_name is inherited
  # from the calling workflow.
  group: pr-test-amd-rocm720-${{ inputs.run_all_tests && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }}
  cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' }}
|
|
|
|
jobs:
|
|
# Gate job: delegates entirely to the reusable pr-gate workflow and forwards the
# caller's secrets to it.
call-gate:
  uses: ./.github/workflows/pr-gate.yml
  secrets: inherit
|
|
# Computes which test groups should run. Each output is 'true' either because the
# paths filter matched or because run_all_tests mode is active.
check-changes:
  needs: [call-gate]
  runs-on: ubuntu-latest
  outputs:
    # When the filter step is skipped (run_all_tests mode) its outputs are empty, so
    # the `||` falls through to the run-mode value.
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Determine run mode
      id: run-mode
      # Expression values are passed through env instead of being interpolated into
      # the script text, so the shell only ever sees them as data.
      env:
        RUN_ALL_TESTS: ${{ inputs.run_all_tests }}
        EVENT_NAME: ${{ github.event_name }}
      run: |
        # Run every suite when the run_all_tests input is true; otherwise fall back to
        # path-based filtering.
        # Note: github.event_name is inherited from the caller, so it cannot be used to
        # detect workflow_call; the run_all_tests input is checked instead.
        if [[ "${RUN_ALL_TESTS}" == "true" ]]; then
          echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
          echo "Run mode: ALL TESTS (run_all_tests=${RUN_ALL_TESTS})"
        else
          echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
          echo "Run mode: FILTERED (triggered by ${EVENT_NAME})"
        fi

    - name: Detect file changes
      id: filter
      # Skipped in run_all_tests mode; the job outputs then come from run-mode.
      if: steps.run-mode.outputs.run_all_tests != 'true'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**/!(*.md)"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
            - "scripts/ci/amd/*"
            - "scripts/ci/utils/*"
            - "test/**/!(*.md)"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          sgl_kernel:
            - "sgl-kernel/**/*.!(md|txt)"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**/*.!(md|ipynb)"
            - "python/sglang/cli/**"
            - "python/sglang/jit_kernel/diffusion/**"
            - "python/sglang/jit_kernel/tests/diffusion/**"
            - "python/sglang/jit_kernel/benchmark/diffusion/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
|
|
|
|
# =============================================== sgl-kernel ====================================================
|
|
# sgl-kernel pytest files executed inside the ci_sglang container on the
# linux-mi325-1gpu-sglang runner.
sgl-kernel-unit-test-amd:
  needs: [check-changes]
  # Runs when this stage is named in target_stage / target_stage_select, or, when no
  # stage was requested, when check-changes reports sgl_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 14
      # One pytest invocation per file; test_eagle_utils.py lives in tests/speculative.
      run: |
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
        docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py
|
|
|
|
# sgl-kernel all-reduce determinism pytest files, run on the
# linux-mi325-2gpu-sglang runner.
sgl-kernel-unit-test-2-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is named in target_stage / target_stage_select, or, when no
  # stage was requested, when check-changes reports sgl_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-2gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_deterministic_custom_allreduce.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_nccl_allreduce_determinism.py
|
|
|
|
# =============================================== primary ====================================================
|
|
|
|
# Stage A: runs the stage-a-test-1-gpu-small-amd suite of test/run_suite.py on the
# linux-mi325-1gpu-sglang runner.
stage-a-test-1-gpu-small-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 10
      # --continue-on-error is appended only when the continue_on_error input is true.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-gpu-small-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# JIT kernel unit test (test_store_cache.py) on the linux-mi325-1gpu-sglang runner.
jit-kernel-unit-test-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # check-changes reports jit_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.jit_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run JIT kernel unit tests
      timeout-minutes: 10
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout" python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py
|
|
|
|
# Stage B (1 GPU, small): the suite is split into 14 auto-partitions, one matrix
# entry per partition, on the linux-mi325-1gpu-sglang runner.
stage-b-test-1-gpu-small-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage B (1 GPU, small, nondeterministic suite): single unpartitioned run on the
# linux-mi325-1gpu-sglang runner.
stage-b-test-1-gpu-small-amd-nondeterministic:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-nondeterministic,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage B (1 GPU, small) on the linux-mi35x-gpu-1 runner.
stage-b-test-1-gpu-small-amd-mi35x:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage B (1 GPU, large): two auto-partitions on the linux-mi325-1gpu-sglang runner.
stage-b-test-1-gpu-large-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage B (2 GPU, large): two auto-partitions on the linux-mi325-2gpu-sglang runner.
stage-b-test-2-gpu-large-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-2-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-2gpu-sglang]
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-2-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Diffusion (multimodal_gen) server tests, 1-GPU suite, split into 4 partitions that
# run one at a time on the linux-mi325-1gpu-sglang runner.
multimodal-gen-test-1-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports multimodal_gen,
  # main_package or sgl_kernel changes. multimodal_gen is included because the
  # main_package filter excludes python/sglang/multimodal_gen/, so diffusion-only
  # changes would otherwise never trigger this job.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        (
          (needs.check-changes.outputs.multimodal_gen == 'true') ||
          (needs.check-changes.outputs.main_package == 'true') ||
          (needs.check-changes.outputs.sgl_kernel == 'true')
        )
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      # Keep the number of entries in sync with --total-partitions below.
      part: [0, 1, 2, 3]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the pattern names a CUDA wheel and no job in the visible part of
    # this workflow uploads artifacts - confirm this step is still needed here.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
        docker exec ci_sglang pip install amdsmi

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        # This directory persists across container restarts on the self-hosted runner
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        # The rm runs through `bash -c` inside the container so that the *.so glob is
        # expanded against the container filesystem; `docker exec` does not start a
        # shell, so an unquoted glob would be handled by the host shell instead and
        # reach rm as a literal "*.so" that matches nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (1-GPU)
      timeout-minutes: 60
      run: |
        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 4 \
          -k "not flux_2"

        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h
|
|
|
|
# Diffusion (multimodal_gen) server tests, 2-GPU suite, split into 2 partitions that
# run one at a time on the linux-mi325-2gpu-sglang runner.
multimodal-gen-test-2-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports multimodal_gen,
  # main_package or sgl_kernel changes. multimodal_gen is included because the
  # main_package filter excludes python/sglang/multimodal_gen/, so diffusion-only
  # changes would otherwise never trigger this job.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        (
          (needs.check-changes.outputs.multimodal_gen == 'true') ||
          (needs.check-changes.outputs.main_package == 'true') ||
          (needs.check-changes.outputs.sgl_kernel == 'true')
        )
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
    matrix:
      runner: [linux-mi325-2gpu-sglang]
      part: [0, 1]  # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the pattern names a CUDA wheel and no job in the visible part of
    # this workflow uploads artifacts - confirm this step is still needed here.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
        docker exec ci_sglang pip install amdsmi

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        # The rm runs through `bash -c` inside the container so that the *.so glob is
        # expanded against the container filesystem; `docker exec` does not start a
        # shell, so an unquoted glob would be handled by the host shell instead and
        # reach rm as a literal "*.so" that matches nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (2-GPU tests) ==="
        # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
        for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (2-GPU)
      timeout-minutes: 80
      run: |
        # AMD CI: All 2-GPU tests including LoRA
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 2-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2

        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h
|
|
|
|
|
|
# Stage C (4 GPU): single-partition suite with retries on the
# linux-mi325-4gpu-sglang runner. Ordered after the two listed stage-b jobs.
stage-c-test-4-gpu-amd:
  needs: [check-changes, stage-b-test-1-gpu-small-amd, stage-b-test-2-gpu-large-amd]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # none of the needed jobs failed or was cancelled and check-changes reports
  # main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-4-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-4gpu-sglang]
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      # Same script path as every other job in this workflow (scripts/ci/amd/).
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      # NOTE(review): SGLANG_USE_ROCM700A=1 is set in a ROCm 7.2 workflow - confirm
      # the flag is still intended for this image.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e NCCL_CUMEM_ENABLE=0 \
          -e NCCL_NVLS_ENABLE=0 \
          -e RCCL_MSCCL_ENABLE=0 \
          -e SGLANG_USE_ROCM700A=1 \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-4-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 1 \
          --timeout-per-file 1800 \
          --enable-retry \
          --max-attempts 2 \
          --retry-wait-seconds 120 \
          --retry-timeout-increase 0 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage C (8 GPU, large): three auto-partitions on the linux-mi325-8gpu-sglang
# runner, preceded by an RCCL multi-GPU communication check.
stage-c-test-large-8-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is named explicitly, or, when no stage was requested, when
  # nothing upstream failed or was cancelled and check-changes reports main_package
  # or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  env:
    RUNNER_LABELS: linux-mi325-8gpu-sglang
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-8gpu-sglang]
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1, 2]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Test RCCL multi-GPU communication
      timeout-minutes: 5
      # Launches the check across 8 processes with NCCL/RCCL debug logging enabled.
      run: |
        echo "Testing RCCL multi-GPU communication with debug info..."
        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Stage-C suite on the 8-GPU MI35x runner, sharded two ways via the `part`
# matrix axis (each shard passes its index as --auto-partition-id).
stage-c-test-large-8-gpu-amd-mi35x:
  needs:
    - check-changes
  # Runs when this stage is explicitly listed in target_stage /
  # target_stage_select. With no stage targeted, it runs only if no ancestor
  # job failed, the run was not cancelled, and check-changes reported
  # main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    # Let the other shard finish even if one of them fails.
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-8
      part:
        - 0
        - 1
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the run.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd-mi35x \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 2 \
          --timeout-per-file 3600 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# =============================================== Disaggregation ====================================================
|
|
# Disaggregation suite on the MI35x fabric runner. The job inspects the host's
# RDMA setup, starts the disaggregation CI container, re-checks RDMA inside it,
# runs an aiter RMSNorm op test as a pre-check, and then runs the suite.
stage-b-test-large-8-gpu-35x-disaggregation-amd:
  needs: [check-changes]
  # Targeted runs are matched under two spellings: the dropdown value
  # (stage-b-test-large-8-gpu-disaggregation-amd) and this job's own id
  # (stage-b-test-large-8-gpu-35x-disaggregation-amd). Previously only the
  # former matched, so requesting the stage by its job id silently skipped it.
  # With no stage targeted, the job runs only if no ancestor job failed, the
  # run was not cancelled, and check-changes reported main_package or
  # sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) ||
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-35x-disaggregation-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8.fabric]

  runs-on: ${{ matrix.runner }}

  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the run.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # Diagnostic only: `set +e` keeps the step going when a probe fails.
    # Section 5 exports SGLANG_TEST_RDMA_DEVICE (a comma-separated list of the
    # entries under /sys/class/infiniband, or empty) to later steps through
    # GITHUB_ENV.
    - name: Check Host RDMA Environment
      id: rdma_detect
      run: |
        set +e
        echo "=== Checking Host RDMA Environment ==="

        echo ""
        echo "=== 1. Ionic driver library check ==="
        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"

        echo ""
        echo "=== 2. Infiniband devices ==="
        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"

        echo ""
        echo "=== 3. ibv_devinfo ==="
        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"

        echo ""
        echo "=== 4. Kernel modules ==="
        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"

        echo ""
        echo "=== 5. Detect RDMA Devices for test environment ==="
        if [ -d "/sys/class/infiniband" ]; then
          RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
          echo "Detected RDMA Devices: $RDMA_DEVS"
          # Quote the env-file path so it is never word-split or globbed.
          echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> "$GITHUB_ENV"
        else
          echo "No RDMA devices found in /sys/class/infiniband"
          echo "SGLANG_TEST_RDMA_DEVICE=" >> "$GITHUB_ENV"
        fi

        echo ""
        echo "=== Host RDMA Check Complete ==="

    - name: Start Special Container
      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Informational check inside the container: a zero HCA count only prints
    # a warning. The script is single-quoted on the host so that $HCA_COUNT
    # and $(...) are expanded by the bash running inside the container.
    - name: Verify RDMA in Container
      run: |
        docker exec -u root ci_sglang bash -c '
          echo "=== Container RDMA Verification ==="
          echo "Device nodes:"
          ls -la /dev/infiniband/
          echo ""
          echo "Provider libraries:"
          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
          echo ""
          echo "HCA devices:"
          HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
          ibv_devinfo -list
          if [ "$HCA_COUNT" -gt 0 ]; then
            echo ""
            echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
          else
            echo ""
            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
          fi
        '

    - name: Run Aiter Op Test (RMSNorm)
      timeout-minutes: 10
      run: |
        echo "Running pre-check: test_rmsnorm2d.py"
        docker exec \
          -e MAX_JOBS=192 \
          ci_sglang \
          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

    # SGLANG_TEST_RDMA_DEVICE was written to GITHUB_ENV by the host RDMA step,
    # so it is already in this step's environment. Read it at runtime instead
    # of templating the value into the script text.
    - name: Run test_disaggregation
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e SGLANG_TEST_RDMA_DEVICE="$SGLANG_TEST_RDMA_DEVICE" \
          -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
|
|
|
|
# Final gate job: always runs, and fails if any job listed under `needs`
# ended in `failure` or `cancelled`. Any other result (including `skipped`)
# is accepted.
pr-test-amd-finish:
  needs:
    - call-gate
    - check-changes

    - sgl-kernel-unit-test-amd
    - sgl-kernel-unit-test-2-gpu-amd
    - multimodal-gen-test-1-gpu-amd
    - multimodal-gen-test-2-gpu-amd

    - stage-a-test-1-gpu-small-amd
    - jit-kernel-unit-test-amd
    - stage-b-test-1-gpu-small-amd
    - stage-b-test-1-gpu-small-amd-nondeterministic
    - stage-b-test-1-gpu-small-amd-mi35x
    - stage-b-test-1-gpu-large-amd
    - stage-b-test-2-gpu-large-amd
    - stage-b-test-large-8-gpu-35x-disaggregation-amd
    - stage-c-test-4-gpu-amd
    - stage-c-test-large-8-gpu-amd
    - stage-c-test-large-8-gpu-amd-mi35x
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # Pass the serialized `needs` context through the environment rather
        # than pasting it into the script text: a single quote inside any
        # upstream job output would otherwise end the shell literal and be
        # interpreted as shell code.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # The 'needs' context as a JSON string, supplied via the step env
        json_needs="$NEEDS_JSON"

        # Get a list of all job names from the JSON keys
        job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')

        for job in $job_names; do
          # For each job, extract its result
          result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')

          # Print the job name and its result
          echo "$job: $result"

          # Stop at the first failed or cancelled job; later jobs are not printed
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done

        # Reaching this point means no job failed or was cancelled
        # (skipped jobs are not treated as failures)
        echo "All jobs completed successfully"
        exit 0
|