429 lines
19 KiB
YAML
429 lines
19 KiB
YAML
name: Nightly Test (NPU)

# Triggers:
#   - schedule:          once a day (see cron below)
#   - pull_request:      only PRs against `main` that modify this workflow file
#   - workflow_dispatch: manual run, declares no inputs of its own
#   - workflow_call:     reusable entry point; all inputs are optional strings
on:
  schedule:
    # 18:00 UTC daily, which is 02:00 Beijing time (UTC+8).
    - cron: "0 18 * * *"
  pull_request:
    branches:
      - main
    paths:
      - '.github/workflows/nightly-test-npu.yml'
  workflow_dispatch:
  workflow_call:
    inputs:
      # Ref handed to actions/checkout by the test jobs; when empty they fall
      # back to `github.ref`.
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        type: string
        default: ''
      # NOTE(review): this value is normalized by `set-image-config`, but no
      # job condition visible in this file consults it - confirm whether the
      # filter is expected to take effect here.
      job_filter:
        description: 'Select which job to run (leave empty or "all" to run all jobs)'
        required: false
        type: string
        default: 'all'
      # Container image used by every nightly-*-npu-a3 job.
      image_a3:
        description: 'The a3 running docker image of the test task.'
        required: false
        type: string
        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11'
      # String flag: the exact value 'true' makes the jobs skip
      # scripts/ci/npu/npu_ci_install_dependency.sh.
      skip_install_flag:
        description: 'Indicates whether to skip the installation of sglang, defaulting to false.'
        required: false
        type: string
        default: 'false'
|
|
|
|
|
|
# Runs are grouped by the ref under test: the explicit `ref` input when one is
# supplied, otherwise the ref that triggered the run.
concurrency:
  group: nightly-test-npu-${{ inputs.ref || github.ref }}
  # NOTE(review): intended to avoid cancelling runs started through
  # workflow_call. Inside a called workflow the `github` context presumably
  # reflects the caller's event, so this may always evaluate to true - confirm
  # against the GitHub reusable-workflow documentation.
  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}
|
|
|
|
jobs:
|
|
# Normalizes the optional workflow inputs into job outputs so that downstream
# jobs read one consistent set of values regardless of how the run was started.
set-image-config:
  runs-on: ubuntu-latest
  outputs:
    ref: ${{ steps.set-vars.outputs.ref }}
    job_filter: ${{ steps.set-vars.outputs.job_filter }}
    image_a3: ${{ steps.set-vars.outputs.image_a3 }}
    skip_install_flag: ${{ steps.set-vars.outputs.skip_install_flag }}
  steps:
    # When triggered by PR, no inputs parameters are used. The latest community code is tested by default.
    - name: Set image config
      id: set-vars
      # Inputs are handed to the script through environment variables instead
      # of being expanded into the script text, so a value containing quotes or
      # shell metacharacters cannot alter the commands that run.
      env:
        REQUESTED_REF: ${{ inputs.ref }}
        REQUESTED_JOB_FILTER: ${{ inputs.job_filter }}
        REQUESTED_IMAGE_A3: ${{ inputs.image_a3 }}
        REQUESTED_SKIP_INSTALL: ${{ inputs.skip_install_flag }}
        DEFAULT_IMAGE_A3: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11'
      run: |
        # `${VAR:-default}` substitutes the default when VAR is unset or empty,
        # matching the previous `[ -z ... ]` checks. `ref` has no default: an
        # empty value is emitted as-is and the test jobs fall back to github.ref.
        {
          echo "ref=${REQUESTED_REF}"
          echo "job_filter=${REQUESTED_JOB_FILTER:-all}"
          echo "image_a3=${REQUESTED_IMAGE_A3:-${DEFAULT_IMAGE_A3}}"
          echo "skip_install_flag=${REQUESTED_SKIP_INSTALL:-false}"
        } >> "$GITHUB_OUTPUT"
|
|
|
|
# Runs the `nightly-1-npu-a3` suite of test/run_suite.py on the
# `linux-aarch64-a3-2` runner, split into two auto-partitions (one matrix entry
# per partition). Executes only in sgl-project/sglang or for pull_request runs.
nightly-1-npu-a3:
  needs: [set-image-config]
  if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
  runs-on: linux-aarch64-a3-2
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1]
  container:
    image: ${{ needs.set-image-config.outputs.image_a3 }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Explicit ref from the caller when given, otherwise the triggering ref.
        ref: ${{ needs.set-image-config.outputs.ref || github.ref }}

    - name: Install dependencies
      env:
        TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu"
        PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple"
        GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/"
      run: |
        # speed up by using infra cache services
        CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
        sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
        pip config set global.index-url http://${CACHING_URL}/pypi/simple
        pip config set global.trusted-host "${CACHING_URL}"

        # The flag is quoted so an empty value yields a valid test expression
        # instead of a `[` syntax error.
        if [ "${{ needs.set-image-config.outputs.skip_install_flag }}" != "true" ]; then
          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
        fi

        # copy required file from our daily cache
        cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
        # copy gsm8k dataset
        cp ~/.cache/modelscope/hub/datasets/tmp/test.jsonl /tmp

    - name: Print Log Information
      run: |
        bash scripts/ci/npu/npu_log_print.sh

    - name: Run test
      timeout-minutes: 240
      env:
        # Environment values are always strings; quoting makes that explicit.
        SGLANG_USE_MODELSCOPE: "true"
        SGLANG_IS_IN_CI: "true"
        HF_ENDPOINT: https://hf-mirror.com
        TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
        PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
        STREAMS_PER_DEVICE: "32"
      run: |
        pip install sglang_router
        hf download lmms-lab/MMMU --repo-type dataset
        pip install sentence_transformers torchaudio==2.8.0
        # Requirement specifiers containing `>=` must be quoted. Unquoted, the
        # shell treats `>` as a stdout redirection: the version bound is dropped
        # and pip's output lands in a file named `=<version>` instead of the log.
        pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
        pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter black==24.1.0 isort==5.13.2 "peft>=0.2.0" "accelerate>=0.29.1"
        pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.32 numpy==1.26.4 dotenv
        git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
        cd ./lmms-eval
        # NOTE(review): lmms-eval is installed in the background and the step
        # only waits a fixed 120 s before continuing; the checkout is also put
        # on PYTHONPATH below. Presumably a deliberate best-effort - confirm.
        nohup pip install . > lmmslog.txt 2>&1 &
        sleep 120
        export PYTHONPATH=$PYTHONPATH:$(pwd)
        cd ../
        cd test
        python3 run_suite.py --hw npu --suite nightly-1-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
|
|
|
# Runs the `nightly-2-npu-a3` suite of test/run_suite.py on the
# `linux-aarch64-a3-2` runner as a single partition. Executes only in
# sgl-project/sglang or for pull_request runs.
nightly-2-npu-a3:
  needs: [set-image-config]
  if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
  runs-on: linux-aarch64-a3-2
  strategy:
    fail-fast: false
    matrix:
      part: [0]
  container:
    image: ${{ needs.set-image-config.outputs.image_a3 }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Explicit ref from the caller when given, otherwise the triggering ref.
        ref: ${{ needs.set-image-config.outputs.ref || github.ref }}

    - name: Install dependencies
      env:
        TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu"
        PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple"
        GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/"
      run: |
        # speed up by using infra cache services
        CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
        sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
        pip config set global.index-url http://${CACHING_URL}/pypi/simple
        pip config set global.trusted-host "${CACHING_URL}"

        # The flag is quoted so an empty value yields a valid test expression
        # instead of a `[` syntax error.
        if [ "${{ needs.set-image-config.outputs.skip_install_flag }}" != "true" ]; then
          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
        fi

        # copy required file from our daily cache
        cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
        # copy gsm8k dataset
        cp ~/.cache/modelscope/hub/datasets/tmp/test.jsonl /tmp

    - name: Print Log Information
      run: |
        bash scripts/ci/npu/npu_log_print.sh

    - name: Run test
      timeout-minutes: 240
      env:
        # Environment values are always strings; quoting makes that explicit.
        SGLANG_USE_MODELSCOPE: "true"
        SGLANG_IS_IN_CI: "true"
        HF_ENDPOINT: https://hf-mirror.com
        TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
        PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
        STREAMS_PER_DEVICE: "32"
      run: |
        pip install sglang_router
        hf download lmms-lab/MMMU --repo-type dataset
        pip install sentence_transformers torchaudio==2.8.0
        # Requirement specifiers containing `>=` must be quoted. Unquoted, the
        # shell treats `>` as a stdout redirection: the version bound is dropped
        # and pip's output lands in a file named `=<version>` instead of the log.
        pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
        pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter black==24.1.0 isort==5.13.2 "peft>=0.2.0" "accelerate>=0.29.1"
        pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.32 numpy==1.26.4 dotenv
        git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
        cd ./lmms-eval
        # NOTE(review): lmms-eval is installed in the background and the step
        # only waits a fixed 120 s before continuing; the checkout is also put
        # on PYTHONPATH below. Presumably a deliberate best-effort - confirm.
        nohup pip install . > lmmslog.txt 2>&1 &
        sleep 120
        export PYTHONPATH=$PYTHONPATH:$(pwd)
        cd ../
        cd test
        python3 run_suite.py --hw npu --suite nightly-2-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
|
|
|
|
# Runs the `nightly-4-npu-a3` suite of test/run_suite.py on the
# `linux-aarch64-a3-4` runner as a single partition. Executes only in
# sgl-project/sglang or for pull_request runs.
nightly-4-npu-a3:
  needs: [set-image-config]
  if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
  runs-on: linux-aarch64-a3-4
  strategy:
    fail-fast: false
    matrix:
      part: [0]
  container:
    image: ${{ needs.set-image-config.outputs.image_a3 }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Explicit ref from the caller when given, otherwise the triggering ref.
        ref: ${{ needs.set-image-config.outputs.ref || github.ref }}

    - name: Install dependencies
      env:
        TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu"
        PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple"
        GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/"
      run: |
        # speed up by using infra cache services
        CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
        sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
        pip config set global.index-url http://${CACHING_URL}/pypi/simple
        pip config set global.trusted-host "${CACHING_URL}"

        # The flag is quoted so an empty value yields a valid test expression
        # instead of a `[` syntax error.
        if [ "${{ needs.set-image-config.outputs.skip_install_flag }}" != "true" ]; then
          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
        fi

        # copy required file from our daily cache
        cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
        # copy gsm8k dataset
        cp ~/.cache/modelscope/hub/datasets/tmp/test.jsonl /tmp

    - name: Print Log Information
      run: |
        bash scripts/ci/npu/npu_log_print.sh

    - name: Run test
      timeout-minutes: 240
      env:
        # Environment values are always strings; quoting makes that explicit.
        SGLANG_USE_MODELSCOPE: "true"
        SGLANG_IS_IN_CI: "true"
        HF_ENDPOINT: https://hf-mirror.com
        TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
        PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
        STREAMS_PER_DEVICE: "32"
      run: |
        pip install sglang_router
        hf download lmms-lab/MMMU --repo-type dataset
        pip install sentence_transformers torchaudio==2.8.0
        # Requirement specifiers containing `>=` must be quoted. Unquoted, the
        # shell treats `>` as a stdout redirection: the version bound is dropped
        # and pip's output lands in a file named `=<version>` instead of the log.
        pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
        pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter black==24.1.0 isort==5.13.2 "peft>=0.2.0" "accelerate>=0.29.1"
        pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.32 numpy==1.26.4 dotenv
        git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
        cd ./lmms-eval
        # NOTE(review): lmms-eval is installed in the background and the step
        # only waits a fixed 120 s before continuing; the checkout is also put
        # on PYTHONPATH below. Presumably a deliberate best-effort - confirm.
        nohup pip install . > lmmslog.txt 2>&1 &
        sleep 120
        export PYTHONPATH=$PYTHONPATH:$(pwd)
        cd ../
        cd test
        python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
|
|
|
|
# Runs the `nightly-8-npu-a3` suite of test/run_suite.py on the
# `linux-aarch64-a3-8` runner as a single partition. Executes only in
# sgl-project/sglang or for pull_request runs.
nightly-8-npu-a3:
  needs: [set-image-config]
  if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
  runs-on: linux-aarch64-a3-8
  strategy:
    fail-fast: false
    matrix:
      part: [0]
  container:
    image: ${{ needs.set-image-config.outputs.image_a3 }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Explicit ref from the caller when given, otherwise the triggering ref.
        ref: ${{ needs.set-image-config.outputs.ref || github.ref }}

    - name: Install dependencies
      env:
        TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu"
        PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple"
        GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/"
      run: |
        # speed up by using infra cache services
        CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
        sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
        pip config set global.index-url http://${CACHING_URL}/pypi/simple
        pip config set global.trusted-host "${CACHING_URL}"

        # The flag is quoted so an empty value yields a valid test expression
        # instead of a `[` syntax error.
        if [ "${{ needs.set-image-config.outputs.skip_install_flag }}" != "true" ]; then
          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
        fi

        # copy required file from our daily cache
        cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
        # copy gsm8k dataset
        cp ~/.cache/modelscope/hub/datasets/tmp/test.jsonl /tmp

    - name: Print Log Information
      run: |
        bash scripts/ci/npu/npu_log_print.sh

    - name: Run test
      timeout-minutes: 240
      env:
        # Environment values are always strings; quoting makes that explicit.
        SGLANG_USE_MODELSCOPE: "true"
        SGLANG_IS_IN_CI: "true"
        HF_ENDPOINT: https://hf-mirror.com
        TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
        PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
        STREAMS_PER_DEVICE: "32"
      run: |
        pip install sglang_router
        hf download lmms-lab/MMMU --repo-type dataset
        pip install sentence_transformers torchaudio==2.8.0
        # Requirement specifiers containing `>=` must be quoted. Unquoted, the
        # shell treats `>` as a stdout redirection: the version bound is dropped
        # and pip's output lands in a file named `=<version>` instead of the log.
        pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
        pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter black==24.1.0 isort==5.13.2 "peft>=0.2.0" "accelerate>=0.29.1"
        pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.32 numpy==1.26.4 dotenv
        git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
        cd ./lmms-eval
        # NOTE(review): lmms-eval is installed in the background and the step
        # only waits a fixed 120 s before continuing; the checkout is also put
        # on PYTHONPATH below. Presumably a deliberate best-effort - confirm.
        nohup pip install . > lmmslog.txt 2>&1 &
        sleep 120
        export PYTHONPATH=$PYTHONPATH:$(pwd)
        cd ../
        cd test
        python3 run_suite.py --hw npu --suite nightly-8-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
|
|
|
|
# Runs the `nightly-16-npu-a3` suite of test/run_suite.py on the
# `linux-aarch64-a3-16` runner, split into two auto-partitions (one matrix
# entry per partition). Executes only in sgl-project/sglang or for
# pull_request runs.
nightly-16-npu-a3:
  needs: [set-image-config]
  if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
  runs-on: linux-aarch64-a3-16
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1]
  container:
    image: ${{ needs.set-image-config.outputs.image_a3 }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Explicit ref from the caller when given, otherwise the triggering ref.
        ref: ${{ needs.set-image-config.outputs.ref || github.ref }}

    - name: Install dependencies
      env:
        TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu"
        PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple"
        GITHUB_PROXY_URL: "https://gh-proxy.test.osinfra.cn/"
      run: |
        # speed up by using infra cache services
        CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
        sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
        pip config set global.index-url http://${CACHING_URL}/pypi/simple
        pip config set global.trusted-host "${CACHING_URL}"

        # The flag is quoted so an empty value yields a valid test expression
        # instead of a `[` syntax error.
        if [ "${{ needs.set-image-config.outputs.skip_install_flag }}" != "true" ]; then
          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
        fi

        # copy required file from our daily cache
        cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
        # copy gsm8k dataset
        cp ~/.cache/modelscope/hub/datasets/tmp/test.jsonl /tmp

    - name: Print Log Information
      run: |
        bash scripts/ci/npu/npu_log_print.sh

    - name: Run test
      timeout-minutes: 240
      env:
        # Environment values are always strings; quoting makes that explicit.
        SGLANG_USE_MODELSCOPE: "true"
        SGLANG_IS_IN_CI: "true"
        HF_ENDPOINT: https://hf-mirror.com
        TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
        PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
        STREAMS_PER_DEVICE: "32"
      run: |
        pip install sglang_router
        hf download lmms-lab/MMMU --repo-type dataset
        pip install sentence_transformers torchaudio==2.8.0
        # Requirement specifiers containing `>=` must be quoted. Unquoted, the
        # shell treats `>` as a stdout redirection: the version bound is dropped
        # and pip's output lands in a file named `=<version>` instead of the log.
        pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
        pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter black==24.1.0 isort==5.13.2 "peft>=0.2.0" "accelerate>=0.29.1"
        pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.32 numpy==1.26.4 dotenv
        git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
        cd ./lmms-eval
        # NOTE(review): lmms-eval is installed in the background and the step
        # only waits a fixed 120 s before continuing; the checkout is also put
        # on PYTHONPATH below. Presumably a deliberate best-effort - confirm.
        nohup pip install . > lmmslog.txt 2>&1 &
        sleep 120
        export PYTHONPATH=$PYTHONPATH:$(pwd)
        cd ../
        cd test
        python3 run_suite.py --hw npu --suite nightly-16-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
|
|
|
# Aggregate gate: fails when any upstream job failed or was cancelled.
# `always()` makes it run even after upstream failures; it is limited to the
# sgl-project/sglang repository.
check-all-jobs:
  if: github.repository == 'sgl-project/sglang' && always()
  needs:
    # Listed so that a failure here, which leaves every test job skipped rather
    # than failed, is still reported by this gate.
    - set-image-config
    - nightly-1-npu-a3
    - nightly-2-npu-a3
    - nightly-4-npu-a3
    - nightly-8-npu-a3
    - nightly-16-npu-a3
  runs-on: ubuntu-latest
  container:
    image: docker.m.daocloud.io/ubuntu:22.04
  steps:
    - name: Check if any job failed
      # Only POSIX `[ ... = ... ]` tests are used: `run` steps inside a
      # container default to `sh`, where the bash-only `[[ ... ]]` is not
      # available and the check would never trip.
      run: |
        if [ "${{ contains(needs.*.result, 'failure') }}" = "true" ]; then
          echo "One or more nightly test jobs failed"
          exit 1
        fi
        if [ "${{ contains(needs.*.result, 'cancelled') }}" = "true" ]; then
          echo "One or more nightly test jobs were cancelled"
          exit 1
        fi
        echo "All nightly test jobs passed"
|