# CI Failure Monitor
#
# Runs scripts/ci_monitor/ci_failures_analysis.py on a schedule (or on manual
# dispatch), uploads the resulting JSON report as a build artifact, and then
# posts the newest report to Slack via post_ci_failures_to_slack.py when a
# Slack token is configured.
name: CI Failure Monitor

on:
  schedule:
    - cron: '0 */12 * * *'  # Every 12 hours
  workflow_dispatch:

# Only one monitor run per ref at a time; a newer run cancels the older one.
concurrency:
  group: ci-failure-monitor-${{ github.ref }}
  cancel-in-progress: true

# Read-only access for the default workflow token.
permissions:
  contents: read
  actions: read

jobs:
  failure-analysis:
    # Restricts the job to the upstream repository.
    # NOTE(review): this workflow declares only `schedule` and
    # `workflow_dispatch` triggers, so the `pull_request` clause cannot match
    # as written -- confirm whether it is intentional before removing it.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.14'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests slack_sdk

      - name: Run Failure Analysis
        env:
          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
          GH_PAT_FOR_RUNNER_ADMIN: ${{ secrets.GH_PAT_FOR_RUNNER_ADMIN }}
          PYTHONUNBUFFERED: '1'
          PYTHONIOENCODING: utf-8
        run: |
          cd scripts/ci_monitor
          # Quote the token so the shell never word-splits or globs it.
          python ci_failures_analysis.py \
            --token "$GITHUB_TOKEN" \
            --limit 100 \
            --output ci_failure_analysis_$(date +%Y%m%d_%H%M%S).json

      - name: Upload Analysis Results
        uses: actions/upload-artifact@v4
        with:
          name: ci-failure-analysis-${{ github.run_number }}
          path: |
            scripts/ci_monitor/ci_failure_analysis_*.json
          retention-days: 7

      - name: Send Slack Notification
        # Runs even when an earlier step failed, so the report-exists check
        # below has to tolerate a missing report.
        if: always()
        env:
          SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
        run: |
          cd scripts/ci_monitor
          # Newest report by mtime. When no report exists, `ls` fails; silence
          # its error and ignore the status so the explicit check below
          # decides what happens, regardless of the shell's pipefail setting.
          LATEST_REPORT=$(ls -t ci_failure_analysis_*.json 2>/dev/null | head -1 || true)
          if [ ! -f "$LATEST_REPORT" ]; then
            echo "No report found, so skipping Slack notification"
            exit 0
          fi
          if [ -n "$SGLANG_DIFFUSION_SLACK_TOKEN" ]; then
            python3 post_ci_failures_to_slack.py --report-file "$LATEST_REPORT"
          else
            echo "SGLANG_DIFFUSION_SLACK_TOKEN not configured, skipping notification"
          fi