# agentic-pd-hybrid/scripts/setup_env.sh

#!/usr/bin/env bash
# Source this file in every shell that will run agentic-pd-hybrid.
#
#   source scripts/setup_env.sh
#
# Why all three variables (CUDA_HOME, PATH, LD_LIBRARY_PATH) are needed:
# - CUDA_HOME / PATH point tvm_ffi (vendor sglang JIT compiler) at cu12.8 nvcc.
#   Without this it falls back to /usr/local/cuda-13.0/bin/nvcc and the
#   resulting .so links libcudart.so.13 which driver 570 (cu12.8 API) rejects
#   with cudaErrorInsufficientDriver.
# - LD_LIBRARY_PATH must expose libcudart.so.12 for mooncake.engine (cu12 wheel)
#   AND ~/cuda-12.8/lib64 for tvm_ffi compile-time linker searches.
#
# See docs/H200_DRIVER570_SETUP_ZH.md for the full rationale.

# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script. Resolved from BASH_SOURCE so it is correct no matter which
# working directory the file is sourced from.
# - CDPATH is cleared for this one `cd`: with a relative script path (e.g.
#   `source scripts/setup_env.sh`) a CDPATH entry could otherwise hijack the
#   lookup, and `cd` would also print the directory it chose, corrupting the
#   captured value.
# - `--` keeps a path that starts with `-` from being parsed as an option.
# If the directory cannot be entered, REPO_ROOT is left empty.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"

# Pre-flight checks: stop early, with a pointer to the fix, when a prerequisite
# is missing. `return` ends a sourced run without closing the caller's shell;
# when the file is executed instead, `return` fails (its error is silenced) and
# `exit` terminates the script.

# 1. The user-local CUDA 12.8 compiler must exist and be executable.
[ -x "$HOME/cuda-12.8/bin/nvcc" ] || {
  printf '%s\n' \
    "ERROR: $HOME/cuda-12.8/bin/nvcc not found." \
    "Install cu12.8 toolkit first (see docs/H200_DRIVER570_SETUP_ZH.md §3)." >&2
  return 1 2>/dev/null || exit 1
}

# 2. The project venv must provide the CUDA 12 runtime library.
[ -f "$REPO_ROOT/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12" ] || {
  printf '%s\n' \
    "ERROR: venv libcudart.so.12 missing. Run 'uv sync' from $REPO_ROOT." >&2
  return 1 2>/dev/null || exit 1
}

# Point the toolchain at the user-local CUDA 12.8 install and expose the CUDA 12
# runtime libraries. Each search-path entry is prepended only when it is not
# already the leading entry, so sourcing this file again in the same shell keeps
# cu12.8 first without growing PATH / LD_LIBRARY_PATH on every run. If something
# else was put in front since the last run, the entry is prepended again so
# precedence is restored.
export CUDA_HOME="$HOME/cuda-12.8"

# The cu12.8 nvcc must be found before any other nvcc on PATH.
case ":$PATH" in
  ":$CUDA_HOME/bin:"* | ":$CUDA_HOME/bin") ;;
  *) PATH="$CUDA_HOME/bin:$PATH" ;;
esac
export PATH

# Search order: the venv's libcudart.so.12 directory first, then the toolkit's
# lib64. A scratch variable holds the pair and is unset afterwards so nothing
# extra is left behind in the sourcing shell.
_apd_ld_prefix="$REPO_ROOT/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:$CUDA_HOME/lib64"
case "${LD_LIBRARY_PATH:-}" in
  "$_apd_ld_prefix" | "$_apd_ld_prefix:"*) ;;
  *) LD_LIBRARY_PATH="$_apd_ld_prefix${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" ;;
esac
export LD_LIBRARY_PATH
unset _apd_ld_prefix

# Timeout, in seconds, for Mooncake's C++ batch_transfer_sync.
# Mooncake's own default is 30 s. On a saturated D scheduler a single LRU
# eviction sweep can take longer than that, which trips the hair-trigger
# blacklist in conn.py:1270 and permanently marks the D's mooncake_session_id
# as "failed". 1800 s (30 min) leaves headroom while genuinely broken peers are
# still detected eventually.
# A non-empty value already present in the environment takes precedence.
# See docs/E1_E2_RESULTS_ZH.md §5c and docs/E1_E2_FIX_DESIGN_ZH.md Q1.C.
: "${MC_TRANSFER_TIMEOUT:=1800}"
export MC_TRANSFER_TIMEOUT

# Print a short summary of the resulting environment. The toolkit version is
# taken from the "release" line of `nvcc --version`, keeping only the text that
# follows "release ".
printf '%s\n' \
  "agentic-pd-hybrid env ready:" \
  "  CUDA_HOME=$CUDA_HOME ($(nvcc --version | sed -n '/release/{s/.*release //;p;}'))" \
  "  libcudart.so.12 at $REPO_ROOT/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib" \
  "  MC_TRANSFER_TIMEOUT=${MC_TRANSFER_TIMEOUT}s"