From 905d671135c73ab08689322c78754019be2dacc9 Mon Sep 17 00:00:00 2001 From: tim Date: Tue, 12 May 2026 11:45:09 +0800 Subject: [PATCH] feat(env): MC_TRANSFER_TIMEOUT=1800s default in setup_env + stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mooncake C++ batch_transfer_sync defaults to 30s timeout; on saturated D scheduler threads doing LRU eviction, that fires as a false positive and the SGLang hair-trigger in conn.py:1270 permanently blacklists the D's mooncake_session_id (E2 forensic in docs/E1_E2_RESULTS_ZH.md §5c). Bump to 1800s in setup_env.sh and mirror to subprocess env in stack.py so SGLang workers get it too. 30-min envelope still detects genuinely broken peers eventually. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/H200_DRIVER570_SETUP_ZH.md | 3 +++ scripts/setup_env.sh | 9 +++++++++ src/agentic_pd_hybrid/stack.py | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/docs/H200_DRIVER570_SETUP_ZH.md b/docs/H200_DRIVER570_SETUP_ZH.md index 35c812f..2fb007f 100644 --- a/docs/H200_DRIVER570_SETUP_ZH.md +++ b/docs/H200_DRIVER570_SETUP_ZH.md @@ -46,8 +46,11 @@ source scripts/setup_env.sh agentic-pd-hybrid env ready: CUDA_HOME=/home//cuda-12.8 (12.8, V12.8.93) libcudart.so.12 at .../.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib + MC_TRANSFER_TIMEOUT=1800s ``` +**`MC_TRANSFER_TIMEOUT=1800` (30 min) 替代 mooncake 默认 30s**——E2 forensic 发现 D 端 LRU eviction 会让 mooncake C++ control plane 被 starved 30+s,触发 `conn.py:1270` hair-trigger 永久 blacklist 整个 D 的 mooncake_session_id。1800s 给足缓冲,30 分钟还没回应才是真正"D 死了"。详见 `docs/E1_E2_RESULTS_ZH.md §5c`。`stack.py` 也对 worker subprocess 设了同名默认值。 + --- ## 2. Smoke test(验证整条链路) diff --git a/scripts/setup_env.sh b/scripts/setup_env.sh index 897b75e..caa0b5d 100755 --- a/scripts/setup_env.sh +++ b/scripts/setup_env.sh @@ -30,6 +30,15 @@ export CUDA_HOME="$HOME/cuda-12.8" export PATH="$HOME/cuda-12.8/bin:$PATH" export LD_LIBRARY_PATH="$REPO_ROOT/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:$HOME/cuda-12.8/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" +# Mooncake batch_transfer_sync C++ timeout (seconds). Default in mooncake is +# 30 s; a single LRU eviction sweep on a saturated D scheduler can exceed +# that and cause the hair-trigger blacklist in conn.py:1270 to permanently +# mark the D's mooncake_session_id "failed". 1800 s = 30 min gives us +# headroom while still detecting genuinely broken peers eventually. +# See docs/E1_E2_RESULTS_ZH.md §5c and docs/E1_E2_FIX_DESIGN_ZH.md Q1.C. +export MC_TRANSFER_TIMEOUT="${MC_TRANSFER_TIMEOUT:-1800}" + echo "agentic-pd-hybrid env ready:" echo " CUDA_HOME=$CUDA_HOME ($(nvcc --version | grep release | sed 's/.*release //'))" echo " libcudart.so.12 at $REPO_ROOT/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/lib" +echo " MC_TRANSFER_TIMEOUT=${MC_TRANSFER_TIMEOUT}s" diff --git a/src/agentic_pd_hybrid/stack.py b/src/agentic_pd_hybrid/stack.py index 33e07db..c6bea65 100644 --- a/src/agentic_pd_hybrid/stack.py +++ b/src/agentic_pd_hybrid/stack.py @@ -201,6 +201,14 @@ def _build_process_env(topology: SingleNodeTopology) -> dict[str, str]: # Default to TCP when RDMA is not forced (e.g. loopback on same node) env.setdefault("MOONCAKE_PROTOCOL", "tcp") + # Mooncake C++ batch_transfer_sync default timeout is 30 s, which can + # fire as a false positive when a saturated D scheduler thread is busy + # with LRU eviction (see docs/E1_E2_RESULTS_ZH.md §5c). Default to 1800 s + # so the hair-trigger blacklist in conn.py:1270 doesn't latch on + # transient stalls. Caller can override via shell env (setup_env.sh). + if topology.transfer_backend == "mooncake": + env.setdefault("MC_TRANSFER_TIMEOUT", "1800") + repo_root = Path(__file__).resolve().parents[2] python_paths = [ str(repo_root / "src"),