feat(kvc): Option D - delegate seed/reseed admission to D worker

v4 (cap=16) saw 35% session-cap fallback because the local soft_cap,
min(16, usable / target), evaluates to 1-2 for large agentic inputs. The cap
was hit not because D was full but because replay's heuristic underestimated
capacity.

This change makes worker admission_mode authoritative for ALL paths:

SGLang side:
- io_struct.py: DirectAppendAdmissionReqInput gains a `mode` field
  ("direct_append" | "seed"; the default "direct_append" preserves prior
  behavior).
- scheduler.py:admit_direct_append: when mode == "seed", skip the
  resident-on-D requirement and run the same capacity check + LRU eviction
  (maybe_trim_decode_session_cache) that direct_append uses. This lets D
  atomically decide whether a new session can be admitted based on the actual
  token_to_kv_pool_allocator state.

Replay side (replay.py):
- _query_decode_direct_admission gains a `mode` parameter.
- _reserve_decode_session_capacity: in worker admission_mode, the seed/reseed
  branch now queries D with mode="seed" and trusts the result, instead of
  estimating capacity from the residency snapshot.
- _should_admit_new_decode_session: in worker mode, skip the local soft_cap
  pre-check and let D decide. The same-D session fast-path is preserved.

Effects:
- The local hardcoded cap of 16 is bypassed under worker mode; D's real KV
  pool size is the only constraint.
- LRU eviction runs in D's process atomically with admission, so starvation
  (the v3 bimodal "lucky vs starved sessions" pattern) should resolve.

scripts/sweep_tp1_v5_optD.sh is added to run the same 1P7D / 2P6D configs as
v4 with the new admission path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:40:03 +08:00
parent 74194e660a
commit 6e5ed8da80
4 changed files with 173 additions and 2 deletions
--- a/scripts/sweep_tp1_v5_optD.sh
+++ b/scripts/sweep_tp1_v5_optD.sh
@@ -0,0 +1,114 @@
 #!/bin/bash
 # TP1 v5 sweep — Option D: D-side admission for seed/reseed.
 #
 # v4 (cap=16) still saw 35% session-cap fallback because the local soft_cap
 # evaluates min(16, usable_capacity_tokens / target_tokens) and target_tokens
 # (= input + output) is 50-100K in agentic workloads, giving cap = 1-2.
 #
 # v5 makes worker admission_mode authoritative for ALL admission decisions
 # (direct_append AND seed/reseed). Replay calls D's
 # /session_cache/admit_direct_append with mode={direct_append|seed} and
 # defers to D's KV pool availability + LRU eviction. Replay's local
 # _decode_session_soft_cap is bypassed entirely under worker mode.
 #
 # Abort on the first failing command, unset variable, or failing pipeline stage.
 set -euo pipefail
 # Run from the parent of this script's directory so the relative paths below
 # resolve regardless of the caller's working directory.
 cd "$(dirname "$0")/.."
 MODEL=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
 TRACE=outputs/qwen35-swebench-50sess.jsonl
 OUTPUT=outputs/qwen3-30b-tp1-v5-optD
 VENV_PYTHON=.venv/bin/python
 # Sweep-wide log file; log() and save_result() append to it.
 RESULTS_FILE="$OUTPUT/sweep_results.txt"
 # Quoted so an output path containing spaces or glob characters stays one argument.
 mkdir -p "$OUTPUT"
 # Print a timestamped message to stdout and append the same line to
 # $RESULTS_FILE.
 # Arguments: the message words; they are joined with spaces via "$*".
 log() {
   # $RESULTS_FILE is quoted so a path with spaces is not split into several
   # tee targets.
   echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$RESULTS_FILE"
 }
 # Record the outcome of one experiment run.
 # Arguments:
 #   $1 - label used in log lines and as the prefix of the copied file names
 #   $2 - run directory expected to hold request-metrics.jsonl and
 #        request-metrics.jsonl.summary.json (may be empty when no run
 #        directory was found)
 # When the summary file exists, its contents are appended to $RESULTS_FILE and
 # both the summary and the raw metrics are copied into $OUTPUT; otherwise a
 # warning is logged.
 save_result() {
   local label=$1
   # Default to empty so a missing argument does not trip `set -u`.
   local run_dir=${2:-}
   local summary="$run_dir/request-metrics.jsonl.summary.json"
   local metrics="$run_dir/request-metrics.jsonl"
   log "=== $label COMPLETED ==="
   # The -n guard stops an empty run_dir from probing "/request-metrics..." at
   # the filesystem root.
   if [ -n "$run_dir" ] && [ -f "$summary" ]; then
     log "Summary:"
     # Redirection targets are quoted: an unquoted path containing spaces is an
     # "ambiguous redirect" error in bash.
     cat "$summary" >> "$RESULTS_FILE"
     echo "" >> "$RESULTS_FILE"
     cp "$summary" "$OUTPUT/${label}_summary.json"
     cp "$metrics" "$OUTPUT/${label}_metrics.jsonl"
     log "Saved to $OUTPUT/${label}_summary.json + ${label}_metrics.jsonl"
   else
     log "WARNING: No summary file found in $run_dir"
   fi
 }
 log "Starting TP1 v5 sweep (Option D: D-side seed admission)"
 log "Model: $MODEL"
 log "Trace: $TRACE (4449 requests, 52 sessions)"
 log "Key change: worker admission_mode now drives seed/reseed via D's admit endpoint"
 ########################################
 # Experiment 1: 1P + 7D KVC kv-aware Option D
 ########################################
 log ""
 log "=== [EXP1] 1P7D KVC kv-aware Option D ==="
 # One prefill worker on GPU 0 and seven decode workers on GPUs 1-7, with
 # worker-side admission enabled (--kvcache-admission-mode worker).
 PYTHONPATH=src:third_party/sglang/python \
 "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
   --trace "$TRACE" \
   --output-root "$OUTPUT" \
   --mechanism kvcache-centric \
   --policy kv-aware \
   --model-path "$MODEL" \
   --prefill-workers 1 --decode-workers 7 \
   --prefill-tp-size 1 --decode-tp-size 1 \
   --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3,4,5,6,7 \
   --transfer-backend mooncake \
   --gpu-budget 8 \
   --time-scale 10 \
   --session-sample-rate 1.0 \
   --target-duration-s 100000 \
   --concurrency-limit 32 \
   --timeout-s 900 \
   --request-timeout-s 300 \
   --kvcache-admission-mode worker \
   --kvcache-seed-min-turn-id 1 \
   --kvcache-seed-max-inflight-decode -1 \
   --kvcache-prefill-backup-policy release-after-transfer \
   --kvcache-prefill-priority-eviction
 # Pick the most recently modified run directory under $OUTPUT. `|| true` keeps
 # `set -e` + pipefail from silently aborting the sweep when nothing matches
 # (or when ls receives SIGPIPE from head); save_result then logs its warning.
 EXP1_DIR=$(ls -td "$OUTPUT"/kvcache-centric-*/ 2>/dev/null | head -1 || true)
 save_result "exp1_1p7d_kvc_optD" "$EXP1_DIR"
 ########################################
 # Experiment 2: 2P + 6D KVC kv-aware Option D
 ########################################
 log ""
 log "=== [EXP2] 2P6D KVC kv-aware Option D ==="
 # Two prefill workers on GPUs 0-1 and six decode workers on GPUs 2-7, with
 # worker-side admission enabled (--kvcache-admission-mode worker).
 PYTHONPATH=src:third_party/sglang/python \
 "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
   --trace "$TRACE" \
   --output-root "$OUTPUT" \
   --mechanism kvcache-centric \
   --policy kv-aware \
   --model-path "$MODEL" \
   --prefill-workers 2 --decode-workers 6 \
   --prefill-tp-size 1 --decode-tp-size 1 \
   --prefill-gpu-ids 0,1 --decode-gpu-ids 2,3,4,5,6,7 \
   --transfer-backend mooncake \
   --gpu-budget 8 \
   --time-scale 10 \
   --session-sample-rate 1.0 \
   --target-duration-s 100000 \
   --concurrency-limit 32 \
   --timeout-s 900 \
   --request-timeout-s 300 \
   --kvcache-admission-mode worker \
   --kvcache-seed-min-turn-id 1 \
   --kvcache-seed-max-inflight-decode -1 \
   --kvcache-prefill-backup-policy release-after-transfer \
   --kvcache-prefill-priority-eviction
 # Pick the most recently modified run directory under $OUTPUT. `|| true` keeps
 # `set -e` + pipefail from silently aborting the sweep when nothing matches
 # (or when ls receives SIGPIPE from head); save_result then logs its warning.
 EXP2_DIR=$(ls -td "$OUTPUT"/kvcache-centric-*/ 2>/dev/null | head -1 || true)
 save_result "exp2_2p6d_kvc_optD" "$EXP2_DIR"
 log ""
 log "=== ALL TP1 V5 SWEEP EXPERIMENTS DONE ==="
--- a/src/agentic_pd_hybrid/replay.py
+++ b/src/agentic_pd_hybrid/replay.py
@@ -651,6 +651,7 @@ async def _query_decode_direct_admission(
    session_id: str,
    uncached_input_tokens: int,
    output_tokens: int,
    mode: str = "direct_append",
 ) -> dict[str, Any]:
    try:
        response = await client.post(
@@ -659,6 +660,7 @@ async def _query_decode_direct_admission(
                "session_id": session_id,
                "uncached_input_tokens": max(0, uncached_input_tokens),
                "output_tokens": max(0, output_tokens),
                "mode": mode,
            },
            timeout=_ADMISSION_PROBE_TIMEOUT_S,
        )
@@ -913,6 +915,7 @@ def _should_admit_new_decode_session(
    session: DirectSessionState,
    direct_sessions: dict[str, DirectSessionState],
    treat_as_fresh_session: bool,
    admission_mode: KvCacheAdmissionMode = "router",
 ) -> bool:
    if (
        not treat_as_fresh_session
@@ -920,6 +923,11 @@ def _should_admit_new_decode_session(
        and session.server_url == server_url
    ):
        return True
    if admission_mode == "worker":
        # Defer the capacity decision to D's admit_direct_append (mode=seed),
        # which checks real KV pool availability and runs LRU eviction. The
        # local soft cap is router-mode only.
        return True
    open_sessions = sum(
        1
        for candidate in direct_sessions.values()
@@ -1331,6 +1339,7 @@ async def _reserve_decode_session_capacity(
            session_id=session.session_id,
            uncached_input_tokens=max(0, request.input_length - current_tokens),
            output_tokens=request.output_length,
            mode="direct_append",
        )
        if not bool(admission.get("resident")):
            return False, 0, 0, 0, str(admission.get("reason") or "d-session-not-resident")
@@ -1355,6 +1364,41 @@ async def _reserve_decode_session_capacity(
            None,
        )
    # Seed / reseed path: ask D itself via the seed-mode admission endpoint
    # instead of estimating capacity from a stale router-state snapshot. D
    # will run LRU eviction internally to make room. Falls through to the
    # legacy router-state logic below if the endpoint is unavailable.
    seed_admission = await _query_decode_direct_admission(
        client=client,
        server_url=server_url,
        session_id=session.session_id,
        uncached_input_tokens=max(0, request.input_length - current_tokens),
        output_tokens=request.output_length,
        mode="seed",
    )
    seed_reason = seed_admission.get("reason")
    if seed_reason != "admission-query-failed":
        if not bool(seed_admission.get("can_admit")):
            return (
                False,
                0,
                int(seed_admission.get("evicted_session_count", 0) or 0),
                0,
                str(seed_reason or "d-no-space"),
            )
        reserved_tokens = int(
            seed_admission.get("required_tokens", required_extra_tokens)
            or required_extra_tokens
        )
        _add_reserved_tokens(residency, server_url, reserved_tokens)
        return (
            True,
            reserved_tokens,
            int(seed_admission.get("evicted_session_count", 0) or 0),
            0,
            None,
        )
    session_cache, max_total_num_tokens, reserved_decode_tokens = (
        await _fetch_decode_server_state(
            client=client,
@@ -1906,6 +1950,7 @@ async def _execute_request(
                    session=decode_session,
                    direct_sessions=direct_sessions,
                    treat_as_fresh_session=True,
                    admission_mode=config.kvcache_admission_mode,
                )
                if not admit_new_decode_session:
                    can_seed = False
@@ -2060,6 +2105,7 @@ async def _execute_request(
                    session=decode_session,
                    direct_sessions=direct_sessions,
                    treat_as_fresh_session=True,
                    admission_mode=config.kvcache_admission_mode,
                )
                if not admit_new_decode_session:
                    can_seed = False
--- a/third_party/sglang/python/sglang/srt/managers/io_struct.py
+++ b/third_party/sglang/python/sglang/srt/managers/io_struct.py
@@ -1602,6 +1602,9 @@ class DirectAppendAdmissionReqInput(BaseReq):
    session_id: str
    uncached_input_tokens: int
    output_tokens: int
    # "direct_append": existing behavior — require session resident on this D
    # "seed": new admission for session not yet resident; do capacity check + LRU eviction
    mode: str = "direct_append"
@dataclass
--- a/third_party/sglang/python/sglang/srt/managers/scheduler.py
+++ b/third_party/sglang/python/sglang/srt/managers/scheduler.py
@@ -3508,6 +3508,9 @@ class Scheduler(
                reason="unsupported",
            )
        mode = getattr(recv_req, "mode", "direct_append") or "direct_append"
        is_seed = mode == "seed"
        session_cache_status = self.session_controller.get_streaming_session_cache_status(
            recv_req.session_id
        )
@@ -3515,7 +3518,9 @@ class Scheduler(
        resident = bool(
            isinstance(target_session, dict) and target_session.get("resident")
        )
-        if not resident:
+        if not resident and not is_seed:
            # direct_append requires the session already resident on this D.
            # For seed we skip this check and let capacity decide.
            return DirectAppendAdmissionReqOutput(
                can_admit=False,
                resident=False,
@@ -3543,10 +3548,13 @@ class Scheduler(
            0, recv_req.output_tokens
        )
        available_tokens_before = int(self.token_to_kv_pool_allocator.available_size())
        # Don't evict the session itself when it's already resident; for seed
        # of a fresh session there is nothing to exclude.
        exclude_ids = {recv_req.session_id} if resident else set()
        trim_result = self.maybe_trim_decode_session_cache(
            required_tokens=required_tokens,
            force=available_tokens_before < required_tokens,
-            exclude_session_ids={recv_req.session_id},
+            exclude_session_ids=exclude_ids,
        )
        available_tokens_after = int(self.token_to_kv_pool_allocator.available_size())
        decode_retracted_queue_reqs = len(self.disagg_decode_prealloc_queue.retracted_queue)