v4 (cap=16) saw 35% session-cap fallback because the local soft_cap,
min(16, usable / target), evaluates to 1-2 for large agentic inputs.
The cap was hit not because D was full but because replay's heuristic
underestimated capacity.
This change makes worker admission_mode authoritative for ALL paths:
SGLang side:
- io_struct.py: DirectAppendAdmissionReqInput gains a `mode` field
("direct_append" | "seed", default "direct_append" preserves prior
behavior).
- scheduler.py:admit_direct_append: when mode == "seed", skip the
resident-on-D requirement and run the same capacity check + LRU
eviction (maybe_trim_decode_session_cache) that direct_append uses.
This lets D atomically decide if a new session can be admitted based
on actual token_to_kv_pool_allocator state.
Replay side (replay.py):
- _query_decode_direct_admission gains a `mode` parameter.
- _reserve_decode_session_capacity: in worker admission_mode, the
seed/reseed branch now queries D with mode="seed" and trusts the
result, instead of estimating capacity from the residency snapshot.
- _should_admit_new_decode_session: in worker mode, skip the local
soft_cap pre-check and let D decide. Same-D session fast-path is
preserved.
Effects:
- Local hardcoded cap of 16 is bypassed under worker mode; D's real
KV pool size is the only constraint.
- LRU eviction runs in D's process atomically with admission, so
starvation (the v3 bimodal "lucky vs starved sessions" pattern)
should resolve.
scripts/sweep_tp1_v5_optD.sh added to run the same 1P7D / 2P6D
configs as v4 with the new admission path.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
115 lines
3.9 KiB
Bash
Executable File
115 lines
3.9 KiB
Bash
Executable File
#!/bin/bash
# TP1 v5 sweep — Option D: D-side admission for seed/reseed.
#
# v4 (cap=16) still saw 35% session-cap fallback because the local soft_cap
# evaluates min(16, usable_capacity_tokens / target_tokens) and target_tokens
# (= input + output) is 50-100K in agentic workloads, giving cap = 1-2.
#
# v5 makes worker admission_mode authoritative for ALL admission decisions
# (direct_append AND seed/reseed). Replay calls D's
# /session_cache/admit_direct_append with mode={direct_append|seed} and
# defers to D's KV pool availability + LRU eviction. Replay's local
# _decode_session_soft_cap is bypassed entirely under worker mode.

# Strict mode: abort on any failing command, unset variable, or failing
# pipeline stage, so a broken experiment stops the sweep instead of being
# silently recorded.
set -euo pipefail

# All relative paths below (outputs/, .venv/, src/, third_party/) are resolved
# from the parent of the directory that holds this script.
cd "$(dirname "$0")/.."

# Model checkpoint passed to the benchmark via --model-path.
MODEL=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
# Request trace replayed by every experiment.
TRACE=outputs/qwen35-swebench-50sess.jsonl
# Root directory for run directories and the collected per-experiment results.
OUTPUT=outputs/qwen3-30b-tp1-v5-optD
# Interpreter used to launch the benchmark CLI.
VENV_PYTHON=.venv/bin/python
# Cumulative log that log() and save_result() append to.
RESULTS_FILE="$OUTPUT/sweep_results.txt"

# Quoted so an output root containing spaces is created as a single directory.
mkdir -p "$OUTPUT"
|
|
|
|
#######################################
# Print a timestamped message and append the same line to the results log.
# Globals:   RESULTS_FILE (read) - file the line is appended to
# Arguments: $* - message words, joined into one string
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in RESULTS_FILE
#######################################
log() {
  # RESULTS_FILE is quoted so a path with spaces or glob characters stays a
  # single tee operand instead of being split into several target files.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$RESULTS_FILE"
}
|
|
|
|
#######################################
# Record the outcome of one experiment: append its summary JSON to the results
# log and copy the summary and per-request metrics next to the other results.
# Globals:   RESULTS_FILE (appended to), OUTPUT (copy destination)
# Arguments: $1 - label used in log lines and as the copied files' prefix
#            $2 - run directory expected to hold request-metrics.jsonl and
#                 request-metrics.jsonl.summary.json
# Outputs:   progress lines via log(); a warning when the summary is missing
#######################################
save_result() {
  local label=$1
  local run_dir=$2
  local summary_src="$run_dir/request-metrics.jsonl.summary.json"
  local metrics_src="$run_dir/request-metrics.jsonl"

  log "=== $label COMPLETED ==="
  if [[ -f "$summary_src" ]]; then
    log "Summary:"
    # Redirection targets are quoted: an unquoted path containing spaces makes
    # bash fail with "ambiguous redirect".
    cat "$summary_src" >> "$RESULTS_FILE"
    # Blank line so the next log entry does not run into the JSON.
    echo "" >> "$RESULTS_FILE"
    cp "$summary_src" "$OUTPUT/${label}_summary.json"
    cp "$metrics_src" "$OUTPUT/${label}_metrics.jsonl"
    log "Saved to $OUTPUT/${label}_summary.json + ${label}_metrics.jsonl"
  else
    log "WARNING: No summary file found in $run_dir"
  fi
}
|
|
|
|
# Start-of-sweep banner: one timestamped log entry per line below.
# NOTE(review): the request/session counts are hard-coded text, not derived
# from $TRACE — confirm they still match the trace file in use.
for banner_line in \
  "Starting TP1 v5 sweep (Option D: D-side seed admission)" \
  "Model: $MODEL" \
  "Trace: $TRACE (4449 requests, 52 sessions)" \
  "Key change: worker admission_mode now drives seed/reseed via D's admit endpoint"; do
  log "$banner_line"
done
|
|
|
|
#######################################
# Print the most recently modified kvcache-centric-* directory under OUTPUT.
# Prints an empty line when no such directory exists, so the caller can still
# reach save_result's "No summary file found" warning. The previous
# `ls -td ... | head -1` failed under `set -o pipefail` in that case and
# aborted the script.
# Globals:   OUTPUT (read)
# Arguments: none
# Outputs:   the directory path (with trailing slash) or an empty line
#######################################
latest_run_dir() {
  local newest="" candidate
  for candidate in "$OUTPUT"/kvcache-centric-*/; do
    # An unmatched glob is left as the literal pattern; skip non-directories.
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s\n' "$newest"
}

#######################################
# Run one kvcache-centric / kv-aware benchmark-live experiment. Every flag
# other than the worker counts and GPU id lists is shared by all experiments
# in this sweep.
# Globals:   VENV_PYTHON, TRACE, OUTPUT, MODEL (read)
# Arguments: $1 - number of prefill workers
#            $2 - number of decode workers
#            $3 - comma-separated prefill GPU ids
#            $4 - comma-separated decode GPU ids
# Returns:   the benchmark's exit status (non-zero aborts the sweep under
#            `set -e`, as before)
#######################################
run_kvc_experiment() {
  local prefill_workers=$1
  local decode_workers=$2
  local prefill_gpu_ids=$3
  local decode_gpu_ids=$4

  PYTHONPATH=src:third_party/sglang/python \
    "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
    --trace "$TRACE" \
    --output-root "$OUTPUT" \
    --mechanism kvcache-centric \
    --policy kv-aware \
    --model-path "$MODEL" \
    --prefill-workers "$prefill_workers" --decode-workers "$decode_workers" \
    --prefill-tp-size 1 --decode-tp-size 1 \
    --prefill-gpu-ids "$prefill_gpu_ids" --decode-gpu-ids "$decode_gpu_ids" \
    --transfer-backend mooncake \
    --gpu-budget 8 \
    --time-scale 10 \
    --session-sample-rate 1.0 \
    --target-duration-s 100000 \
    --concurrency-limit 32 \
    --timeout-s 900 \
    --request-timeout-s 300 \
    --kvcache-admission-mode worker \
    --kvcache-seed-min-turn-id 1 \
    --kvcache-seed-max-inflight-decode -1 \
    --kvcache-prefill-backup-policy release-after-transfer \
    --kvcache-prefill-priority-eviction
}

########################################
# Experiment 1: 1P + 7D KVC kv-aware Option D
########################################
log ""
log "=== [EXP1] 1P7D KVC kv-aware Option D ==="
run_kvc_experiment 1 7 0 1,2,3,4,5,6,7

# NOTE(review): the run directory is identified only by being the newest
# kvcache-centric-* entry under $OUTPUT — presumably the one the benchmark
# just created; confirm nothing else writes there concurrently.
EXP1_DIR=$(latest_run_dir)
save_result "exp1_1p7d_kvc_optD" "$EXP1_DIR"

########################################
# Experiment 2: 2P + 6D KVC kv-aware Option D
########################################
log ""
log "=== [EXP2] 2P6D KVC kv-aware Option D ==="
run_kvc_experiment 2 6 0,1 2,3,4,5,6,7

EXP2_DIR=$(latest_run_dir)
save_result "exp2_2p6d_kvc_optD" "$EXP2_DIR"
|
|
|
|
# Closing entries: an empty separator line, then the completion marker.
for closing_line in "" "=== ALL TP1 V5 SWEEP EXPERIMENTS DONE ==="; do
  log "$closing_line"
done
|