Document the iterative debugging from v1 (broken KVC) through v4
(routing fixed + session cap raised), with code-level analysis of
the two main bugs encountered:
1. v2 root cause (mis-diagnosed previously as `allow_local_prefill`):
`--policy default` for KVC mechanism caused replay's round-robin
policy and the PD router's round-robin to diverge, sending requests
with `session_params` to a D worker that did not have the session
open. This resulted in 56-61% truncation with finish_reason
"session id X does not exist".
Fix: use `--policy kv-aware` (sweep_tp1_v3_kvaware.sh) so replay
emits `x-smg-target-worker` and PD router uses consistent_hashing.
2. v3 new bottleneck: `pd-router-fallback-large-append-session-cap`
dominated 52-65% of requests. Root cause was hardcoded
`min(4, ...)` in `_decode_session_soft_cap`. With 7 D workers x 4
sessions = 28 slots for 52 trace sessions, ~24 sessions starved
permanently (bimodal direct-to-D rate of 0% or 99%).
Fix: raise the cap to 16 (replay.py).
Also includes the v3 finding that the direct-to-d-session path
(P50=0.495s, TTFT P50=0.043s) already beats the 8-way DP baseline
(0.65s/0.093s), i.e. the KVC core mechanism works when fallback paths
are avoided.
Files:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: full journey + code location index
- docs/SWEBENCH_EXPERIMENT_{PROGRESS,RESULTS}.md: prior session notes
- scripts/sweep_tp1_v{2,3,4}*.sh: experiment driver scripts
- src/agentic_pd_hybrid/replay.py: cap 4 -> 16, audit fields
- src/agentic_pd_hybrid/pd_router.py: strip session_params from prefill
- src/agentic_pd_hybrid/metrics.py: truncated_request_count
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
132 lines
4.2 KiB
Bash
Executable File
132 lines
4.2 KiB
Bash
Executable File
#!/bin/bash
#
# TP1 configuration sweep v2 — after session_params fix + audit fields
# Qwen3-30B-A3B TP=1, single GPU per worker
#
# Runs three benchmark-live experiments back to back and collects their
# summaries. All relative paths are resolved against the parent of this
# script's directory; every artifact is written under $OUTPUT, and
# $RESULTS_FILE is the running, timestamped log of the sweep.
set -euo pipefail

# Work from the directory above the one that contains this script.
cd "$(dirname "$0")/.."

# Model checkpoint passed to --model-path (absolute, machine-specific path).
MODEL=/mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
# Request trace passed to --trace.
TRACE=outputs/qwen35-swebench-50sess.jsonl
# Root directory for run directories and the copied per-experiment results.
OUTPUT=outputs/qwen3-30b-tp1-v2-fixed
# Interpreter used to launch the benchmark CLI.
VENV_PYTHON=.venv/bin/python
# Log file that log() and save_result() append to.
RESULTS_FILE="$OUTPUT/sweep_results.txt"

mkdir -p "$OUTPUT"

#######################################
# Print a timestamped message and append the same line to the results log.
# Globals:   RESULTS_FILE (read) - file the line is appended to
# Arguments: $* - message words; joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in RESULTS_FILE
#######################################
log() {
  # RESULTS_FILE is quoted so a path containing whitespace stays one file
  # instead of being split into several tee targets.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$RESULTS_FILE"
}

#######################################
# Record the outcome of one experiment: log a completion marker, append the
# run's summary JSON to the results log, and copy the summary and the
# per-request metrics into $OUTPUT under the experiment label.
# Globals:   OUTPUT (read)       - destination directory for the copies
#            RESULTS_FILE (read) - log file the summary is appended to
# Arguments: $1 - label used in log lines and as the copied files' prefix
#            $2 - run directory to read from; may be empty when none was found
# Outputs:   log lines via log(); a WARNING line when no summary is available
# Returns:   status of the last command; a failing cp propagates to the caller
#######################################
save_result() {
  local label=$1
  local run_dir=$2
  local summary_json="$run_dir/request-metrics.jsonl.summary.json"
  local metrics_jsonl="$run_dir/request-metrics.jsonl"

  log "=== $label COMPLETED ==="

  # An empty run_dir would otherwise probe a path at the filesystem root.
  if [[ -n "$run_dir" && -f "$summary_json" ]]; then
    log "Summary:"
    cat "$summary_json" >> "$RESULTS_FILE"
    # The summary may lack a trailing newline; keep the next log line separate.
    echo "" >> "$RESULTS_FILE"
    cp "$summary_json" "$OUTPUT/${label}_summary.json"
    cp "$metrics_jsonl" "$OUTPUT/${label}_metrics.jsonl"
    log "Saved to $OUTPUT/${label}_summary.json + ${label}_metrics.jsonl"
  else
    log "WARNING: No summary file found in $run_dir"
  fi
}

# Open the results log with the sweep name and the inputs it runs against.
for banner_line in \
  "Starting TP1 v2 sweep (session_params fix + audit fields)" \
  "Model: $MODEL" \
  "Trace: $TRACE (4449 requests, 52 sessions)"; do
  log "$banner_line"
done

########################################
# Experiment 1: 8-way DP cache-aware
#
# Baseline run: mechanism pd-colo with policy kv-aware, 8 direct workers at
# TP size 1 on GPU ids 0-7, and zero prefill/decode workers.
########################################
log ""
log "=== [EXP1] 8-way DP cache-aware (8 direct × TP1) ==="
PYTHONPATH=src:third_party/sglang/python \
  "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
  --trace "$TRACE" \
  --output-root "$OUTPUT" \
  --mechanism pd-colo \
  --policy kv-aware \
  --model-path "$MODEL" \
  --prefill-workers 0 --decode-workers 0 \
  --direct-workers 8 --direct-tp-size 1 \
  --direct-gpu-ids 0,1,2,3,4,5,6,7 \
  --gpu-budget 8 \
  --time-scale 10 \
  --session-sample-rate 1.0 \
  --target-duration-s 100000 \
  --concurrency-limit 32 \
  --timeout-s 900 \
  --request-timeout-s 300

# Newest run directory matching this mechanism/policy (presumably created by
# benchmark-live under --output-root; verify the naming against the CLI).
# `|| true` matters: with `set -e` + `pipefail`, a glob that matches nothing
# makes `ls` fail and would abort the whole sweep with no message. Leaving
# EXP1_DIR empty instead lets save_result log its WARNING.
# shellcheck disable=SC2012  # ls is used deliberately for mtime ordering
EXP1_DIR=$(ls -td "$OUTPUT"/pd-colo-kv-aware-*/ 2>/dev/null | head -1 || true)
save_result "exp1_8way_dp_cache_aware" "$EXP1_DIR"

########################################
# Experiment 2: 1P + 7D KVC (aggressive)
#
# Mechanism kvcache-centric with 1 prefill worker (GPU id 0) and 7 decode
# workers (GPU ids 1-7), all at TP size 1, using the mooncake transfer
# backend. The log label describes the seed settings as
# "inflight=off, min-turn=1" (--kvcache-seed-max-inflight-decode -1,
# --kvcache-seed-min-turn-id 1).
#
# NOTE(review): the change description shipped with this script attributes
# v2's truncation ("session id X does not exist") to `--policy default` with
# this mechanism, and moves to `--policy kv-aware` in the v3 script. The v2
# value is kept here unchanged; confirm it is retained for reproduction only.
########################################
log ""
log "=== [EXP2] 1P7D KVC (inflight=off, min-turn=1) ==="
PYTHONPATH=src:third_party/sglang/python \
  "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
  --trace "$TRACE" \
  --output-root "$OUTPUT" \
  --mechanism kvcache-centric \
  --policy default \
  --model-path "$MODEL" \
  --prefill-workers 1 --decode-workers 7 \
  --prefill-tp-size 1 --decode-tp-size 1 \
  --prefill-gpu-ids 0 --decode-gpu-ids 1,2,3,4,5,6,7 \
  --transfer-backend mooncake \
  --gpu-budget 8 \
  --time-scale 10 \
  --session-sample-rate 1.0 \
  --target-duration-s 100000 \
  --concurrency-limit 32 \
  --timeout-s 900 \
  --request-timeout-s 300 \
  --kvcache-admission-mode worker \
  --kvcache-seed-min-turn-id 1 \
  --kvcache-seed-max-inflight-decode -1 \
  --kvcache-prefill-backup-policy release-after-transfer \
  --kvcache-prefill-priority-eviction

# Newest kvcache-centric run directory. `|| true` keeps `set -e` + `pipefail`
# from silently aborting the sweep when nothing matches; an empty EXP2_DIR
# makes save_result log its WARNING instead.
# shellcheck disable=SC2012  # ls is used deliberately for mtime ordering
EXP2_DIR=$(ls -td "$OUTPUT"/kvcache-centric-*/ 2>/dev/null | head -1 || true)
save_result "exp2_1p7d_kvc_aggressive" "$EXP2_DIR"

########################################
# Experiment 3: 2P + 6D KVC (aggressive)
#
# Same flags as the 1P + 7D run except for the worker split: 2 prefill
# workers (GPU ids 0,1) and 6 decode workers (GPU ids 2-7), all at TP size 1.
########################################
log ""
log "=== [EXP3] 2P6D KVC (inflight=off, min-turn=1) ==="
PYTHONPATH=src:third_party/sglang/python \
  "$VENV_PYTHON" -m agentic_pd_hybrid.cli benchmark-live \
  --trace "$TRACE" \
  --output-root "$OUTPUT" \
  --mechanism kvcache-centric \
  --policy default \
  --model-path "$MODEL" \
  --prefill-workers 2 --decode-workers 6 \
  --prefill-tp-size 1 --decode-tp-size 1 \
  --prefill-gpu-ids 0,1 --decode-gpu-ids 2,3,4,5,6,7 \
  --transfer-backend mooncake \
  --gpu-budget 8 \
  --time-scale 10 \
  --session-sample-rate 1.0 \
  --target-duration-s 100000 \
  --concurrency-limit 32 \
  --timeout-s 900 \
  --request-timeout-s 300 \
  --kvcache-admission-mode worker \
  --kvcache-seed-min-turn-id 1 \
  --kvcache-seed-max-inflight-decode -1 \
  --kvcache-prefill-backup-policy release-after-transfer \
  --kvcache-prefill-priority-eviction

# Newest kvcache-centric run directory. `|| true` keeps `set -e` + `pipefail`
# from silently aborting the sweep when nothing matches; an empty EXP3_DIR
# makes save_result log its WARNING instead.
# NOTE(review): this glob also matches the 1P + 7D run's directory, so the
# selection relies on this run's directory being the most recently modified.
# shellcheck disable=SC2012  # ls is used deliberately for mtime ordering
EXP3_DIR=$(ls -td "$OUTPUT"/kvcache-centric-*/ 2>/dev/null | head -1 || true)
save_result "exp3_2p6d_kvc_aggressive" "$EXP3_DIR"

########################################
# Close the results log: a blank separator entry, then the completion marker.
for closing_line in "" "=== ALL TP1 V2 SWEEP EXPERIMENTS DONE ==="; do
  log "$closing_line"
done