Add v4 sweep results and post-mortem analysis showing:
- direct-to-D path: 54.3% (1P7D) / 58.0% (2P6D) of non-error requests now use
KVC cleanly. Latency P50=0.5s and TTFT P50=0.043s; this path beats baseline
8DP across the board (latency P50 -24%, TTFT P50 -54%, TTFT P90 -79%).
- Overall vs baseline (errors+truncated excluded):
v4 2P6D P50=0.85s vs baseline 0.66s (28% slower).
The slowdown is not caused by errors -- 35% of requests still hit
pd-router-fallback-large-append-session-cap, where the capacity-based
cap (usable_tokens / target_tokens) evaluates to 1-2 rather than the
upper bound of 16 for large agentic inputs.
- 9-10% errors on KVC variants are mooncake TCP transfer timeouts,
not SGLang logic bugs. Prefill log shows
"Failed to send kv chunk ... 32s timeout ... session not alive".
Errors concentrate in turns >= 31 (large inputs), once the run is more than 44.8% complete.
Track:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: append v4 results table,
per-mode breakdown, and error root cause.
- scripts/analysis/{analyze_v3,analyze_v4,analyze_errors,compare_no_error}.py
- outputs/qwen3-30b-tp1-v{3,4}*/exp*_summary.json (force-added,
small JSON; metrics.jsonl excluded due to size).
- outputs/qwen3-30b-tp1-v{3,4}*/sweep_results.txt
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
191 lines
5.7 KiB
Plaintext
191 lines
5.7 KiB
Plaintext
[2026-04-28 20:50:21] Starting TP1 v4 sweep (KVC kv-aware, session soft_cap raised 4->16)
|
|
[2026-04-28 20:50:21] Model: /mnt/kzlin/workflow/pd-hybrid/simm-swe-bench/models/Qwen3-30B-A3B-Instruct-2507
|
|
[2026-04-28 20:50:21] Trace: outputs/qwen35-swebench-50sess.jsonl (4449 requests, 52 sessions)
|
|
[2026-04-28 20:50:21] Key change: _decode_session_soft_cap now min(16, ...) instead of min(4, ...)
|
|
[2026-04-28 20:50:21]
|
|
[2026-04-28 20:50:21] === [EXP1] 1P7D KVC kv-aware cap=16 ===
|
|
[2026-04-28 21:40:57] === exp1_1p7d_kvc_cap16 COMPLETED ===
|
|
[2026-04-28 21:40:57] Summary:
|
|
{
|
|
"actual_output_tokens_stats": {
|
|
"count": 4014.0,
|
|
"mean": 215.048081714001,
|
|
"p50": 83.0,
|
|
"p90": 570.0,
|
|
"p99": 1343.0
|
|
},
|
|
"cache_hit_request_count": 3865,
|
|
"cached_tokens_stats": {
|
|
"count": 4449.0,
|
|
"mean": 21373.60867610699,
|
|
"p50": 18429.0,
|
|
"p90": 45643.0,
|
|
"p99": 65088.0
|
|
},
|
|
"decode_request_priorities": {},
|
|
"error_count": 435,
|
|
"execution_modes": {
|
|
"kvcache-centric": 435,
|
|
"kvcache-direct-to-d-session": 2180,
|
|
"pd-router-d-session-reseed": 44,
|
|
"pd-router-d-session-reseed-after-eviction": 1,
|
|
"pd-router-fallback-d-backpressure": 36,
|
|
"pd-router-fallback-large-append": 35,
|
|
"pd-router-fallback-large-append-seed-filter-early-turn": 52,
|
|
"pd-router-fallback-large-append-session-cap": 1500,
|
|
"pd-router-fallback-no-d-capacity": 13,
|
|
"pd-router-fallback-session-cap": 43,
|
|
"pd-router-large-append-reseed": 55,
|
|
"pd-router-large-append-reseed-after-eviction": 3,
|
|
"pd-router-turn1-d-backpressure": 1,
|
|
"pd-router-turn1-no-d-capacity": 5,
|
|
"pd-router-turn1-seed": 46
|
|
},
|
|
"latency_stats_s": {
|
|
"count": 4014.0,
|
|
"mean": 4.214657033050009,
|
|
"p50": 1.0827504023909569,
|
|
"p90": 13.380241627804935,
|
|
"p99": 24.453291333280504
|
|
},
|
|
"mechanisms": {
|
|
"kvcache-centric": 4449
|
|
},
|
|
"per_decode_load": {
|
|
"decode-0": 690,
|
|
"decode-1": 599,
|
|
"decode-2": 660,
|
|
"decode-3": 584,
|
|
"decode-4": 606,
|
|
"decode-5": 646,
|
|
"decode-6": 664
|
|
},
|
|
"per_prefill_load": {
|
|
"prefill-0": 4449
|
|
},
|
|
"prefill_request_priorities": {
|
|
"-100": 149,
|
|
"100": 1685
|
|
},
|
|
"re_prefill_count": 0,
|
|
"request_count": 4449,
|
|
"reuse_expected_count": 4397,
|
|
"reuse_observed_count": 4397,
|
|
"router_url": "http://127.0.0.1:8000",
|
|
"session_reset_count": 0,
|
|
"session_reused_count": 2180,
|
|
"total_actual_kv_transfer_blocks": 52857,
|
|
"total_cached_tokens": 95091185,
|
|
"total_kv_transfer_blocks": 105235,
|
|
"tpot_stats_s": {
|
|
"count": 4014.0,
|
|
"mean": 0.005804301410418847,
|
|
"p50": 0.005607025208882987,
|
|
"p90": 0.007293824862528552,
|
|
"p99": 0.008864479259402893
|
|
},
|
|
"trace_path": "outputs/qwen3-30b-tp1-v4-cap16/kvcache-centric-kv-aware-worker-admission-20260428T125022Z/sampled-trace.jsonl",
|
|
"truncated_request_count": 43,
|
|
"ttft_stats_s": {
|
|
"count": 4014.0,
|
|
"mean": 2.915135478307124,
|
|
"p50": 0.05643345229327679,
|
|
"p90": 11.900803190656006,
|
|
"p99": 22.758968392387033
|
|
}
|
|
}
|
|
[2026-04-28 21:40:57] Saved to outputs/qwen3-30b-tp1-v4-cap16/exp1_1p7d_kvc_cap16_summary.json + exp1_1p7d_kvc_cap16_metrics.jsonl
|
|
[2026-04-28 21:40:57]
|
|
[2026-04-28 21:40:57] === [EXP2] 2P6D KVC kv-aware cap=16 ===
|
|
[2026-04-28 22:27:53] === exp2_2p6d_kvc_cap16 COMPLETED ===
|
|
[2026-04-28 22:27:53] Summary:
|
|
{
|
|
"actual_output_tokens_stats": {
|
|
"count": 4046.0,
|
|
"mean": 224.65002471576867,
|
|
"p50": 84.0,
|
|
"p90": 576.0,
|
|
"p99": 1349.0
|
|
},
|
|
"cache_hit_request_count": 3925,
|
|
"cached_tokens_stats": {
|
|
"count": 4449.0,
|
|
"mean": 22852.7439874129,
|
|
"p50": 19584.0,
|
|
"p90": 49009.0,
|
|
"p99": 67320.0
|
|
},
|
|
"decode_request_priorities": {},
|
|
"error_count": 403,
|
|
"execution_modes": {
|
|
"kvcache-centric": 403,
|
|
"kvcache-direct-to-d-session": 2348,
|
|
"pd-router-d-session-reseed": 28,
|
|
"pd-router-fallback-d-backpressure": 7,
|
|
"pd-router-fallback-large-append": 68,
|
|
"pd-router-fallback-large-append-seed-filter-early-turn": 45,
|
|
"pd-router-fallback-large-append-session-cap": 1403,
|
|
"pd-router-fallback-no-d-capacity": 9,
|
|
"pd-router-fallback-session-cap": 25,
|
|
"pd-router-large-append-reseed": 57,
|
|
"pd-router-large-append-reseed-after-eviction": 6,
|
|
"pd-router-turn1-no-d-capacity": 1,
|
|
"pd-router-turn1-seed": 49
|
|
},
|
|
"latency_stats_s": {
|
|
"count": 4046.0,
|
|
"mean": 2.505981629502371,
|
|
"p50": 0.8372491216287017,
|
|
"p90": 6.5139341270551085,
|
|
"p99": 18.335972285829484
|
|
},
|
|
"mechanisms": {
|
|
"kvcache-centric": 4449
|
|
},
|
|
"per_decode_load": {
|
|
"decode-0": 767,
|
|
"decode-1": 680,
|
|
"decode-2": 906,
|
|
"decode-3": 818,
|
|
"decode-4": 800,
|
|
"decode-5": 478
|
|
},
|
|
"per_prefill_load": {
|
|
"prefill-0": 2225,
|
|
"prefill-1": 2224
|
|
},
|
|
"prefill_request_priorities": {
|
|
"-100": 140,
|
|
"100": 1558
|
|
},
|
|
"re_prefill_count": 0,
|
|
"request_count": 4449,
|
|
"reuse_expected_count": 4397,
|
|
"reuse_observed_count": 4397,
|
|
"router_url": "http://127.0.0.1:8000",
|
|
"session_reset_count": 0,
|
|
"session_reused_count": 2348,
|
|
"total_actual_kv_transfer_blocks": 50727,
|
|
"total_cached_tokens": 101671858,
|
|
"total_kv_transfer_blocks": 105235,
|
|
"tpot_stats_s": {
|
|
"count": 4046.0,
|
|
"mean": 0.005708743129332261,
|
|
"p50": 0.005565466725497757,
|
|
"p90": 0.006912594398356141,
|
|
"p99": 0.008102089307750717
|
|
},
|
|
"trace_path": "outputs/qwen3-30b-tp1-v4-cap16/kvcache-centric-kv-aware-worker-admission-20260428T134057Z/sampled-trace.jsonl",
|
|
"truncated_request_count": 36,
|
|
"ttft_stats_s": {
|
|
"count": 4046.0,
|
|
"mean": 1.1653790952959129,
|
|
"p50": 0.05140436999499798,
|
|
"p90": 2.6447059931233525,
|
|
"p99": 15.121314341202378
|
|
}
|
|
}
|
|
[2026-04-28 22:27:53] Saved to outputs/qwen3-30b-tp1-v4-cap16/exp2_2p6d_kvc_cap16_summary.json + exp2_2p6d_kvc_cap16_metrics.jsonl
|
|
[2026-04-28 22:27:53]
|
|
[2026-04-28 22:27:53] === ALL TP1 V4 SWEEP EXPERIMENTS DONE ===
|