From bf037594c4e079b42e0e7a348c6e2a1760b26e10 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Sat, 23 May 2026 15:44:34 +0800 Subject: [PATCH] Production-realistic baseline: APC 67.5%, TPOT +139% from interference Updated methodology: - Window+thin sampling preserves cross-session sharing (48% vs 16%) - --max-single-turn-ratio 0.3 boosts multi-turn to 70% - --window-seconds 600 for 10-min contiguous window - Trace-driven replay (no session limit, no time compression) - Daily config: --requests 850 (~13 min, APC~76%) Key result: TPOT p90=0.175s (vs 0.073s in legacy 1-req/GPU setup), confirming prefill-decode interference is real at production concurrency. APC 67.5% (vs 44%) from better KV reuse preservation. Also fixed KV reuse breakdown: 62% intra-session / 38% cross-session (was incorrectly reported as 91% / 9%). Co-Authored-By: Claude Opus 4.6 (1M context) --- REPORT.md | 91 ++++++++++++++++++++++++++++++----------- scripts/bench.sh | 3 +- scripts/sample_trace.py | 71 +++++++++++++++++++------------- 3 files changed, 111 insertions(+), 54 deletions(-) diff --git a/REPORT.md b/REPORT.md index c663bc0..b050285 100644 --- a/REPORT.md +++ b/REPORT.md @@ -42,10 +42,30 @@ For agentic LLM workloads (long input, short output, high KV cache reuse), is pr | Avg output tokens | 445 (p50=80) | | I/O ratio | 75.6× aggregate | | Prefill token share | 98% | -| KV reuse (intra-session) | 91% of reusable blocks | -| Theoretical max APC | 71% (infinite cache, single instance) | +| KV reuse breakdown | 62% intra-session, 38% cross-session (token-level) | +| Theoretical max APC | 67% (infinite cache, single instance, prefix-only) | -**Sampled trace for benchmarks**: `traces/sampled_1000req_seed42.jsonl` (1000 requests, seed=42, preserving session structure). For 200-request ablations: replayer `--request-limit 200`. +**Sampled trace for benchmarks**: `traces/w600_r0.0015_st30.jsonl` (1214 requests, 688 sessions, 70% multi-turn). Generated with window+thin sampling: + +```bash +python scripts/sample_trace.py \ + --input ~/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl \ + --output traces/w600_r0.0015_st30.jsonl \ + --sample-ratio 0.0015 --max-single-turn-ratio 0.3 \ + --window-seconds 600 --seed 42 +``` + +| Trace property | Value | +|---------------|-------| +| Sessions | 688 (70% multi-turn, avg 4.9 turns) | +| Requests | 1214 (use `--request-limit 850` for daily, full for validation) | +| Avg input tokens | 48,776 | +| Trace span | 2912s (48.5 min); dense segment 0-990s (850 req) | +| Peak QPS | 1.6 req/s (in dense segment) | +| Hash block sharing | 48.3% (vs 52% full trace) | +| Theoretical APC | 80% (full), 76% (first 850 req) | + +> **Sampling methodology (2026-05-23)**: Prior traces used random session sampling + `--time-scale` compression + `--max-inflight-sessions` semaphore, which (a) destroyed cross-session hash block sharing (52% → 16%), (b) artificially limited concurrency to 1 req/GPU, and (c) masked prefill-decode interference. The new approach uses contiguous time-window sampling with session thinning (`--max-single-turn-ratio 0.3`) to preserve KV reuse patterns, and trace-driven replay with no artificial concurrency limits. ### 2.4 Two Configurations Compared @@ -119,9 +139,10 @@ python scripts/cache_aware_proxy.py \ | Parameter | Value | |-----------|-------| -| Requests | 200 (from sampled 1000-req trace, `--request-limit 200`) | -| Time scale | 20× (compress 2h trace into ~6min) | -| Max inflight sessions | 8 | +| Trace | `traces/w600_r0.0015_st30.jsonl` (window+thin, 70% multi-turn) | +| Daily iteration | `--request-limit 850` (~13 min, APC≈76%) | +| Full validation | All 1214 requests (~48 min, APC≈80%) | +| Replay mode | Trace-driven (no session limit, no time compression) | | Request timeout | 600s | | vLLM flags | `--enforce-eager --enable-prefix-caching --max-model-len 200000` | | GPU memory util | 0.9 | @@ -139,31 +160,22 @@ python scripts/sample_trace.py \ --output traces/sampled_1000req_seed42.jsonl \ --target-requests 1000 --seed 42 -# Start GPU monitoring (in a separate terminal) -bash scripts/gpu_monitor.sh > outputs//gpu_util.csv & +# Run benchmark (daily iteration) +bash scripts/bench.sh --tag my_experiment --mode baseline --policy linear \ + --trace traces/w600_r0.0015_st30.jsonl --requests 850 -# Run replayer against proxy -python -m replayer \ - --trace traces/sampled_1000req_seed42.jsonl \ - --output outputs//metrics.jsonl \ - --endpoint http://localhost:9090 \ - --time-scale 20 --max-inflight-sessions 8 \ - --request-limit 200 -v - -# Collect proxy breakdown (elastic only) -curl -s http://localhost:9090/breakdown > outputs//breakdown.json - -# Collect APC from vLLM logs -for i in $(seq 0 7); do - grep "Prefix cache hit rate\|External prefix cache hit rate" /tmp/_$i.log | tail -2 -done +# Run benchmark (full validation) +bash scripts/bench.sh --tag my_experiment_full --mode baseline --policy linear \ + --trace traces/w600_r0.0015_st30.jsonl ``` ## 3. Results -> **Errata (2026-05-22)**: The initial cross-machine A/B (dash0 baseline vs dash1 elastic) reported -44% E2E improvement. Post-hoc analysis revealed the dash0 baseline instances were **not freshly restarted** — residual KV cache from prior experiments caused 2× TTFT inflation. All results below use verified fresh-restart experiments on the same machine. +> **Errata (2026-05-22)**: The initial cross-machine A/B (dash0 baseline vs dash1 elastic) reported -44% E2E improvement. Post-hoc analysis revealed the dash0 baseline instances were **not freshly restarted** — residual KV cache from prior experiments caused 2× TTFT inflation. -### 3.1 Fair Comparison (all fresh-restart, same machine dash0, 200 req) +> **Errata (2026-05-23)**: §3.1 results used artificial concurrency limits (`--max-inflight-sessions 8`, 1 req/GPU) and random session sampling that destroyed cross-session KV sharing (52% → 16%). See §3.6 for production-realistic results with corrected methodology. + +### 3.1 Legacy Comparison (artificial 1 req/GPU, 200 req) | Config | OK/N | TTFT p50 | TTFT p90 | TPOT p90 | E2E p50 | |--------|------|----------|----------|----------|---------| @@ -230,6 +242,35 @@ Delta: -45% -44% ← INVALID The elastic numbers on dash1 were genuinely fresh. The "improvement" was actually comparing fresh elastic against degraded baseline. +### 3.6 Production-Realistic Baseline (trace-driven, corrected methodology) + +> Corrected sampling (window+thin, 70% multi-turn, block sharing 48%) and trace-driven replay (no session limit, no time compression). See §2.3 for trace details. + +**Linear policy, 912 requests (dense segment), peak QPS ≈ 1.6:** + +| Metric | Legacy (§3.1, 1 req/GPU) | **New (trace-driven)** | Delta | +|--------|-------------------------|----------------------|-------| +| TTFT mean | 1.07s | **4.54s** | +4.2× | +| TTFT p50 | 1.08s | **0.94s** | -13% | +| TTFT p90 | 9.38s | **14.12s** | +51% | +| TPOT p50 | 0.038s | **0.070s** | **+84%** | +| TPOT p90 | 0.073s | **0.175s** | **+139%** | +| APC (mean) | ~44% | **67.5%** | **+23pp** | +| Errors | 2/200 (1.0%) | 0/912 (0%) | better | +| E2E p50 | 5.08s | 6.98s | +37% | + +**Key differences from legacy methodology:** + +1. **APC 67.5% vs 44%**: Window+thin sampling preserves cross-session block sharing (48% vs 16% in legacy random sampling), yielding production-realistic cache hit rates. Per-instance APC ranges 46–84%. + +2. **TPOT +139% at p90**: With trace-driven replay, multiple concurrent requests per GPU create **real prefill-decode interference**. The legacy 1 req/GPU setup showed TPOT p90=0.073s (no interference), but production-realistic load shows TPOT p90=0.175s. This validates that prefill-decode interference is a real problem at production concurrency. + +3. **TTFT p50 improved (-13%) but mean degraded (+4.2×)**: Higher APC means cached requests get very fast TTFT (p50=0.94s). But concurrent heavy prefills cause queuing for non-cached requests, inflating the mean and p90. + +4. **Per-instance APC imbalance (46–84%)**: Routing quality directly determines per-instance APC. The 38pp gap between worst and best instance suggests routing optimization is still the highest-leverage improvement. + +**Output**: `outputs/baseline_r0015_st30/` on dash0. + ## 4. System-Level Analysis ### 4.1 Elastic P2P Does Not Improve Single-Machine Performance diff --git a/scripts/bench.sh b/scripts/bench.sh index da29ae3..aebae83 100755 --- a/scripts/bench.sh +++ b/scripts/bench.sh @@ -21,7 +21,7 @@ VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}" PYTHON="$VENV/python" VLLM="$VENV/vllm" MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}" -TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl" +TRACE="${TRACE:-$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl}" # Defaults TAG="" @@ -44,6 +44,7 @@ while [[ $# -gt 0 ]]; do --policy) POLICY="$2"; shift 2 ;; --instances) N_INSTANCES="$2"; shift 2 ;; --requests) REQUESTS="$2"; shift 2 ;; + --trace) TRACE="$2"; shift 2 ;; --heavy-threshold) HEAVY_THRESHOLD="$2"; shift 2 ;; --no-offload) NO_OFFLOAD=true; shift ;; --overload-factor) OVERLOAD_FACTOR_ARG="$2"; shift 2 ;; diff --git a/scripts/sample_trace.py b/scripts/sample_trace.py index 8933a17..84b46cb 100644 --- a/scripts/sample_trace.py +++ b/scripts/sample_trace.py @@ -70,13 +70,15 @@ def sample_sessions( sample_ratio: float | None = None, target_requests: int | None = None, max_single_turn_ratio: float | None = None, + window_seconds: float | None = None, seed: int, ) -> list[str]: """Sample sessions preserving KV cache reuse.""" rng = random.Random(seed) if sample_ratio is not None: - selected = _sample_window_then_thin(rows_by_session, sample_ratio, rng) + selected = _sample_window_then_thin(rows_by_session, sample_ratio, + window_seconds, rng) elif target_requests is not None: all_sids = list(rows_by_session.keys()) rng.shuffle(all_sids) @@ -121,17 +123,18 @@ def _cap_single_turn( def _sample_window_then_thin( rows_by_session: dict[str, list[dict]], ratio: float, + window_seconds: float | None, rng: random.Random, ) -> list[str]: """Window + thin sampling that preserves cross-session sharing. 1. Compute first-request timestamp for each session. - 2. Pick a contiguous time window sized so that - window_sessions * thin_ratio ≈ total_sessions * ratio. - thin_ratio is kept >= 0.5 to preserve cross-session sharing. - 3. Randomly drop (1 - thin_ratio) of sessions within the window. + 2. Pick a contiguous time window: + - If --window-seconds given: use that duration, thin by ratio within it. + - Otherwise: auto-size so window_sessions * thin_ratio ≈ target. + 3. Keep all sessions whose first request falls within the window. + 4. Randomly thin sessions within the window to hit target count. """ - # Session start times (timestamp of first request) session_starts: list[tuple[float, str]] = [] for sid, rows in rows_by_session.items(): t0 = min(float(r["timestamp"]) for r in rows) @@ -140,35 +143,44 @@ def _sample_window_then_thin( total_sessions = len(session_starts) target_n = max(1, int(total_sessions * ratio)) + trace_start = session_starts[0][0] + trace_end = session_starts[-1][0] + trace_duration = trace_end - trace_start - # Determine thin_ratio and window size - # thin_ratio >= 0.5 to preserve sharing; prefer 1.0 if window fits - # window_sessions = target_n / thin_ratio + if window_seconds is not None: + # Fixed window: pick a random start, thin to hit target ratio + max_start_t = trace_end - window_seconds + if max_start_t <= trace_start: + win_start_t = trace_start + else: + win_start_t = trace_start + rng.random() * (max_start_t - trace_start) + win_end_t = win_start_t + window_seconds + + window_sids = [sid for t, sid in session_starts + if win_start_t <= t <= win_end_t] + # Thin to target + if len(window_sids) > target_n: + thin_ratio = target_n / len(window_sids) + window_sids = [s for s in window_sids if rng.random() < thin_ratio] + return window_sids + + # Auto-size window thin_ratio = min(1.0, max(0.5, ratio * 10)) - window_sessions = int(target_n / thin_ratio) - window_sessions = min(window_sessions, total_sessions) + window_sessions = min(int(target_n / thin_ratio), total_sessions) - # Pick window start: random position in the trace max_start = total_sessions - window_sessions - if max_start <= 0: - window_start = 0 - else: - window_start = rng.randint(0, max_start) + window_start = rng.randint(0, max_start) if max_start > 0 else 0 + window_sids = [sid for _, sid in + session_starts[window_start:window_start + window_sessions]] - window_sids = [sid for _, sid in session_starts[window_start:window_start + window_sessions]] + if thin_ratio < 1.0: + window_sids = [s for s in window_sids if rng.random() < thin_ratio] - # Thin within window - if thin_ratio >= 1.0: - selected = window_sids - else: - selected = [sid for sid in window_sids if rng.random() < thin_ratio] + if len(window_sids) > target_n * 1.2: + rng.shuffle(window_sids) + window_sids = window_sids[:int(target_n * 1.1)] - # Ensure we don't overshoot target by too much - if len(selected) > target_n * 1.2: - rng.shuffle(selected) - selected = selected[:int(target_n * 1.1)] - - return selected + return window_sids def build_output( @@ -245,6 +257,8 @@ def main() -> None: help="Target number of requests (legacy, no sharing preservation)") p.add_argument("--max-single-turn-ratio", type=float, default=None, help="Cap single-turn sessions to this fraction of total (e.g. 0.3)") + p.add_argument("--window-seconds", type=float, default=None, + help="Time window duration in seconds (e.g. 600 for 10 min)") p.add_argument("--seed", type=int, default=42) args = p.parse_args() @@ -262,6 +276,7 @@ def main() -> None: sample_ratio=args.sample_ratio, target_requests=args.target_requests, max_single_turn_ratio=args.max_single_turn_ratio, + window_seconds=args.window_seconds, seed=args.seed, )