From bf037594c4e079b42e0e7a348c6e2a1760b26e10 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sat, 23 May 2026 15:44:34 +0800
Subject: [PATCH] Production-realistic baseline: APC 67.5%, TPOT +139% from
 interference

Updated methodology:
- Window+thin sampling preserves cross-session sharing (48% vs 16%)
- --max-single-turn-ratio 0.3 boosts multi-turn to 70%
- --window-seconds 600 for 10-min contiguous window
- Trace-driven replay (no session limit, no time compression)
- Daily config: --requests 850 (~13 min, APC~76%)

Key result: TPOT p90=0.175s (vs 0.073s in legacy 1-req/GPU setup),
confirming prefill-decode interference is real at production concurrency.
APC 67.5% (vs 44%) from better KV reuse preservation.

Also fixed KV reuse breakdown: 62% intra-session / 38% cross-session
(was incorrectly reported as 91% / 9%).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 REPORT.md               | 91 ++++++++++++++++++++++++++++++-----------
 scripts/bench.sh        |  3 +-
 scripts/sample_trace.py | 71 +++++++++++++++++++-------------
 3 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/REPORT.md b/REPORT.md
index c663bc0..b050285 100644
--- a/REPORT.md
+++ b/REPORT.md
@@ -42,10 +42,30 @@ For agentic LLM workloads (long input, short output, high KV cache reuse), is pr
 | Avg output tokens | 445 (p50=80) |
 | I/O ratio | 75.6× aggregate |
 | Prefill token share | 98% |
-| KV reuse (intra-session) | 91% of reusable blocks |
-| Theoretical max APC | 71% (infinite cache, single instance) |
+| KV reuse breakdown | 62% intra-session, 38% cross-session (token-level) |
+| Theoretical max APC | 67% (infinite cache, single instance, prefix-only) |
 
-**Sampled trace for benchmarks**: `traces/sampled_1000req_seed42.jsonl` (1000 requests, seed=42, preserving session structure). For 200-request ablations: replayer `--request-limit 200`.
+**Sampled trace for benchmarks**: `traces/w600_r0.0015_st30.jsonl` (1214 requests, 688 sessions, 70% multi-turn). Generated with window+thin sampling:
+
+```bash
+python scripts/sample_trace.py \
+    --input ~/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl \
+    --output traces/w600_r0.0015_st30.jsonl \
+    --sample-ratio 0.0015 --max-single-turn-ratio 0.3 \
+    --window-seconds 600 --seed 42
+```
+
+| Trace property | Value |
+|---------------|-------|
+| Sessions | 688 (70% multi-turn, avg 4.9 turns) |
+| Requests | 1214 (use `--request-limit 850` for daily, full for validation) |
+| Avg input tokens | 48,776 |
+| Trace span | 2912s (48.5 min); dense segment 0-990s (850 req) |
+| Peak QPS | 1.6 req/s (in dense segment) |
+| Hash block sharing | 48.3% (vs 52% full trace) |
+| Theoretical APC | 80% (full), 76% (first 850 req) |
+
+> **Sampling methodology (2026-05-23)**: Prior traces used random session sampling + `--time-scale` compression + `--max-inflight-sessions` semaphore, which (a) destroyed cross-session hash block sharing (52% → 16%), (b) artificially limited concurrency to 1 req/GPU, and (c) masked prefill-decode interference. The new approach uses contiguous time-window sampling with session thinning (`--max-single-turn-ratio 0.3`) to preserve KV reuse patterns, and trace-driven replay with no artificial concurrency limits.
 
 ### 2.4 Two Configurations Compared
 
@@ -119,9 +139,10 @@ python scripts/cache_aware_proxy.py \
 
 | Parameter | Value |
 |-----------|-------|
-| Requests | 200 (from sampled 1000-req trace, `--request-limit 200`) |
-| Time scale | 20× (compress 2h trace into ~6min) |
-| Max inflight sessions | 8 |
+| Trace | `traces/w600_r0.0015_st30.jsonl` (window+thin, 70% multi-turn) |
+| Daily iteration | `--request-limit 850` (~13 min, APC≈76%) |
+| Full validation | All 1214 requests (~48 min, APC≈80%) |
+| Replay mode | Trace-driven (no session limit, no time compression) |
 | Request timeout | 600s |
 | vLLM flags | `--enforce-eager --enable-prefix-caching --max-model-len 200000` |
 | GPU memory util | 0.9 |
@@ -139,31 +160,22 @@ python scripts/sample_trace.py \
     --output traces/sampled_1000req_seed42.jsonl \
     --target-requests 1000 --seed 42
 
-# Start GPU monitoring (in a separate terminal)
-bash scripts/gpu_monitor.sh > outputs/<tag>/gpu_util.csv &
+# Run benchmark (daily iteration)
+bash scripts/bench.sh --tag my_experiment --mode baseline --policy linear \
+    --trace traces/w600_r0.0015_st30.jsonl --requests 850
 
-# Run replayer against proxy
-python -m replayer \
-    --trace traces/sampled_1000req_seed42.jsonl \
-    --output outputs/<tag>/metrics.jsonl \
-    --endpoint http://localhost:9090 \
-    --time-scale 20 --max-inflight-sessions 8 \
-    --request-limit 200 -v
-
-# Collect proxy breakdown (elastic only)
-curl -s http://localhost:9090/breakdown > outputs/<tag>/breakdown.json
-
-# Collect APC from vLLM logs
-for i in $(seq 0 7); do
-    grep "Prefix cache hit rate\|External prefix cache hit rate" /tmp/<prefix>_$i.log | tail -2
-done
+# Run benchmark (full validation)
+bash scripts/bench.sh --tag my_experiment_full --mode baseline --policy linear \
+    --trace traces/w600_r0.0015_st30.jsonl
 ```
 
 ## 3. Results
 
-> **Errata (2026-05-22)**: The initial cross-machine A/B (dash0 baseline vs dash1 elastic) reported -44% E2E improvement. Post-hoc analysis revealed the dash0 baseline instances were **not freshly restarted** — residual KV cache from prior experiments caused 2× TTFT inflation. All results below use verified fresh-restart experiments on the same machine.
+> **Errata (2026-05-22)**: The initial cross-machine A/B (dash0 baseline vs dash1 elastic) reported -44% E2E improvement. Post-hoc analysis revealed the dash0 baseline instances were **not freshly restarted** — residual KV cache from prior experiments caused 2× TTFT inflation.
 
-### 3.1 Fair Comparison (all fresh-restart, same machine dash0, 200 req)
+> **Errata (2026-05-23)**: §3.1 results used artificial concurrency limits (`--max-inflight-sessions 8`, 1 req/GPU) and random session sampling that destroyed cross-session KV sharing (52% → 16%). See §3.6 for production-realistic results with corrected methodology.
+
+### 3.1 Legacy Comparison (artificial 1 req/GPU, 200 req)
 
 | Config | OK/N | TTFT p50 | TTFT p90 | TPOT p90 | E2E p50 |
 |--------|------|----------|----------|----------|---------|
@@ -230,6 +242,35 @@ Delta:                   -45%          -44%    ← INVALID
 
 The elastic numbers on dash1 were genuinely fresh. The "improvement" was actually comparing fresh elastic against degraded baseline.
 
+### 3.6 Production-Realistic Baseline (trace-driven, corrected methodology)
+
+> Corrected sampling (window+thin, 70% multi-turn, block sharing 48%) and trace-driven replay (no session limit, no time compression). See §2.3 for trace details.
+
+**Linear policy, 912 requests (dense segment), peak QPS ≈ 1.6:**
+
+| Metric | Legacy (§3.1, 1 req/GPU) | **New (trace-driven)** | Delta |
+|--------|-------------------------|----------------------|-------|
+| TTFT mean | 1.07s | **4.54s** | +4.2× |
+| TTFT p50 | 1.08s | **0.94s** | -13% |
+| TTFT p90 | 9.38s | **14.12s** | +51% |
+| TPOT p50 | 0.038s | **0.070s** | **+84%** |
+| TPOT p90 | 0.073s | **0.175s** | **+139%** |
+| APC (mean) | ~44% | **67.5%** | **+23pp** |
+| Errors | 2/200 (1.0%) | 0/912 (0%) | better |
+| E2E p50 | 5.08s | 6.98s | +37% |
+
+**Key differences from legacy methodology:**
+
+1. **APC 67.5% vs 44%**: Window+thin sampling preserves cross-session block sharing (48% vs 16% in legacy random sampling), yielding production-realistic cache hit rates. Per-instance APC ranges 46–84%.
+
+2. **TPOT +139% at p90**: With trace-driven replay, multiple concurrent requests per GPU create **real prefill-decode interference**. The legacy 1 req/GPU setup showed TPOT p90=0.073s (no interference), but production-realistic load shows TPOT p90=0.175s. This validates that prefill-decode interference is a real problem at production concurrency.
+
+3. **TTFT p50 improved (-13%) but mean degraded (+4.2×)**: Higher APC means cached requests get very fast TTFT (p50=0.94s). But concurrent heavy prefills cause queuing for non-cached requests, inflating the mean and p90.
+
+4. **Per-instance APC imbalance (46–84%)**: Routing quality directly determines per-instance APC. The 38pp gap between worst and best instance suggests routing optimization is still the highest-leverage improvement.
+
+**Output**: `outputs/baseline_r0015_st30/` on dash0.
+
 ## 4. System-Level Analysis
 
 ### 4.1 Elastic P2P Does Not Improve Single-Machine Performance
diff --git a/scripts/bench.sh b/scripts/bench.sh
index da29ae3..aebae83 100755
--- a/scripts/bench.sh
+++ b/scripts/bench.sh
@@ -21,7 +21,7 @@ VENV="${VENV_PATH:-$PROJECT_DIR/.venv/bin}"
 PYTHON="$VENV/python"
 VLLM="$VENV/vllm"
 MODEL="${MODEL_PATH:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
-TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl"
+TRACE="${TRACE:-$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl}"
 
 # Defaults
 TAG=""
@@ -44,6 +44,7 @@ while [[ $# -gt 0 ]]; do
         --policy) POLICY="$2"; shift 2 ;;
         --instances) N_INSTANCES="$2"; shift 2 ;;
         --requests) REQUESTS="$2"; shift 2 ;;
+        --trace) TRACE="$2"; shift 2 ;;
         --heavy-threshold) HEAVY_THRESHOLD="$2"; shift 2 ;;
         --no-offload) NO_OFFLOAD=true; shift ;;
         --overload-factor) OVERLOAD_FACTOR_ARG="$2"; shift 2 ;;
diff --git a/scripts/sample_trace.py b/scripts/sample_trace.py
index 8933a17..84b46cb 100644
--- a/scripts/sample_trace.py
+++ b/scripts/sample_trace.py
@@ -70,13 +70,15 @@ def sample_sessions(
     sample_ratio: float | None = None,
     target_requests: int | None = None,
     max_single_turn_ratio: float | None = None,
+    window_seconds: float | None = None,
     seed: int,
 ) -> list[str]:
     """Sample sessions preserving KV cache reuse."""
     rng = random.Random(seed)
 
     if sample_ratio is not None:
-        selected = _sample_window_then_thin(rows_by_session, sample_ratio, rng)
+        selected = _sample_window_then_thin(rows_by_session, sample_ratio,
+                                            window_seconds, rng)
     elif target_requests is not None:
         all_sids = list(rows_by_session.keys())
         rng.shuffle(all_sids)
@@ -121,17 +123,18 @@ def _cap_single_turn(
 def _sample_window_then_thin(
     rows_by_session: dict[str, list[dict]],
     ratio: float,
+    window_seconds: float | None,
     rng: random.Random,
 ) -> list[str]:
     """Window + thin sampling that preserves cross-session sharing.
 
     1. Compute first-request timestamp for each session.
-    2. Pick a contiguous time window sized so that
-       window_sessions * thin_ratio ≈ total_sessions * ratio.
-       thin_ratio is kept >= 0.5 to preserve cross-session sharing.
-    3. Randomly drop (1 - thin_ratio) of sessions within the window.
+    2. Pick a contiguous time window:
+       - If --window-seconds given: use that duration, thin by ratio within it.
+       - Otherwise: auto-size so window_sessions * thin_ratio ≈ target.
+    3. Keep all sessions whose first request falls within the window.
+    4. Randomly thin sessions within the window to hit target count.
     """
-    # Session start times (timestamp of first request)
     session_starts: list[tuple[float, str]] = []
     for sid, rows in rows_by_session.items():
         t0 = min(float(r["timestamp"]) for r in rows)
@@ -140,35 +143,44 @@ def _sample_window_then_thin(
 
     total_sessions = len(session_starts)
     target_n = max(1, int(total_sessions * ratio))
+    trace_start = session_starts[0][0]
+    trace_end = session_starts[-1][0]
+    trace_duration = trace_end - trace_start
 
-    # Determine thin_ratio and window size
-    # thin_ratio >= 0.5 to preserve sharing; prefer 1.0 if window fits
-    # window_sessions = target_n / thin_ratio
+    if window_seconds is not None:
+        # Fixed window: pick a random start, thin to hit target ratio
+        max_start_t = trace_end - window_seconds
+        if max_start_t <= trace_start:
+            win_start_t = trace_start
+        else:
+            win_start_t = trace_start + rng.random() * (max_start_t - trace_start)
+        win_end_t = win_start_t + window_seconds
+
+        window_sids = [sid for t, sid in session_starts
+                       if win_start_t <= t <= win_end_t]
+        # Thin to target
+        if len(window_sids) > target_n:
+            thin_ratio = target_n / len(window_sids)
+            window_sids = [s for s in window_sids if rng.random() < thin_ratio]
+        return window_sids
+
+    # Auto-size window
     thin_ratio = min(1.0, max(0.5, ratio * 10))
-    window_sessions = int(target_n / thin_ratio)
-    window_sessions = min(window_sessions, total_sessions)
+    window_sessions = min(int(target_n / thin_ratio), total_sessions)
 
-    # Pick window start: random position in the trace
     max_start = total_sessions - window_sessions
-    if max_start <= 0:
-        window_start = 0
-    else:
-        window_start = rng.randint(0, max_start)
+    window_start = rng.randint(0, max_start) if max_start > 0 else 0
+    window_sids = [sid for _, sid in
+                   session_starts[window_start:window_start + window_sessions]]
 
-    window_sids = [sid for _, sid in session_starts[window_start:window_start + window_sessions]]
+    if thin_ratio < 1.0:
+        window_sids = [s for s in window_sids if rng.random() < thin_ratio]
 
-    # Thin within window
-    if thin_ratio >= 1.0:
-        selected = window_sids
-    else:
-        selected = [sid for sid in window_sids if rng.random() < thin_ratio]
+    if len(window_sids) > target_n * 1.2:
+        rng.shuffle(window_sids)
+        window_sids = window_sids[:int(target_n * 1.1)]
 
-    # Ensure we don't overshoot target by too much
-    if len(selected) > target_n * 1.2:
-        rng.shuffle(selected)
-        selected = selected[:int(target_n * 1.1)]
-
-    return selected
+    return window_sids
 
 
 def build_output(
@@ -245,6 +257,8 @@ def main() -> None:
                    help="Target number of requests (legacy, no sharing preservation)")
     p.add_argument("--max-single-turn-ratio", type=float, default=None,
                    help="Cap single-turn sessions to this fraction of total (e.g. 0.3)")
+    p.add_argument("--window-seconds", type=float, default=None,
+                   help="Time window duration in seconds (e.g. 600 for 10 min)")
     p.add_argument("--seed", type=int, default=42)
     args = p.parse_args()
 
@@ -262,6 +276,7 @@ def main() -> None:
         sample_ratio=args.sample_ratio,
         target_requests=args.target_requests,
         max_single_turn_ratio=args.max_single_turn_ratio,
+        window_seconds=args.window_seconds,
         seed=args.seed,
     )