diff --git a/analysis/characterization/joined_analysis.py b/analysis/characterization/joined_analysis.py index 3f7d5a9..c2483c3 100644 --- a/analysis/characterization/joined_analysis.py +++ b/analysis/characterization/joined_analysis.py @@ -352,7 +352,10 @@ def hotspot_index(joined: list[JsonDict]) -> JsonDict: p90s_q = sorted(worker_p90_q.values()) idx = None if len(p90s_q) >= 2: - median = p90s_q[len(p90s_q) // 2] + # True median: average of two middle values for even-length lists. + # Previously used sorted[n//2] which returns the ~60th percentile + # for n=8 and systematically under-states hotspot_index. + median = statistics.median(p90s_q) if median > 0: idx = max(p90s_q) / median diff --git a/scripts/b3_analyze.sh b/scripts/b3_analyze.sh index ed823f9..a136231 100755 --- a/scripts/b3_analyze.sh +++ b/scripts/b3_analyze.sh @@ -65,9 +65,17 @@ from pathlib import Path sweep = Path("$SWEEP_DIR") def pct(vals, p): + # Linear-interpolated percentile, matches metrics._percentile. + # Previously used floor-indexed sorted[int(p*(n-1))] which is + # inconsistent with how the same percentile is computed elsewhere. if not vals: return None vs = sorted(vals) - return vs[int(p * (len(vs)-1))] + if len(vs) == 1: return vs[0] + rank = p * (len(vs) - 1) + lo = int(rank) + hi = min(lo + 1, len(vs) - 1) + frac = rank - lo + return vs[lo] * (1 - frac) + vs[hi] * frac rows = [] for sub in sorted(sweep.iterdir()): diff --git a/scripts/b3_sweep.sh b/scripts/b3_sweep.sh index 6e02702..f7c8532 100755 --- a/scripts/b3_sweep.sh +++ b/scripts/b3_sweep.sh @@ -161,7 +161,9 @@ for policy in $POLICIES; do run_policy "$policy" "$TRACE" done -# 3) Capped variant on lmetric +# 3) Capped variant: lmetric picker on a per-session turn-capped trace. +# The directory label is "capped" but the proxy must launch with +# --policy lmetric (the proxy's argparse has no "capped" choice). echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..." CAPPED_TRACE="$OUTDIR/capped/trace.jsonl" mkdir -p "$OUTDIR/capped" @@ -169,7 +171,34 @@ mkdir -p "$OUTDIR/capped" --input "$TRACE" \ --output "$CAPPED_TRACE" \ --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log" -run_policy "capped" "$CAPPED_TRACE" + +# Inline equivalent of run_policy "capped" but using --policy lmetric. +echo "[b3_sweep] === policy=capped (picker=lmetric) trace=$(basename "$CAPPED_TRACE") ===" +pkill -9 -f cache_aware_proxy 2>/dev/null || true +sleep 2 +launch_proxy lmetric "$OUTDIR/capped/proxy.log" +t_cap_start=$(date +%s.%N) +PYTHONPATH="$ROOT" "$VENV/python" -m replayer \ + --trace "$CAPPED_TRACE" \ + --output "$OUTDIR/capped/metrics.jsonl" \ + --endpoint "http://127.0.0.1:$PROXY_PORT" \ + --model "$MODEL" \ + 2>&1 | tee "$OUTDIR/capped/replayer.log" | tail -3 +t_cap_end=$(date +%s.%N) +python3 - "$OUTDIR/capped" capped "$CAPPED_TRACE" "$t_cap_start" "$t_cap_end" <<'PY' +import json, sys +rundir, policy, trace, t_start, t_end = sys.argv[1:] +with open(f"{rundir}/run_window.json", "w") as f: + json.dump({ + "policy": policy, "trace": trace, + "t_start_unix": float(t_start), + "t_end_unix": float(t_end), + }, f, indent=2) +PY +curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$OUTDIR/capped/breakdown.json" +curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$OUTDIR/capped/worker_state.json" +curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$OUTDIR/capped/stats.json" +echo "[b3_sweep] capped done: $(wc -l < "$OUTDIR/capped/metrics.jsonl") metric rows" # 4) Snapshot final engine state file sizes for the analyzer ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt" diff --git a/tests/test_joined_analysis.py b/tests/test_joined_analysis.py index a3263f1..3a715c0 100644 --- a/tests/test_joined_analysis.py +++ b/tests/test_joined_analysis.py @@ -131,6 +131,26 @@ def test_hotspot_index_max_over_median_p90(): assert out["hotspot_index_ttft_p90"] > 5.0 +def test_hotspot_index_uses_true_median_for_even_n(): + """8 workers, sorted TTFT p90 [1,2,3,4,5,6,7,80]. + True median = (4+5)/2 = 4.5; hotspot = 80/4.5 ≈ 17.78. + Previous buggy implementation used sorted[4] = 5, giving 80/5 = 16.0. + """ + rows = [] + ttfts = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 80.0] + for i, t in enumerate(ttfts): + for _ in range(10): + rows.append({ + "request_id": f"x{i}", "routed_to": f"http://h:800{i}", + "endpoint_url": f"http://h:800{i}", + "ttft_s": t, "latency_s": 1.0, "error": None, + }) + out = hotspot_index(rows) + assert out["status"] == "supported" + idx = out["hotspot_index_ttft_p90"] + assert abs(idx - 80.0 / 4.5) < 1e-6, f"expected ~17.78, got {idx}" + + def test_label_slow_requests_flags_overlap_and_hot_worker(): metrics = [ _mk_metric("slow_overlap", ttft_s=10.0,