Fix B3 analysis bugs from subagent audit (median + percentile + sweep)
Three fixes from the B3 audit:
1) joined_analysis.hotspot_index used sorted[n//2] as median, which
returns the ~60th percentile for n=8 (even-length). Systematically
under-states the hotspot index. Recomputed values:
lmetric 2.238 -> 2.253 (+0.7%)
load_only 1.140 -> 1.294 (+13.5%)
sticky 2.349 -> 2.728 (+16.1%)
unified 3.350 -> 3.667 (+9.5%)
capped 1.937 -> 2.020 (+4.3%)
Qualitative ranking preserved; "capped only modestly reduces hotspot"
story holds with ~10% drop instead of the previously reported 13%.
Added test_hotspot_index_uses_true_median_for_even_n to lock in the
fix.
2) b3_analyze.sh's pct() helper used floor-indexed percentile
sorted[int(p*(n-1))], inconsistent with metrics._percentile and
joined_analysis._percentile which both use linear interpolation.
Now matches.
3) b3_sweep.sh's capped step called run_policy "capped", but the
proxy's argparse has no "capped" choice, so the hot-sweep variant
would have crashed on this step. The actual capped data was
produced via b3_isolated_policy.sh with --policy lmetric. Replace
the broken inline call with an explicit launch_proxy lmetric +
inline replayer block so the sweep script matches the data path
it documents.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -352,7 +352,10 @@ def hotspot_index(joined: list[JsonDict]) -> JsonDict:
|
||||
p90s_q = sorted(worker_p90_q.values())
|
||||
idx = None
|
||||
if len(p90s_q) >= 2:
|
||||
median = p90s_q[len(p90s_q) // 2]
|
||||
# True median: average of two middle values for even-length lists.
|
||||
# Previously used sorted[n//2] which returns the ~60th percentile
|
||||
# for n=8 and systematically under-states hotspot_index.
|
||||
median = statistics.median(p90s_q)
|
||||
if median > 0:
|
||||
idx = max(p90s_q) / median
|
||||
|
||||
|
||||
@@ -65,9 +65,17 @@ from pathlib import Path
|
||||
|
||||
sweep = Path("$SWEEP_DIR")
|
||||
def pct(vals, p):
|
||||
# Linear-interpolated percentile, matches metrics._percentile.
|
||||
# Previously used floor-indexed sorted[int(p*(n-1))] which is
|
||||
# inconsistent with how the same percentile is computed elsewhere.
|
||||
if not vals: return None
|
||||
vs = sorted(vals)
|
||||
return vs[int(p * (len(vs)-1))]
|
||||
if len(vs) == 1: return vs[0]
|
||||
rank = p * (len(vs) - 1)
|
||||
lo = int(rank)
|
||||
hi = min(lo + 1, len(vs) - 1)
|
||||
frac = rank - lo
|
||||
return vs[lo] * (1 - frac) + vs[hi] * frac
|
||||
|
||||
rows = []
|
||||
for sub in sorted(sweep.iterdir()):
|
||||
|
||||
@@ -161,7 +161,9 @@ for policy in $POLICIES; do
|
||||
run_policy "$policy" "$TRACE"
|
||||
done
|
||||
|
||||
# 3) Capped variant on lmetric
|
||||
# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
|
||||
# The directory label is "capped" but the proxy must launch with
|
||||
# --policy lmetric (the proxy's argparse has no "capped" choice).
|
||||
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
|
||||
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
|
||||
mkdir -p "$OUTDIR/capped"
|
||||
@@ -169,7 +171,34 @@ mkdir -p "$OUTDIR/capped"
|
||||
--input "$TRACE" \
|
||||
--output "$CAPPED_TRACE" \
|
||||
--max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"
|
||||
run_policy "capped" "$CAPPED_TRACE"
|
||||
|
||||
# Inline equivalent of run_policy "capped" but using --policy lmetric.
|
||||
echo "[b3_sweep] === policy=capped (picker=lmetric) trace=$(basename "$CAPPED_TRACE") ==="
|
||||
pkill -9 -f cache_aware_proxy 2>/dev/null || true
|
||||
sleep 2
|
||||
launch_proxy lmetric "$OUTDIR/capped/proxy.log"
|
||||
t_cap_start=$(date +%s.%N)
|
||||
PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
|
||||
--trace "$CAPPED_TRACE" \
|
||||
--output "$OUTDIR/capped/metrics.jsonl" \
|
||||
--endpoint "http://127.0.0.1:$PROXY_PORT" \
|
||||
--model "$MODEL" \
|
||||
2>&1 | tee "$OUTDIR/capped/replayer.log" | tail -3
|
||||
t_cap_end=$(date +%s.%N)
|
||||
python3 - "$OUTDIR/capped" capped "$CAPPED_TRACE" "$t_cap_start" "$t_cap_end" <<'PY'
|
||||
import json, sys
|
||||
rundir, policy, trace, t_start, t_end = sys.argv[1:]
|
||||
with open(f"{rundir}/run_window.json", "w") as f:
|
||||
json.dump({
|
||||
"policy": policy, "trace": trace,
|
||||
"t_start_unix": float(t_start),
|
||||
"t_end_unix": float(t_end),
|
||||
}, f, indent=2)
|
||||
PY
|
||||
curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$OUTDIR/capped/breakdown.json"
|
||||
curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$OUTDIR/capped/worker_state.json"
|
||||
curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$OUTDIR/capped/stats.json"
|
||||
echo "[b3_sweep] capped done: $(wc -l < "$OUTDIR/capped/metrics.jsonl") metric rows"
|
||||
|
||||
# 4) Snapshot final engine state file sizes for the analyzer
|
||||
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"
|
||||
|
||||
@@ -131,6 +131,26 @@ def test_hotspot_index_max_over_median_p90():
|
||||
assert out["hotspot_index_ttft_p90"] > 5.0
|
||||
|
||||
|
||||
def test_hotspot_index_uses_true_median_for_even_n():
|
||||
"""8 workers, sorted TTFT p90 [1,2,3,4,5,6,7,80].
|
||||
True median = (4+5)/2 = 4.5; hotspot = 80/4.5 ≈ 17.78.
|
||||
Previous buggy implementation used sorted[4] = 5, giving 80/5 = 16.0.
|
||||
"""
|
||||
rows = []
|
||||
ttfts = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 80.0]
|
||||
for i, t in enumerate(ttfts):
|
||||
for _ in range(10):
|
||||
rows.append({
|
||||
"request_id": f"x{i}", "routed_to": f"http://h:800{i}",
|
||||
"endpoint_url": f"http://h:800{i}",
|
||||
"ttft_s": t, "latency_s": 1.0, "error": None,
|
||||
})
|
||||
out = hotspot_index(rows)
|
||||
assert out["status"] == "supported"
|
||||
idx = out["hotspot_index_ttft_p90"]
|
||||
assert abs(idx - 80.0 / 4.5) < 1e-6, f"expected ~17.78, got {idx}"
|
||||
|
||||
|
||||
def test_label_slow_requests_flags_overlap_and_hot_worker():
|
||||
metrics = [
|
||||
_mk_metric("slow_overlap", ttft_s=10.0,
|
||||
|
||||
Reference in New Issue
Block a user