Files
agentic-kvc/scripts/b3_analyze.sh
Gahow Wang 0e82612100 Fix B3 analysis bugs from subagent audit (median + percentile + sweep)
Three fixes from the B3 audit:

1) joined_analysis.hotspot_index used sorted[n//2] as median, which
   returns the ~60th percentile for n=8 (even-length). Systematically
   under-states the hotspot index. Recomputed values:
       lmetric   2.238 -> 2.253  (+0.7%)
       load_only 1.140 -> 1.294  (+13.5%)
       sticky    2.349 -> 2.728  (+16.1%)
       unified   3.350 -> 3.667  (+9.5%)
       capped    1.937 -> 2.020  (+4.3%)
   Qualitative ranking preserved; "capped only modestly reduces hotspot"
   story holds with ~10% drop instead of the previously reported 13%.
   Added test_hotspot_index_uses_true_median_for_even_n to lock in the
   fix.

2) b3_analyze.sh's pct() helper used floor-indexed percentile
   sorted[int(p*(n-1))], inconsistent with metrics._percentile and
   joined_analysis._percentile which both use linear interpolation.
   Now matches.

3) b3_sweep.sh's capped step called run_policy "capped", but the
   proxy's argparse has no "capped" choice, so the hot-sweep variant
   would have crashed on this step. The actual capped data was
   produced via b3_isolated_policy.sh with --policy lmetric. Replace
   the broken inline call with an explicit launch_proxy lmetric +
   inline replayer block so the sweep script matches the data path
   it documents.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-26 01:08:37 +08:00

119 lines
4.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# Per-policy joined_analysis driver for a completed B3 sweep.
#
# For each policy directory under <SWEEP_DIR>:
#   - slice engine_state by run_window.json (skipped when the policy
#     directory already contains non-empty engine_state data)
#   - run joined_analysis.py to emit the interference / hotspot / reuse /
#     failure breakdown
# Then emit b3_policy_comparison.json aggregating one row per policy.
# Strict mode: abort on command failure, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root; can be overridden through the ROOT environment variable.
ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
# Directory the python interpreter is launched from.
VENV="${ROOT}/.venv/bin"
# Sweep directory to analyze: required first positional argument.
SWEEP_DIR="${1:?usage: $0 <sweep_dir>}"

# Comma-separated "<worker url>=<engine name>" pairs passed to
# joined_analysis.py via --worker-map: ports 8000..8007 on 127.0.0.1 map to
# engine_0..engine_7.
WORKER_MAP=""
for _engine_idx in 0 1 2 3 4 5 6 7; do
  WORKER_MAP+="${WORKER_MAP:+,}http://127.0.0.1:$((8000 + _engine_idx))=engine_${_engine_idx}"
done
unset _engine_idx
#######################################
# Tell whether a directory holds any non-empty engine_*.jsonl file.
# Arguments: $1 - directory to inspect
# Returns:   0 if at least one engine_*.jsonl in $1 has size > 0;
#            1 if $1 is not a directory or no such file is non-empty
#######################################
_has_engine_data() {
  local state_dir="$1"
  local candidate

  if [[ ! -d "$state_dir" ]]; then
    return 1
  fi

  # When the glob matches nothing it stays literal; -s is false for that
  # non-existent path, so the loop simply falls through.
  for candidate in "$state_dir"/engine_*.jsonl; do
    [[ ! -s "$candidate" ]] || return 0
  done

  return 1
}
# Per-policy pass: make sure every policy directory has engine_state data,
# then run joined_analysis.py on it.
for policy_dir in "$SWEEP_DIR"/*/; do
  policy="$(basename "$policy_dir")"

  # The sweep-level engine_state and logs directories are skipped by name;
  # any other directory without run_window.json is skipped too.
  if [[ "$policy" == "engine_state" || "$policy" == "logs" ]]; then
    continue
  fi
  [[ -f "$policy_dir/run_window.json" ]] || continue

  echo "=== $policy ==="

  # Hot-sweep policies share the sweep-root engine_state and need it sliced
  # by their run window; isolated policies already wrote engine_state into
  # their own directory, so it is used as-is.
  if ! _has_engine_data "$policy_dir/engine_state"; then
    PYTHONPATH="$ROOT" "$VENV/python" \
      "$ROOT/scripts/slice_engine_state.py" \
      --input-dir "$SWEEP_DIR/engine_state" \
      --output-dir "$policy_dir/engine_state" \
      --window "$policy_dir/run_window.json"
  else
    echo " using policy-local engine_state ($(du -sh "$policy_dir/engine_state" | cut -f1))"
  fi

  # Inputs and output location for the joined analysis of this policy.
  joined_args=(
    --metrics "$policy_dir/metrics.jsonl"
    --breakdown "$policy_dir/breakdown.json"
    --worker-state "$policy_dir/worker_state.json"
    --engine-state-dir "$policy_dir/engine_state"
    --worker-map "$WORKER_MAP"
    --out-dir "$policy_dir/joined"
  )
  PYTHONPATH="$ROOT" "$VENV/python" \
    "$ROOT/analysis/characterization/joined_analysis.py" \
    "${joined_args[@]}"
done
# Aggregate per-policy summary.
#
# For every sub-directory of the sweep that has both run_window.json and a
# joined/ directory, reads metrics.jsonl plus the four joined/*.json files
# and builds one summary row. Writes <sweep_dir>/b3_policy_comparison.json
# as {"rows": [...]} and prints the rows to stdout.
#
# The heredoc delimiter is quoted so the shell performs no expansion inside
# the Python source. The sweep directory is passed as argv[1] rather than
# being spliced into a Python string literal, which would break on paths
# containing quotes, backslashes, "$" or backticks.
"$VENV/python" - "$SWEEP_DIR" <<'PY'
import json
import sys
from pathlib import Path

sweep = Path(sys.argv[1])


def pct(vals, p):
    """Return the p-quantile (0 <= p <= 1) of vals, or None if vals is empty."""
    # Linear-interpolated percentile, matches metrics._percentile.
    # Previously used floor-indexed sorted[int(p*(n-1))] which is
    # inconsistent with how the same percentile is computed elsewhere.
    if not vals:
        return None
    vs = sorted(vals)
    if len(vs) == 1:
        return vs[0]
    rank = p * (len(vs) - 1)
    lo = int(rank)
    hi = min(lo + 1, len(vs) - 1)
    frac = rank - lo
    return vs[lo] * (1 - frac) + vs[hi] * frac


def load_jsonl(path):
    """Parse one JSON value per non-blank line of path; the file is closed on return."""
    with path.open() as fh:
        return [json.loads(line) for line in fh if line.strip()]


def load_json(path):
    """Parse the whole file at path as a single JSON document."""
    return json.loads(path.read_text())


rows = []
for sub in sorted(sweep.iterdir()):
    rw = sub / "run_window.json"
    jd = sub / "joined"
    # Only directories that completed both the run and the joined analysis
    # contribute a row.
    if not rw.exists() or not jd.exists():
        continue
    policy = sub.name

    metrics = load_jsonl(sub / "metrics.jsonl")
    # Latency statistics and token totals only cover requests without an error.
    ok = [r for r in metrics if r.get("error") is None]
    ttfts = [r["ttft_s"] for r in ok if r.get("ttft_s") is not None]
    tpots = [r["tpot_s"] for r in ok if r.get("tpot_s") is not None]
    e2es = [r["latency_s"] for r in ok if r.get("latency_s") is not None]
    # "or 0" treats a key that is present but null the same as a missing key.
    total_input = sum(r.get("input_length") or 0 for r in ok)
    total_cached = sum(r.get("cached_tokens") or 0 for r in ok)

    interf = load_json(jd / "interference_index.json")
    hot = load_json(jd / "hotspot_index.json")
    reuse = load_json(jd / "reuse_decomposition.json")
    fail = load_json(jd / "failure_breakdown.json")

    rows.append({
        "policy": policy,
        "n_ok": len(ok), "n_total": len(metrics),
        "ttft_p50_s": pct(ttfts, 0.5), "ttft_p90_s": pct(ttfts, 0.9),
        "ttft_p99_s": pct(ttfts, 0.99),
        "tpot_p50_s": pct(tpots, 0.5), "tpot_p90_s": pct(tpots, 0.9),
        "tpot_p99_s": pct(tpots, 0.99),
        "e2e_p50_s": pct(e2es, 0.5), "e2e_p90_s": pct(e2es, 0.9),
        "e2e_p99_s": pct(e2es, 0.99),
        # max(..., 1) avoids a division by zero when no input tokens were counted.
        "apc_ratio": total_cached / max(total_input, 1),
        "interference_index": interf.get("interference_index"),
        "hotspot_index_ttft_p90": hot.get("hotspot_index_ttft_p90"),
        "reuse_intra_frac": reuse.get("fractions", {}).get("intra"),
        "reuse_cross_frac": reuse.get("fractions", {}).get("cross"),
        "n_slow": fail.get("n_slow"),
        "failure_counts": fail.get("counts"),
    })

out = sweep / "b3_policy_comparison.json"
out.write_text(json.dumps({"rows": rows}, indent=2))
print(json.dumps(rows, indent=2))
PY