agentic-kvc/analysis/workload_chars/compute_chars.py

import json, sys, math, statistics as st
from collections import defaultdict, Counter
import matplotlib; matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

PATH="/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl"
OUT="/tmp/wlc_out"; import os; os.makedirs(OUT, exist_ok=True)
BLOCK=512
# --- transparent cost model for C3 (clearly-labeled estimate; raw-timing validation pending) ---
PREFILL_TOK_S=7000.0     # MB1: 32k->4.5s ~7100 tok/s effective on H20 / 30B-A3B
TPOT_S=0.010             # ~10ms/token decode (crossover unloaded ~5ms, loaded ~25ms)

def pct(v,p):
    if not v: return float('nan')
    s=sorted(v);k=(len(s)-1)*p;f=int(k)
    return s[f] if f+1>=len(s) else s[f]+(s[f+1]-s[f])*(k-f)

# ---------- Pass A: structure (scalars only) ----------
parents={}; recs={}; childcount=Counter()
for line in open(PATH):
    if not line.strip(): continue
    d=json.loads(line); cid=d["chat_id"]; pid=d["parent_chat_id"]
    parents[cid]=pid
    recs[cid]=(float(d["timestamp"]),int(d["input_length"]),int(d["output_length"]),int(d["turn"]))
    if pid!="-1": childcount[pid]+=1
print(f"[A] records={len(recs)}", file=sys.stderr)

root_of={}
def root(cid):
    path=[];c=cid
    while True:
        if c in root_of:r=root_of[c];break
        p=parents.get(c,"-1")
        if p=="-1" or p not in recs:r=c;break
        path.append(c);c=p
    for x in path:root_of[x]=r
    root_of[cid]=r;return r
sessions=defaultdict(list)
for cid in recs: sessions[root(cid)].append(cid)
seq={r:sorted(m,key=lambda c:(recs[c][3],recs[c][0])) for r,m in sessions.items()}
print(f"[A] sessions={len(seq)}", file=sys.stderr)

# ---------- C1: mixture + turn tail + hazard ----------
sr=mr=sm=mm=so=mo=0
turns_per=[]
for r,s in seq.items():
    multi=len(s)>1; turns_per.append(len(s))
    for c in s:
        _,inl,outl,_=recs[c]
        if multi: mr+=1;mm+=inl;mo+=outl
        else: sr+=1;sm+=inl;so+=outl
tot_r=sr+mr; tot_in=sm+mm; tot_out=so+mo
cnt_turn=Counter()
for r,s in seq.items():
    for c in s: cnt_turn[recs[c][3]]+=1
hazard={k: (cnt_turn[k+1]/cnt_turn[k] if cnt_turn[k] else 0) for k in range(1,30)}

# ---------- C2/C3: per-turn resident vs new-prefill (scalar) + hash_ids reuse ----------
by_in=defaultdict(list); by_new=defaultdict(list); by_out=defaultdict(list)
by_reuse_hash=defaultdict(list)  # hash-block prefix stability: reused/parent_blocks
store={}  # cid -> (blockset, in, out) for chats with pending children
tot_new_prefill=0; tot_reused=0
for line in open(PATH):
    if not line.strip(): continue
    d=json.loads(line); cid=d["chat_id"]; pid=d["parent_chat_id"]
    inl=int(d["input_length"]); outl=int(d["output_length"]); turn=int(d["turn"])
    blocks=set(d["hash_ids"])
    if pid in store:
        pblk,pin,pout=store[pid]
        new_prefill=max(0, inl - pin - pout)              # actual recompute (accounts for cached answer)
        reused_blk=len(blocks & pblk)
        by_reuse_hash[turn].append(reused_blk/len(pblk) if pblk else 0)
        childcount[pid]-=1
        if childcount[pid]<=0: del store[pid]
        tot_reused += (inl-new_prefill)
    else:
        new_prefill=inl                                    # session start: all new (intra-session)
    tot_new_prefill+=new_prefill
    by_in[turn].append(inl); by_new[turn].append(new_prefill); by_out[turn].append(outl)
    if childcount[cid]>0: store[cid]=(blocks,inl,outl)
print(f"[B] done; store residual={len(store)}", file=sys.stderr)

TURNS=[t for t in sorted(by_in) if len(by_in[t])>=50]
med_in=[pct(by_in[t],.5) for t in TURNS]
med_new=[max(pct(by_new[t],.5),1) for t in TURNS]
med_out=[pct(by_out[t],.5) for t in TURNS]
ratio=[med_in[i]/med_new[i] for i in range(len(TURNS))]
reuse_pct=[(1-med_new[i]/med_in[i])*100 for i in range(len(TURNS))]
# C3 time per turn (cost model)
t_pref=[med_new[i]/PREFILL_TOK_S for i in range(len(TURNS))]
t_dec=[med_out[i]*TPOT_S for i in range(len(TURNS))]

# aggregate decode/prefill time fraction over a RANGE of constants
def agg_time(prate,tpot):
    tp=tot_new_prefill/prate; td=tot_out*tpot; return td/(tp+td)
frac_lo=agg_time(13000,0.005); frac_mid=agg_time(7000,0.010); frac_hi=agg_time(3000,0.025)

chars={
 "mixture":{"single_sessions":sr if False else sum(1 for s in seq.values() if len(s)==1),
   "multi_sessions":sum(1 for s in seq.values() if len(s)>1),
   "req_single_pct":sr/tot_r*100,"req_multi_pct":mr/tot_r*100,
   "in_single_pct":sm/tot_in*100,"in_multi_pct":mm/tot_in*100,
   "out_single_pct":so/tot_out*100,"out_multi_pct":mo/tot_out*100},
 "turns":{"mean":st.mean(turns_per),"p99":pct(turns_per,.99),"max":max(turns_per),
   "single_turn_pct":sum(1 for x in turns_per if x==1)/len(turns_per)*100},
 "hazard":hazard,
 "token_mass":{"total_input":tot_in,"total_output":tot_out,"out_in_ratio_pct":tot_out/tot_in*100,
   "new_prefill":tot_new_prefill,"reused_prefix":tot_reused,
   "new_prefill_pct_of_input":tot_new_prefill/tot_in*100},
 "decode_time_fraction":{"optimistic_for_prefill":frac_lo,"mid":frac_mid,"pessimistic":frac_hi},
 "per_turn":{"turn":TURNS,"med_resident_input":med_in,"med_new_prefill":med_new,
   "med_output":med_out,"resident_over_new":ratio,"reuse_pct":reuse_pct},
}
json.dump(chars, open(f"{OUT}/chars.json","w"), indent=2)

# ================= FIGURES =================
plt.rcParams.update({"figure.dpi":140,"font.size":10,"axes.grid":True,"grid.alpha":.3})

# ---- C1 ----
fig,ax=plt.subplots(1,3,figsize=(15,4.2))
cats=["% sessions","% requests","% input\ntokens","% output\ntokens"];
singv=[chars["mixture"]["single_sessions"]/len(seq)*100, chars["mixture"]["req_single_pct"],
       chars["mixture"]["in_single_pct"], chars["mixture"]["out_single_pct"]]
multv=[100-x for x in singv]
x=np.arange(len(cats))
ax[0].bar(x,singv,label="single-turn",color="#7fb3d5")
ax[0].bar(x,multv,bottom=singv,label="multi-turn",color="#e74c3c")
for i in range(len(cats)):
    ax[0].text(i,singv[i]/2,f"{singv[i]:.0f}",ha="center",va="center",fontsize=9)
    ax[0].text(i,singv[i]+multv[i]/2,f"{multv[i]:.0f}",ha="center",va="center",color="white",fontsize=9)
ax[0].set_xticks(x);ax[0].set_xticklabels(cats);ax[0].set_ylabel("%");ax[0].set_ylim(0,100)
ax[0].set_title("C1a Mixture: 90% sessions single-turn,\nbut multi-turn carries 2/3 prefill mass");ax[0].legend(loc="center right")
# turn CCDF log-log
tc=sorted(turns_per); n=len(tc); xs=sorted(set(tc))
ccdf=[sum(1 for v in tc if v>=xx)/n for xx in xs]
ax[1].loglog(xs,ccdf,marker=".",ms=3,color="#34495e")
ax[1].set_xlabel("turns per session (k)");ax[1].set_ylabel("P(turns >= k)")
ax[1].set_title(f"C1b Heavy-tailed session length\n(p99={chars['turns']['p99']:.0f}, max={chars['turns']['max']})")
# hazard
hk=list(range(1,20)); hv=[hazard[k]*100 for k in hk]
ax[2].plot(hk,hv,marker="o",color="#16a085")
ax[2].set_xlabel("reached turn k");ax[2].set_ylabel("P(continue to k+1) %");ax[2].set_ylim(0,100)
ax[2].set_title("C1c Continuation hazard rises 10%->94%\n(unpredictable at start, Lindy after)")
fig.tight_layout(); fig.savefig(f"{OUT}/c1_session_mixture.png"); plt.close(fig)

# ---- C2 ----
fig,ax=plt.subplots(1,3,figsize=(15,4.2))
ax[0].semilogy(TURNS,med_in,marker="o",label="resident context (input)",color="#e74c3c")
ax[0].semilogy(TURNS,med_new,marker="s",label="new prefill this turn",color="#2980b9")
ax[0].set_xlabel("turn");ax[0].set_ylabel("tokens (median, log)");ax[0].legend()
ax[0].set_xlim(1,30)
ax[0].set_title("C2a Resident state explodes,\nmarginal work collapses")
ax[1].plot(TURNS,ratio,marker="o",color="#8e44ad")
ax[1].set_xlabel("turn");ax[1].set_ylabel("resident / new-prefill");ax[1].set_xlim(1,30)
ax[1].set_title("C2b The PD tax = resident/delta\n(grows to ~250x by deep turns)")
ax[2].plot(TURNS,reuse_pct,marker="o",color="#27ae60")
ax[2].set_xlabel("turn");ax[2].set_ylabel("per-turn reuse %");ax[2].set_ylim(50,100);ax[2].set_xlim(1,30)
ax[2].set_title("C2c Per-turn reuse climbs to 99.6%\n(deep turns are near-pure cache hits)")
fig.tight_layout(); fig.savefig(f"{OUT}/c2_work_amortization.png"); plt.close(fig)

# ---- C3 ----
fig,ax=plt.subplots(1,2,figsize=(11,4.4))
# token mass decomposition
vals=[tot_reused/1e9, tot_new_prefill/1e9, tot_out/1e9]
labs=[f"reused prefix\n{tot_reused/tot_in*100:.0f}% of input",
      f"new prefill\n{tot_new_prefill/tot_in*100:.0f}% of input",
      f"decode output\n{tot_out/tot_in*100:.1f}% of input"]
ax[0].bar(range(3),vals,color=["#95a5a6","#2980b9","#e67e22"])
ax[0].set_xticks(range(3));ax[0].set_xticklabels(labs,fontsize=8.5)
ax[0].set_ylabel("tokens (billions)")
ax[0].set_title("C3a Token mass: prefill-dominated\n(but tokens != time, see C3b)")
# per-turn prefill vs decode TIME (cost model)
ax[1].semilogy(TURNS,t_pref,marker="o",label="prefill time (new tok / 7k·s⁻¹)",color="#2980b9")
ax[1].semilogy(TURNS,t_dec,marker="s",label="decode time (out·10ms)",color="#e67e22")
ax[1].set_xlabel("turn");ax[1].set_ylabel("seconds (median, log)");ax[1].legend(fontsize=8);ax[1].set_xlim(1,30)
ax[1].set_title(f"C3b Prefill→decode bottleneck flips within a session\n(agg decode-time share ≈ {frac_mid*100:.0f}%, range {frac_lo*100:.0f}–{frac_hi*100:.0f}%)")
fig.tight_layout(); fig.savefig(f"{OUT}/c3_prefill_decode_balance.png"); plt.close(fig)
print("FIGURES + chars.json written to", OUT)
print(json.dumps({k:chars[k] for k in ["mixture","turns","token_mass","decode_time_fraction"]}, indent=2))