Joint/temporal characterizations of the full 051315 cluster trace (2.11M
req / 1.31M sessions / 2h), beyond the existing single-variable marginals:
- C1 mixture: 90.3% sessions single-turn, but multi-turn (9.7%) = 44% reqs /
67% prefill mass; continuation hazard rises 10%->94% (Lindy); heaviness
unpredictable at turn 1 (corr 0.04-0.15) => reactive routing justified.
- C2 resident/delta: resident context 11k->56k while new-prefill 2.7k->~200;
per-turn reuse ->99.6%; resident/delta ("PD tax") ->~250-450x.
- C3 prefill/decode: token mass 98.7% input / 1.3% output, BUT decode ~70% of
TIME (robust 68-71%); "decode negligible" is wrong (tokens != time). Correct
colo argument = roofline complementarity, not "no decode".
Maps each to (1) PD-colocation and (2) routing. compute_chars.py + chars.json
+ figs/workload_chars/. Raw-file exact validation (cached_tokens, real
timings) pending.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
181 lines
9.1 KiB
Python
181 lines
9.1 KiB
Python
import json, sys, math, statistics as st
|
||
from collections import defaultdict, Counter
|
||
import matplotlib; matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
PATH="/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl"
|
||
OUT="/tmp/wlc_out"; import os; os.makedirs(OUT, exist_ok=True)
|
||
BLOCK=512
|
||
# --- transparent cost model for C3 (clearly-labeled estimate; raw-timing validation pending) ---
|
||
PREFILL_TOK_S=7000.0 # MB1: 32k->4.5s ~7100 tok/s effective on H20 / 30B-A3B
|
||
TPOT_S=0.010 # ~10ms/token decode (crossover unloaded ~5ms, loaded ~25ms)
|
||
|
||
def pct(v,p):
|
||
if not v: return float('nan')
|
||
s=sorted(v);k=(len(s)-1)*p;f=int(k)
|
||
return s[f] if f+1>=len(s) else s[f]+(s[f+1]-s[f])*(k-f)
|
||
|
||
# ---------- Pass A: structure (scalars only) ----------
|
||
parents={}; recs={}; childcount=Counter()
|
||
for line in open(PATH):
|
||
if not line.strip(): continue
|
||
d=json.loads(line); cid=d["chat_id"]; pid=d["parent_chat_id"]
|
||
parents[cid]=pid
|
||
recs[cid]=(float(d["timestamp"]),int(d["input_length"]),int(d["output_length"]),int(d["turn"]))
|
||
if pid!="-1": childcount[pid]+=1
|
||
print(f"[A] records={len(recs)}", file=sys.stderr)
|
||
|
||
root_of={}
|
||
def root(cid):
|
||
path=[];c=cid
|
||
while True:
|
||
if c in root_of:r=root_of[c];break
|
||
p=parents.get(c,"-1")
|
||
if p=="-1" or p not in recs:r=c;break
|
||
path.append(c);c=p
|
||
for x in path:root_of[x]=r
|
||
root_of[cid]=r;return r
|
||
sessions=defaultdict(list)
|
||
for cid in recs: sessions[root(cid)].append(cid)
|
||
seq={r:sorted(m,key=lambda c:(recs[c][3],recs[c][0])) for r,m in sessions.items()}
|
||
print(f"[A] sessions={len(seq)}", file=sys.stderr)
|
||
|
||
# ---------- C1: mixture + turn tail + hazard ----------
|
||
sr=mr=sm=mm=so=mo=0
|
||
turns_per=[]
|
||
for r,s in seq.items():
|
||
multi=len(s)>1; turns_per.append(len(s))
|
||
for c in s:
|
||
_,inl,outl,_=recs[c]
|
||
if multi: mr+=1;mm+=inl;mo+=outl
|
||
else: sr+=1;sm+=inl;so+=outl
|
||
tot_r=sr+mr; tot_in=sm+mm; tot_out=so+mo
|
||
cnt_turn=Counter()
|
||
for r,s in seq.items():
|
||
for c in s: cnt_turn[recs[c][3]]+=1
|
||
hazard={k: (cnt_turn[k+1]/cnt_turn[k] if cnt_turn[k] else 0) for k in range(1,30)}
|
||
|
||
# ---------- C2/C3: per-turn resident vs new-prefill (scalar) + hash_ids reuse ----------
|
||
by_in=defaultdict(list); by_new=defaultdict(list); by_out=defaultdict(list)
|
||
by_reuse_hash=defaultdict(list) # hash-block prefix stability: reused/parent_blocks
|
||
store={} # cid -> (blockset, in, out) for chats with pending children
|
||
tot_new_prefill=0; tot_reused=0
|
||
for line in open(PATH):
|
||
if not line.strip(): continue
|
||
d=json.loads(line); cid=d["chat_id"]; pid=d["parent_chat_id"]
|
||
inl=int(d["input_length"]); outl=int(d["output_length"]); turn=int(d["turn"])
|
||
blocks=set(d["hash_ids"])
|
||
if pid in store:
|
||
pblk,pin,pout=store[pid]
|
||
new_prefill=max(0, inl - pin - pout) # actual recompute (accounts for cached answer)
|
||
reused_blk=len(blocks & pblk)
|
||
by_reuse_hash[turn].append(reused_blk/len(pblk) if pblk else 0)
|
||
childcount[pid]-=1
|
||
if childcount[pid]<=0: del store[pid]
|
||
tot_reused += (inl-new_prefill)
|
||
else:
|
||
new_prefill=inl # session start: all new (intra-session)
|
||
tot_new_prefill+=new_prefill
|
||
by_in[turn].append(inl); by_new[turn].append(new_prefill); by_out[turn].append(outl)
|
||
if childcount[cid]>0: store[cid]=(blocks,inl,outl)
|
||
print(f"[B] done; store residual={len(store)}", file=sys.stderr)
|
||
|
||
TURNS=[t for t in sorted(by_in) if len(by_in[t])>=50]
|
||
med_in=[pct(by_in[t],.5) for t in TURNS]
|
||
med_new=[max(pct(by_new[t],.5),1) for t in TURNS]
|
||
med_out=[pct(by_out[t],.5) for t in TURNS]
|
||
ratio=[med_in[i]/med_new[i] for i in range(len(TURNS))]
|
||
reuse_pct=[(1-med_new[i]/med_in[i])*100 for i in range(len(TURNS))]
|
||
# C3 time per turn (cost model)
|
||
t_pref=[med_new[i]/PREFILL_TOK_S for i in range(len(TURNS))]
|
||
t_dec=[med_out[i]*TPOT_S for i in range(len(TURNS))]
|
||
|
||
# aggregate decode/prefill time fraction over a RANGE of constants
|
||
def agg_time(prate,tpot):
|
||
tp=tot_new_prefill/prate; td=tot_out*tpot; return td/(tp+td)
|
||
frac_lo=agg_time(13000,0.005); frac_mid=agg_time(7000,0.010); frac_hi=agg_time(3000,0.025)
|
||
|
||
chars={
|
||
"mixture":{"single_sessions":sr if False else sum(1 for s in seq.values() if len(s)==1),
|
||
"multi_sessions":sum(1 for s in seq.values() if len(s)>1),
|
||
"req_single_pct":sr/tot_r*100,"req_multi_pct":mr/tot_r*100,
|
||
"in_single_pct":sm/tot_in*100,"in_multi_pct":mm/tot_in*100,
|
||
"out_single_pct":so/tot_out*100,"out_multi_pct":mo/tot_out*100},
|
||
"turns":{"mean":st.mean(turns_per),"p99":pct(turns_per,.99),"max":max(turns_per),
|
||
"single_turn_pct":sum(1 for x in turns_per if x==1)/len(turns_per)*100},
|
||
"hazard":hazard,
|
||
"token_mass":{"total_input":tot_in,"total_output":tot_out,"out_in_ratio_pct":tot_out/tot_in*100,
|
||
"new_prefill":tot_new_prefill,"reused_prefix":tot_reused,
|
||
"new_prefill_pct_of_input":tot_new_prefill/tot_in*100},
|
||
"decode_time_fraction":{"optimistic_for_prefill":frac_lo,"mid":frac_mid,"pessimistic":frac_hi},
|
||
"per_turn":{"turn":TURNS,"med_resident_input":med_in,"med_new_prefill":med_new,
|
||
"med_output":med_out,"resident_over_new":ratio,"reuse_pct":reuse_pct},
|
||
}
|
||
json.dump(chars, open(f"{OUT}/chars.json","w"), indent=2)
|
||
|
||
# ================= FIGURES =================
|
||
plt.rcParams.update({"figure.dpi":140,"font.size":10,"axes.grid":True,"grid.alpha":.3})
|
||
|
||
# ---- C1 ----
|
||
fig,ax=plt.subplots(1,3,figsize=(15,4.2))
|
||
cats=["% sessions","% requests","% input\ntokens","% output\ntokens"];
|
||
singv=[chars["mixture"]["single_sessions"]/len(seq)*100, chars["mixture"]["req_single_pct"],
|
||
chars["mixture"]["in_single_pct"], chars["mixture"]["out_single_pct"]]
|
||
multv=[100-x for x in singv]
|
||
x=np.arange(len(cats))
|
||
ax[0].bar(x,singv,label="single-turn",color="#7fb3d5")
|
||
ax[0].bar(x,multv,bottom=singv,label="multi-turn",color="#e74c3c")
|
||
for i in range(len(cats)):
|
||
ax[0].text(i,singv[i]/2,f"{singv[i]:.0f}",ha="center",va="center",fontsize=9)
|
||
ax[0].text(i,singv[i]+multv[i]/2,f"{multv[i]:.0f}",ha="center",va="center",color="white",fontsize=9)
|
||
ax[0].set_xticks(x);ax[0].set_xticklabels(cats);ax[0].set_ylabel("%");ax[0].set_ylim(0,100)
|
||
ax[0].set_title("C1a Mixture: 90% sessions single-turn,\nbut multi-turn carries 2/3 prefill mass");ax[0].legend(loc="center right")
|
||
# turn CCDF log-log
|
||
tc=sorted(turns_per); n=len(tc); xs=sorted(set(tc))
|
||
ccdf=[sum(1 for v in tc if v>=xx)/n for xx in xs]
|
||
ax[1].loglog(xs,ccdf,marker=".",ms=3,color="#34495e")
|
||
ax[1].set_xlabel("turns per session (k)");ax[1].set_ylabel("P(turns >= k)")
|
||
ax[1].set_title(f"C1b Heavy-tailed session length\n(p99={chars['turns']['p99']:.0f}, max={chars['turns']['max']})")
|
||
# hazard
|
||
hk=list(range(1,20)); hv=[hazard[k]*100 for k in hk]
|
||
ax[2].plot(hk,hv,marker="o",color="#16a085")
|
||
ax[2].set_xlabel("reached turn k");ax[2].set_ylabel("P(continue to k+1) %");ax[2].set_ylim(0,100)
|
||
ax[2].set_title("C1c Continuation hazard rises 10%->94%\n(unpredictable at start, Lindy after)")
|
||
fig.tight_layout(); fig.savefig(f"{OUT}/c1_session_mixture.png"); plt.close(fig)
|
||
|
||
# ---- C2 ----
|
||
fig,ax=plt.subplots(1,3,figsize=(15,4.2))
|
||
ax[0].semilogy(TURNS,med_in,marker="o",label="resident context (input)",color="#e74c3c")
|
||
ax[0].semilogy(TURNS,med_new,marker="s",label="new prefill this turn",color="#2980b9")
|
||
ax[0].set_xlabel("turn");ax[0].set_ylabel("tokens (median, log)");ax[0].legend()
|
||
ax[0].set_xlim(1,30)
|
||
ax[0].set_title("C2a Resident state explodes,\nmarginal work collapses")
|
||
ax[1].plot(TURNS,ratio,marker="o",color="#8e44ad")
|
||
ax[1].set_xlabel("turn");ax[1].set_ylabel("resident / new-prefill");ax[1].set_xlim(1,30)
|
||
ax[1].set_title("C2b The PD tax = resident/delta\n(grows to ~250x by deep turns)")
|
||
ax[2].plot(TURNS,reuse_pct,marker="o",color="#27ae60")
|
||
ax[2].set_xlabel("turn");ax[2].set_ylabel("per-turn reuse %");ax[2].set_ylim(50,100);ax[2].set_xlim(1,30)
|
||
ax[2].set_title("C2c Per-turn reuse climbs to 99.6%\n(deep turns are near-pure cache hits)")
|
||
fig.tight_layout(); fig.savefig(f"{OUT}/c2_work_amortization.png"); plt.close(fig)
|
||
|
||
# ---- C3 ----
|
||
fig,ax=plt.subplots(1,2,figsize=(11,4.4))
|
||
# token mass decomposition
|
||
vals=[tot_reused/1e9, tot_new_prefill/1e9, tot_out/1e9]
|
||
labs=[f"reused prefix\n{tot_reused/tot_in*100:.0f}% of input",
|
||
f"new prefill\n{tot_new_prefill/tot_in*100:.0f}% of input",
|
||
f"decode output\n{tot_out/tot_in*100:.1f}% of input"]
|
||
ax[0].bar(range(3),vals,color=["#95a5a6","#2980b9","#e67e22"])
|
||
ax[0].set_xticks(range(3));ax[0].set_xticklabels(labs,fontsize=8.5)
|
||
ax[0].set_ylabel("tokens (billions)")
|
||
ax[0].set_title("C3a Token mass: prefill-dominated\n(but tokens != time, see C3b)")
|
||
# per-turn prefill vs decode TIME (cost model)
|
||
ax[1].semilogy(TURNS,t_pref,marker="o",label="prefill time (new tok / 7k·s⁻¹)",color="#2980b9")
|
||
ax[1].semilogy(TURNS,t_dec,marker="s",label="decode time (out·10ms)",color="#e67e22")
|
||
ax[1].set_xlabel("turn");ax[1].set_ylabel("seconds (median, log)");ax[1].legend(fontsize=8);ax[1].set_xlim(1,30)
|
||
ax[1].set_title(f"C3b Prefill→decode bottleneck flips within a session\n(agg decode-time share ≈ {frac_mid*100:.0f}%, range {frac_lo*100:.0f}–{frac_hi*100:.0f}%)")
|
||
fig.tight_layout(); fig.savefig(f"{OUT}/c3_prefill_decode_balance.png"); plt.close(fig)
|
||
print("FIGURES + chars.json written to", OUT)
|
||
print(json.dumps({k:chars[k] for k in ["mixture","turns","token_mass","decode_time_fraction"]}, indent=2))
|