PD-disagg crossover: regular synthetic trace + goodput sweep + figure

gen_synthetic_trace.py --mode regular: maximally-regular multi-turn trace
(fixed prefix/delta/turns, constant arrivals, zero session skew) to isolate
the structural PD cost (per-turn full-context transfer + P/D capacity split)
from the skew/hot-pin artifact.

analysis/crossover/: SLO-goodput PD_advantage sweeps bracketing the
prefill<->decode bottleneck axis (D1 grow input -> prefill-bound; D2 grow
output -> decode-bound). figs/crossover_pd_advantage.png shows the crossover
(y=1) with the agentic operating region annotated.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 18:19:23 +08:00
parent 48ae72467a
commit 847f52f03b
15 changed files with 1776 additions and 2 deletions

View File

@@ -161,11 +161,65 @@ def gen_multiturn(
return rows
def gen_regular(
    *,
    session_qps: float,
    duration_s: float,
    turns: int,
    prefix_len: int,
    delta_len: int,
    output_len: int,
    inter_turn_gap_s: float,
    seed: int,
) -> list[dict]:
    """Build a maximally-regular multi-turn trace: every session identical.

    Sessions start at a constant interval of ``1 / session_qps`` seconds and
    each one issues exactly ``turns`` requests spaced ``inter_turn_gap_s``
    apart.  A session owns one block range for its prefix (allocated once and
    repeated on every turn), and each turn gets its own fresh block range for
    the delta.  The delta is NOT cumulative: every turn's ``hash_ids`` is
    ``prefix_blocks + <that turn's delta blocks>``, and ``input_length`` is
    always ``prefix_len + delta_len``.

    The intent is a trace with zero session-size skew, so the structural PD
    cost (per-turn full-context KV transfer + P/D capacity split) can be
    separated from the skew/hot-pin artifact.

    Args:
        session_qps: session arrival rate (sessions per second); must be > 0.
        duration_s: trace span; ``int(duration_s * session_qps)`` sessions are
            generated, with a minimum of one.
        turns: requests per session; values < 1 yield no rows.
        prefix_len: token length of the per-session reused prefix.
        delta_len: token length of the fresh per-turn suffix.
        output_len: ``output_length`` recorded on every row.
        inter_turn_gap_s: fixed gap between consecutive turns of a session.
        seed: unused here -- nothing in this generator is random.

    Returns:
        Request rows sorted by ``timestamp`` (stable, so rows with equal
        timestamps keep generation order).

    Raises:
        ValueError: if ``session_qps`` is not strictly positive.
    """
    # A zero rate would otherwise slip past the max(1, ...) session-count
    # guard and fail with a bare ZeroDivisionError when computing `start`;
    # a negative rate would silently emit a single session.
    if session_qps <= 0:
        raise ValueError(f"session_qps must be > 0, got {session_qps!r}")

    # Lengths are floored to whole blocks (at least one block each), whereas
    # input_length below reports the exact token sum.
    n_prefix = max(1, prefix_len // BLOCK_SIZE)
    n_delta = max(1, delta_len // BLOCK_SIZE)
    n_sessions = max(1, int(duration_s * session_qps))
    input_length = prefix_len + delta_len  # identical for every row

    rows: list[dict] = []
    next_hash = HASH_BASE
    chat_id = 1
    for s in range(n_sessions):
        start = s / session_qps
        sid = f"r{s}"
        # The session's prefix blocks are allocated once and reused per turn.
        prefix_blocks = list(range(next_hash, next_hash + n_prefix))
        next_hash += n_prefix
        prev = -1  # first turn of a session has no parent
        for k in range(1, turns + 1):
            # Fresh, never-before-seen blocks for this turn's new tokens.
            delta_blocks = list(range(next_hash, next_hash + n_delta))
            next_hash += n_delta
            rows.append({
                "chat_id": chat_id,
                "parent_chat_id": prev,
                "timestamp": round(start + (k - 1) * inter_turn_gap_s, 6),
                "input_length": input_length,
                "output_length": output_len,
                "type": "synthetic_regular",
                "turn": k,
                "hash_ids": prefix_blocks + delta_blocks,
                "session_id": sid,
            })
            prev = chat_id
            chat_id += 1
    # Sessions overlap in time, so interleave their turns chronologically.
    rows.sort(key=lambda r: r["timestamp"])
    return rows
def main() -> None:
p = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument("--out", type=Path, required=True, help="output trace JSONL")
p.add_argument("--mode", choices=["vanilla", "multiturn"], default="vanilla")
p.add_argument("--mode", choices=["vanilla", "multiturn", "regular"], default="vanilla")
p.add_argument("--qps", type=float, help="vanilla: mean Poisson request rate; "
"multiturn: mean Poisson SESSION rate")
p.add_argument("--duration-s", type=float, default=600.0, help="trace span (s)")
@@ -182,7 +236,14 @@ def main() -> None:
p.add_argument("--new-user-tokens", type=int, default=256,
help="multiturn: fresh user tokens added each subsequent turn")
p.add_argument("--inter-turn-gap-s", type=float, default=1.6,
help="multiturn: mean think-time between turns")
help="multiturn: mean think-time; regular: FIXED think-time")
# regular knobs
p.add_argument("--prefix-len", type=int, default=16384,
help="regular: per-session fixed reused prefix length")
p.add_argument("--delta-len", type=int, default=512,
help="regular: fixed fresh new-prefill tokens per turn")
p.add_argument("--turns", type=int, default=8,
help="regular: fixed turns per session")
args = p.parse_args()
if args.mode == "vanilla":
@@ -194,6 +255,20 @@ def main() -> None:
cfg = {"mode": "vanilla", "qps": args.qps, "duration_s": args.duration_s,
"input_len": args.input_len, "output_len": args.output_len,
"seed": args.seed, "reuse": "none"}
elif args.mode == "regular":
assert args.qps, "regular needs --qps (session rate)"
rows = gen_regular(
session_qps=args.qps, duration_s=args.duration_s, turns=args.turns,
prefix_len=args.prefix_len, delta_len=args.delta_len,
output_len=args.output_len, inter_turn_gap_s=args.inter_turn_gap_s,
seed=args.seed,
)
cfg = {"mode": "regular", "session_qps": args.qps,
"duration_s": args.duration_s, "turns": args.turns,
"prefix_len": args.prefix_len, "delta_len": args.delta_len,
"output_len": args.output_len, "inter_turn_gap_s": args.inter_turn_gap_s,
"reuse_ratio": args.prefix_len / (args.prefix_len + args.delta_len),
"seed": args.seed, "reuse": "fixed-intra-session"}
else:
assert args.qps, "multiturn needs --qps (session rate)"
rows = gen_multiturn(