# agentic-kvc/scripts/compute_roofline.py

"""Roofline analysis: compute/memory ratio for prefill vs decode
under different sequence lengths and KV cache reuse ratios.

Model: Qwen3-Coder-30B-A3B (MoE)
  - 48 layers, hidden=2048, heads=32, kv_heads=4, head_dim=128
  - MoE: 128 experts, top-8 active, intermediate=6144
  - Total params: ~30B, Active params per token: ~3B

GPU: NVIDIA H20
  - BF16 peak: 148 TFLOPS
  - HBM bandwidth: 4.0 TB/s
  - Roofline ridge point: 148/4.0 = 37 FLOP/byte
"""

import argparse
import json
import math

# ===== Model config =====
# Architecture constants for the model named in the module docstring.
# The cost functions below read these as globals; per-layer costs are
# multiplied by L.
L = 48            # transformer layers
D = 2048          # hidden dim
H = 32            # attention (query) heads
H_kv = 4          # KV heads (GQA)
D_head = 128      # per-head dim; note H * D_head = 4096, which differs from D
D_ffn = 6144      # FFN intermediate (per expert)
# NOTE(review): with these values the expert weights alone amount to
# N_experts * 3 * D * D_ffn * L ≈ 232B parameters (≈ 14.5B active), which
# does not match the "~30B total / ~3B active" stated in the module
# docstring — confirm that D_ffn really is the per-expert intermediate size.
N_experts = 128   # total experts
K_experts = 8     # active experts per token
VOCAB = 151936    # vocabulary size
BYTES = 2         # bytes per element (BF16)

# ===== GPU config (H20) =====
PEAK_FLOPS = 148e12   # BF16 peak in FLOP/s (148 TFLOPS)
HBM_BW = 4.0e12       # HBM bandwidth in bytes/s
# Arithmetic intensity at which the compute and memory roofs intersect.
RIDGE_POINT = PEAK_FLOPS / HBM_BW  # ~37 FLOP/byte

# Report banner: names the model/GPU pair and the ridge point that every
# later COMPUTE/MEMORY classification is measured against.
for _banner_line in (
    "=" * 80,
    "  ROOFLINE ANALYSIS: Prefill vs Decode under KV Cache Reuse",
    "  Model: Qwen3-Coder-30B-A3B (MoE 128E top-8) | GPU: H20",
    "=" * 80,
    f"  Ridge point: {RIDGE_POINT:.1f} FLOP/byte",
    "  Above ridge → compute-bound | Below ridge → memory-bound",
):
    print(_banner_line)

# ===== Per-token compute & memory for each component =====

def attention_prefill_flops(seq_len, new_tokens):
    """Return attention FLOPs, summed over all layers, for a prefill step.

    Args:
        seq_len: Context length (tokens) that each new token attends over.
        new_tokens: Number of tokens whose Q/K/V are computed in this step.

    Returns:
        Total FLOPs across the ``L`` layers, counting one multiply-add as
        2 FLOPs.

    Simplifications: every new token is charged attention over the full
    ``seq_len`` (no causal-mask discount), and element-wise work such as
    softmax is not counted.
    """
    # Query width is H * D_head, which is NOT equal to D for this config
    # (32 * 128 = 4096 vs. 2048); K and V each have width H_kv * D_head.
    d_q = H * D_head
    d_kv = H_kv * D_head
    # Q, K and V projections: D -> d_q, D -> d_kv, D -> d_kv.
    qkv_flops = new_tokens * D * (d_q + 2 * d_kv) * 2
    # Q@K^T and probs@V: every query head does a D_head-wide product per
    # context position, so both matmuls scale with d_q.
    attn_flops = new_tokens * seq_len * d_q * 2 * 2
    # Output projection: d_q -> D.
    out_flops = new_tokens * d_q * D * 2
    return (qkv_flops + attn_flops + out_flops) * L

def attention_prefill_bytes(seq_len, new_tokens, cached_tokens):
    """Return bytes moved by attention, summed over all layers, for a prefill.

    Args:
        seq_len: Total context length. Not used by the byte model; kept so
            the signature parallels ``attention_prefill_flops``.
        new_tokens: Tokens processed in this step (activations read/written
            and fresh K/V appended to the cache).
        cached_tokens: Tokens whose K/V are read back from the cache.

    Returns:
        Total bytes across the ``L`` layers at ``BYTES`` bytes per element.
    """
    # Query width is H * D_head, which differs from D for this config, so the
    # Q and O projection matrices are D x d_q rather than D x D.
    d_q = H * D_head
    d_kv = H_kv * D_head
    # Projection weights are read once per layer: Q (D x d_q), K and V
    # (D x d_kv each) and O (d_q x D).
    weight_bytes = D * (d_q + 2 * d_kv + d_q) * BYTES * L
    # K and V of the reused prefix are loaded from the cache.
    cached_kv_bytes = cached_tokens * 2 * d_kv * BYTES * L
    # Input activations read plus output activations written.
    act_bytes = new_tokens * D * BYTES * 2 * L
    # K and V of the new tokens are written to the cache.
    new_kv_bytes = new_tokens * 2 * d_kv * BYTES * L
    return weight_bytes + cached_kv_bytes + act_bytes + new_kv_bytes

def ffn_flops(n_tokens):
    """Return MoE FFN FLOPs for ``n_tokens`` tokens, summed over all layers.

    Each token runs through ``K_experts`` experts, and each expert applies
    three D x D_ffn matmuls (gate, up, down) at 2 FLOPs per multiply-add.
    """
    matmuls_per_expert = 3  # gate + up + down
    flops_per_matmul = n_tokens * D * D_ffn * 2
    flops_per_layer = matmuls_per_expert * flops_per_matmul * K_experts
    return flops_per_layer * L

def ffn_bytes(n_tokens):
    """Return bytes moved by the MoE FFN for ``n_tokens``, over all layers.

    Args:
        n_tokens: Number of tokens pushed through the FFN in this step.

    Returns:
        Expert weight bytes plus activation bytes across the ``L`` layers.

    The number of experts whose weights must be read grows with the batch:
    each token selects ``K_experts`` experts, so ``n_tokens`` tokens can touch
    up to ``n_tokens * K_experts`` distinct experts, capped at ``N_experts``.
    This is an upper bound (it assumes no two tokens share an expert until
    all experts are in use). For a single token it equals ``K_experts``.
    """
    experts_loaded = min(N_experts, n_tokens * K_experts)
    # Three D x D_ffn matrices (gate, up, down) per loaded expert per layer.
    weight_bytes = experts_loaded * 3 * D * D_ffn * BYTES * L
    # Input activations read plus output activations written.
    act_bytes = n_tokens * D * BYTES * 2 * L
    return weight_bytes + act_bytes

def decode_flops(seq_len):
    """Return total FLOPs to decode one token against ``seq_len`` context.

    Decode is modelled as a prefill of exactly one new token: attention over
    ``seq_len`` positions plus one token through the MoE FFN.
    """
    attn_part = attention_prefill_flops(seq_len, 1)
    ffn_part = ffn_flops(1)
    return attn_part + ffn_part

def decode_bytes(seq_len):
    """Return total bytes moved to decode one token against ``seq_len`` context.

    Modelled as a one-token prefill in which all ``seq_len`` context tokens
    are served from the KV cache, plus one token through the MoE FFN.
    """
    attn_part = attention_prefill_bytes(seq_len, 1, seq_len)
    ffn_part = ffn_bytes(1)
    return attn_part + ffn_part

# ===== Analysis =====

# Part 1 — decode baseline: arithmetic intensity (FLOP per byte) of a single
# generated token at increasing context lengths, classified against the
# ridge point.
print("\n" + "-" * 80)
print("  PART 1: Decode Roofline (baseline)")
print("-" * 80)
print(f"  {'SeqLen':>8} {'FLOP':>14} {'Bytes':>14} {'AI (F/B)':>10} {'Bound':>12}")

for seq_len in (1000, 4000, 8000, 16000, 32000, 64000, 128000):
    flops, bytes_ = decode_flops(seq_len), decode_bytes(seq_len)
    ai = flops / bytes_
    # Strictly above the ridge counts as compute-bound; equality is memory.
    if ai > RIDGE_POINT:
        bound = "COMPUTE"
    else:
        bound = "MEMORY"
    print(f"  {seq_len:>8,} {flops:>14.2e} {bytes_:>14.2e} {ai:>10.1f} {bound:>12}")

# Part 2 — prefill under prefix-cache reuse: for each total input length,
# split the input into a cached prefix and a freshly computed suffix and
# report the resulting arithmetic intensity next to the decode baseline.
print("\n" + "-" * 80)
print("  PART 2: Prefill with KV Cache Reuse")
print("  (Total input = seq_len, cached = seq_len * reuse_ratio, new = rest)")
print("-" * 80)
print(f"  {'SeqLen':>8} {'Reuse%':>7} {'NewTok':>8} {'FLOP':>14} {'Bytes':>14} {'AI (F/B)':>10} {'Bound':>12} {'vs Decode':>10}")

for seq_len in [4000, 16000, 32000, 64000, 128000]:
    # The decode baseline depends only on seq_len, so compute it once per
    # sequence length rather than once per reuse ratio.
    dec_f = decode_flops(seq_len)
    dec_b = decode_bytes(seq_len)
    dec_ai = dec_f / dec_b

    for reuse in [0.0, 0.3, 0.5, 0.7, 0.9, 0.95]:
        cached = int(seq_len * reuse)
        new = seq_len - cached

        # Attention: compute on new tokens, but read cached KV for context
        attn_f = attention_prefill_flops(seq_len, new)
        attn_b = attention_prefill_bytes(seq_len, new, cached)

        # FFN: only on new tokens
        ffn_f = ffn_flops(new)
        ffn_b = ffn_bytes(new)

        total_f = attn_f + ffn_f
        total_b = attn_b + ffn_b
        ai = total_f / total_b if total_b > 0 else 0

        bound = "COMPUTE" if ai > RIDGE_POINT else "MEMORY"
        # Ratio of prefill AI to decode AI at the same sequence length.
        ratio = f"{ai/dec_ai:.1f}x" if dec_ai > 0 else "N/A"

        print(f"  {seq_len:>8,} {reuse*100:>6.0f}% {new:>8,} {total_f:>14.2e} {total_b:>14.2e} {ai:>10.1f} {bound:>12} {ratio:>10}")
    print()

# Part 3 — for each sequence length, find the smallest whole-percent reuse
# ratio at which prefill stops being compute-bound.
print("-" * 80)
print("  PART 3: Key Thresholds")
print("-" * 80)

# At what reuse ratio does prefill become memory-bound?
for seq_len in [4000, 16000, 32000, 64000, 128000]:
    for reuse_pct in range(0, 100):
        # Integer arithmetic keeps the cached-token count exact; going
        # through a float ratio and int() can truncate one token short.
        cached = seq_len * reuse_pct // 100
        new = seq_len - cached
        if new < 1:
            continue
        attn_f = attention_prefill_flops(seq_len, new)
        attn_b = attention_prefill_bytes(seq_len, new, cached)
        ffn_f = ffn_flops(new)
        ffn_b = ffn_bytes(new)
        ai = (attn_f + ffn_f) / (attn_b + ffn_b)
        # "Memory-bound" is the complement of the `ai > RIDGE_POINT` test the
        # other parts use for "COMPUTE", so equality counts as memory-bound.
        if ai <= RIDGE_POINT:
            print(f"  SeqLen={seq_len:>6,}: prefill becomes memory-bound at {reuse_pct}% reuse (AI={ai:.1f})")
            break
    else:
        # No threshold found in 0..99%: say so instead of printing nothing.
        print(f"  SeqLen={seq_len:>6,}: prefill stays compute-bound up to 99% reuse")

# Part 4 — replay a request trace, simulate a prefix cache over its block
# hashes, and count how many prefills land on each side of the ridge point.
print()
print("-" * 80)
print("  PART 4: Agentic Workload Real Distribution")
print("-" * 80)

# Use actual trace data
_parser = argparse.ArgumentParser(description=__doc__)
_parser.add_argument("--trace", type=str,
                     default="traces/w600_r0.0015_st30.jsonl",
                     help="Sampled trace JSONL for empirical workload roofline (Part 4)")
# parse_known_args: unrelated command-line arguments are ignored, not fatal.
_args, _ = _parser.parse_known_args()
trace_path = _args.trace
try:
    _trace_fh = open(trace_path, encoding="utf-8")
except FileNotFoundError:
    # A missing trace is not an error: Parts 1-3 have already been printed.
    print(f"  (skipped: trace file not found: {trace_path})")
    _trace_fh = None
if _trace_fh is not None:
    BLOCK_SIZE = 512  # tokens represented by one entry of "hash_ids"
    seen = set()      # block hashes observed in earlier requests
    compute_bound = 0
    memory_bound = 0
    total = 0

    # The context manager closes the file even if a record fails to parse.
    with _trace_fh:
        for line in _trace_fh:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            d = json.loads(line)
            seq_len = d["input_length"]
            if seq_len < 1:
                continue
            hids = d.get("hash_ids", [])

            # Prefix-cache hit length: leading blocks that were already seen.
            cached_blocks = 0
            for hid in hids:
                if hid not in seen:
                    break
                cached_blocks += 1
            seen.update(hids)

            # At least one token is always computed, and the cached prefix
            # can never exceed the request (the last block may be partial),
            # so clamp to keep cached + new == seq_len.
            cached = min(cached_blocks * BLOCK_SIZE, seq_len - 1)
            new = seq_len - cached

            attn_f = attention_prefill_flops(seq_len, new)
            attn_b = attention_prefill_bytes(seq_len, new, cached)
            ffn_f = ffn_flops(new)
            ffn_b = ffn_bytes(new)
            ai = (attn_f + ffn_f) / (attn_b + ffn_b)

            total += 1
            if ai > RIDGE_POINT:
                compute_bound += 1
            else:
                memory_bound += 1

    if total > 0:
        print(f"  With actual trace prefix cache pattern:")
        print(f"    Compute-bound prefills: {compute_bound} ({compute_bound*100//total}%)")
        print(f"    Memory-bound prefills:  {memory_bound} ({memory_bound*100//total}%)")
        print(f"    (Decode is ALWAYS memory-bound at these seq lengths)")
        print()
        print(f"  Implication: {memory_bound*100//total}% of agentic prefills behave like decode")
        print(f"  → PD separation treats them as 'compute-heavy' but they are actually memory-heavy")