feat: add agentic pd hybrid benchmark prototype

2026-04-24 12:17:46 +00:00
parent d2fe014db7
commit 4bca741f32
16 changed files with 9182 additions and 0 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
 [project]
 name = "agentic-pd-hybrid"
 version = "0.1.0"
 description = "Prototype for session-aware and KV-cache-aware PD routing on SGLang xPyD"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "httpx>=0.28.1",
    "mooncake-transfer-engine",
    "sglang==0.5.10",
 ]
 [project.scripts]
 agentic-pd-hybrid = "agentic_pd_hybrid.cli:main"
 [build-system]
 requires = ["setuptools>=68"]
 build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["src"]
 [tool.uv]
 prerelease = "allow"
--- a/src/agentic_pd_hybrid/init.py
+++ b/src/agentic_pd_hybrid/init.py
@@ -0,0 +1,12 @@
 """Agentic PD hybrid prototype."""
 __all__ = [
    "cli",
    "launcher",
    "metrics",
    "microbench",
    "policies",
    "replay",
    "topology",
    "trace",
 ]
--- a/src/agentic_pd_hybrid/benchmark.py
+++ b/src/agentic_pd_hybrid/benchmark.py
@@ -0,0 +1,221 @@
 from __future__ import annotations
 import asyncio
 import json
 import signal
 from dataclasses import asdict, dataclass, replace
 from datetime import UTC, datetime
 from pathlib import Path
 from agentic_pd_hybrid.replay import ReplayConfig, replay_trace
 from agentic_pd_hybrid.sampling import SessionSampleConfig, sample_trace_sessions
 from agentic_pd_hybrid.stack import ManagedPdStack, launch_pd_stack
 from agentic_pd_hybrid.topology import SingleNodeTopology
@dataclass(frozen=True)
 class BenchmarkConfig:
    trace_path: Path
    output_root: Path
    topology: SingleNodeTopology
    policy_name: str
    mechanism_name: str = "pd-disaggregation"
    target_duration_s: float = 600.0
    start_time_s: float = 0.0
    session_sample_rate: float = 0.01
    min_turns: int = 1
    time_scale: float = 1.0
    concurrency_limit: int = 32
    timeout_s: float = 1200.0
    stream: bool = True
    stream_idle_timeout_s: float | None = 900.0
    kvcache_direct_max_uncached_tokens: int = 2048
    kvcache_admission_mode: str = "router"
    sample_profile: str = "default"
    min_initial_input_tokens: int | None = None
    max_initial_input_tokens: int | None = None
    max_append_input_tokens: int | None = None
    max_output_tokens: int | None = None
    min_overlap_ratio: float | None = None
    launch_stack: bool = True
@dataclass(frozen=True)
 class BenchmarkArtifacts:
    run_dir: Path
    sampled_trace_path: Path
    metrics_path: Path
    summary_path: Path
    benchmark_config_path: Path
 def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
    run_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    run_label = f"{config.mechanism_name}-{config.policy_name}"
    if config.mechanism_name == "kvcache-centric":
        run_label = f"{run_label}-{config.kvcache_admission_mode}-admission"
    run_dir = config.output_root / f"{run_label}-{run_id}"
    run_dir.mkdir(parents=True, exist_ok=True)
    topology = config.topology
    if config.mechanism_name == "kvcache-centric":
        topology = replace(
            topology,
            prefill_extra_server_args=topology.prefill_extra_server_args
            + ("--enable-streaming-session",),
            decode_extra_server_args=topology.decode_extra_server_args
            + (
                "--enable-streaming-session",
                "--disaggregation-decode-allow-local-prefill",
            ),
        )
    sampled_trace_path = run_dir / "sampled-trace.jsonl"
    sample_summary = sample_trace_sessions(
        SessionSampleConfig(
            trace_path=config.trace_path,
            output_path=sampled_trace_path,
            target_duration_s=config.target_duration_s,
            start_time_s=config.start_time_s,
            session_sample_rate=config.session_sample_rate,
            min_turns=config.min_turns,
            profile=config.sample_profile,  # type: ignore[arg-type]
            min_initial_input_tokens=config.min_initial_input_tokens,
            max_initial_input_tokens=config.max_initial_input_tokens,
            max_append_input_tokens=config.max_append_input_tokens,
            max_output_tokens=config.max_output_tokens,
            min_overlap_ratio=config.min_overlap_ratio,
        )
    )
    stack: ManagedPdStack | None = None
    previous_sigint = signal.getsignal(signal.SIGINT)
    previous_sigterm = signal.getsignal(signal.SIGTERM)
    def _handle_termination(signum, _frame) -> None:
        if stack is not None:
            stack.stop()
        raise SystemExit(128 + signum)
    try:
        signal.signal(signal.SIGINT, _handle_termination)
        signal.signal(signal.SIGTERM, _handle_termination)
        if config.launch_stack:
            stack = launch_pd_stack(
                topology=topology,
                run_dir=run_dir,
                prefill_policy="round_robin",
                decode_policy=_decode_policy_for(config.policy_name),
                timeout_s=config.timeout_s,
                include_router=(
                    config.mechanism_name in {"pd-disaggregation", "kvcache-centric"}
                ),
            )
            router_url = (
                stack.router_url
                if config.mechanism_name in {"pd-disaggregation", "kvcache-centric"}
                else None
            )
        else:
            router_url = (
                topology.router_url
                if config.mechanism_name in {"pd-disaggregation", "kvcache-centric"}
                else None
            )
        metrics_path = run_dir / "request-metrics.jsonl"
        replay_config = ReplayConfig(
            trace_path=sampled_trace_path,
            output_path=metrics_path,
            policy_name=config.policy_name,
            mechanism_name=config.mechanism_name,
            topology=topology,
            router_url=router_url,
            model_name=topology.model_name,
            pace=True,
            time_scale=config.time_scale,
            request_limit=None,
            concurrency_limit=config.concurrency_limit,
            header_mode=_header_mode_for(config.policy_name),
            timeout_s=config.timeout_s,
            stream=config.stream,
            stream_idle_timeout_s=config.stream_idle_timeout_s,
            kvcache_direct_max_uncached_tokens=config.kvcache_direct_max_uncached_tokens,
            kvcache_admission_mode=config.kvcache_admission_mode,  # type: ignore[arg-type]
        )
        asyncio.run(replay_trace(replay_config))
    finally:
        signal.signal(signal.SIGINT, previous_sigint)
        signal.signal(signal.SIGTERM, previous_sigterm)
        if stack is not None:
            stack.stop()
    benchmark_config_path = run_dir / "benchmark-config.json"
    with benchmark_config_path.open("w", encoding="utf-8") as handle:
        json.dump(
            {
                "policy_name": config.policy_name,
                "mechanism_name": config.mechanism_name,
                "target_duration_s": config.target_duration_s,
                "start_time_s": config.start_time_s,
                "session_sample_rate": config.session_sample_rate,
                "min_turns": config.min_turns,
                "time_scale": config.time_scale,
                "concurrency_limit": config.concurrency_limit,
                "timeout_s": config.timeout_s,
                "stream": config.stream,
                "stream_idle_timeout_s": config.stream_idle_timeout_s,
                "kvcache_direct_max_uncached_tokens": config.kvcache_direct_max_uncached_tokens,
                "kvcache_admission_mode": config.kvcache_admission_mode,
                "sample_profile": config.sample_profile,
                "min_initial_input_tokens": config.min_initial_input_tokens,
                "max_initial_input_tokens": config.max_initial_input_tokens,
                "max_append_input_tokens": config.max_append_input_tokens,
                "max_output_tokens": config.max_output_tokens,
                "min_overlap_ratio": config.min_overlap_ratio,
                "sample_summary": asdict(sample_summary),
                "topology": {
                    "model_path": config.topology.model_path,
                    "router_url": topology.router_url,
                    "transfer_backend": topology.transfer_backend,
                    "force_rdma": topology.force_rdma,
                    "ib_device": topology.ib_device,
                    "prefill_workers": [
                        worker.worker_id for worker in topology.prefill_workers
                    ],
                    "decode_workers": [
                        worker.worker_id for worker in topology.decode_workers
                    ],
                    "direct_workers": [
                        worker.worker_id for worker in topology.direct_workers
                    ],
                },
            },
            handle,
            indent=2,
            sort_keys=True,
        )
    return BenchmarkArtifacts(
        run_dir=run_dir,
        sampled_trace_path=sampled_trace_path,
        metrics_path=run_dir / "request-metrics.jsonl",
        summary_path=run_dir / "request-metrics.jsonl.summary.json",
        benchmark_config_path=benchmark_config_path,
    )
 def _decode_policy_for(policy_name: str) -> str:
    if policy_name == "sticky":
        return "manual"
    if policy_name == "kv-aware":
        return "consistent_hashing"
    return "round_robin"
 def _header_mode_for(policy_name: str) -> str:
    if policy_name == "sticky":
        return "routing-key"
    if policy_name == "kv-aware":
        return "target-worker"
    return "none"
--- a/src/agentic_pd_hybrid/cli.py
+++ b/src/agentic_pd_hybrid/cli.py
@@ -0,0 +1,484 @@
 from __future__ import annotations
 import argparse
 import asyncio
 from pathlib import Path
 from agentic_pd_hybrid.benchmark import BenchmarkConfig, run_live_benchmark
 from agentic_pd_hybrid.launcher import build_launch_plan
 from agentic_pd_hybrid.microbench import SmallAppendTraceConfig, write_small_append_trace
 from agentic_pd_hybrid.replay import ReplayConfig, replay_trace
 from agentic_pd_hybrid.sampling import SessionSampleConfig, sample_trace_sessions
 from agentic_pd_hybrid.trace_profiles import (
    NormalizeTraceLengthsConfig,
    normalize_trace_lengths,
 )
 from agentic_pd_hybrid.topology import build_single_node_topology
 def _normalize_mechanism_name(name: str) -> str:
    normalized = name.strip().lower()
    aliases = {
        "pd-disagg": "pd-disaggregation",
        "pd-disaggregation": "pd-disaggregation",
        "pd-hybrid": "pd-disaggregation",
        "baseline-pd-disagg": "pd-disaggregation",
        "pd-colo": "pd-colo",
        "direct-d": "pd-colo",
        "colocation": "pd-colo",
        "kvcache-centric": "kvcache-centric",
        "turn2+-direct-to-d": "kvcache-centric",
        "pd-with-d-append": "kvcache-centric",
    }
    if normalized not in aliases:
        raise ValueError(f"Unsupported mechanism: {name}")
    return aliases[normalized]
 def _parse_gpu_id_list(value: str | None) -> tuple[int, ...] | None:
    if value is None:
        return None
    items = [item.strip() for item in value.split(",") if item.strip()]
    if not items:
        return tuple()
    return tuple(int(item) for item in items)
 def main() -> None:
    parser = argparse.ArgumentParser(description="Agentic PD hybrid prototype")
    subparsers = parser.add_subparsers(dest="command", required=True)
    print_launch = subparsers.add_parser(
        "print-launch",
        help="Print one-node SGLang PD launch commands",
    )
    _add_topology_arguments(print_launch)
    print_launch.add_argument("--prefill-policy", default="round_robin")
    print_launch.add_argument("--decode-policy", default="manual")
    replay = subparsers.add_parser(
        "replay",
        help="Replay trace and log request-level metrics",
    )
    _add_topology_arguments(replay)
    replay.add_argument("--trace", type=Path, required=True)
    replay.add_argument("--output", type=Path, required=True)
    replay.add_argument(
        "--policy",
        choices=["default", "sticky", "kv-aware"],
        default="sticky",
    )
    replay.add_argument(
        "--mechanism",
        choices=[
            "pd-disaggregation",
            "pd-hybrid",
            "pd-disagg",
            "pd-colo",
            "direct-d",
            "kvcache-centric",
            "turn2+-direct-to-d",
            "pd-with-d-append",
        ],
        default="pd-disaggregation",
    )
    replay.add_argument("--router-url")
    replay.add_argument("--model")
    replay.add_argument(
        "--header-mode",
        choices=["auto", "none", "routing-key", "target-worker"],
        default="auto",
    )
    replay.add_argument(
        "--request-limit",
        type=int,
        default=None,
        help="Replay at most this many requests",
    )
    replay.add_argument(
        "--no-pace",
        action="store_true",
        help="Disable wall-clock pacing from trace timestamps",
    )
    replay.add_argument(
        "--time-scale",
        type=float,
        default=1.0,
        help="Scale trace timing by this factor when pacing is enabled",
    )
    replay.add_argument(
        "--concurrency-limit",
        type=int,
        default=32,
    )
    replay.add_argument(
        "--timeout-s",
        type=float,
        default=600.0,
    )
    replay.add_argument(
        "--no-stream",
        action="store_true",
        help="Use non-streaming OpenAI responses for more robust E2E-only replay.",
    )
    replay.add_argument(
        "--stream-idle-timeout-s",
        type=float,
        default=900.0,
        help="Abort a streaming request if no SSE line arrives within this many seconds.",
    )
    replay.add_argument(
        "--kvcache-direct-max-uncached-tokens",
        type=int,
        default=2048,
        help="For kvcache-centric routing, bypass P when the uncached suffix is at most this many tokens.",
    )
    replay.add_argument(
        "--kvcache-admission-mode",
        choices=["router", "worker"],
        default="router",
        help=(
            "For kvcache-centric routing, use router shadow-state admission "
            "or query the decode worker on the critical path."
        ),
    )
    sample = subparsers.add_parser(
        "sample-sessions",
        help="Sample a session-granularity trace shard for live benchmarking",
    )
    sample.add_argument("--trace", type=Path, required=True)
    sample.add_argument("--output", type=Path, required=True)
    sample.add_argument("--target-duration-s", type=float, default=600.0)
    sample.add_argument("--start-time-s", type=float, default=0.0)
    sample.add_argument("--session-sample-rate", type=float, default=0.01)
    sample.add_argument("--min-turns", type=int, default=1)
    sample.add_argument("--max-requests", type=int, default=None)
    sample.add_argument(
        "--profile",
        choices=["default", "small-append"],
        default="default",
        help="Optional workload-shape filter for live benchmarks.",
    )
    sample.add_argument("--min-initial-input-tokens", type=int, default=None)
    sample.add_argument("--max-initial-input-tokens", type=int, default=None)
    sample.add_argument("--max-append-input-tokens", type=int, default=None)
    sample.add_argument("--max-output-tokens", type=int, default=None)
    sample.add_argument("--min-overlap-ratio", type=float, default=None)
    normalize = subparsers.add_parser(
        "normalize-trace-lengths",
        help="Rewrite a trace to a fixed turn1/append/output length profile",
    )
    normalize.add_argument("--trace", type=Path, required=True)
    normalize.add_argument("--output", type=Path, required=True)
    normalize.add_argument("--initial-input-length", type=int, default=10_000)
    normalize.add_argument("--append-input-length", type=int, default=1_000)
    normalize.add_argument("--output-length", type=int, default=1_000)
    normalize.add_argument("--max-requests", type=int, default=None)
    micro = subparsers.add_parser(
        "make-small-append-trace",
        help="Generate a synthetic multi-turn trace with small turn2+ appends",
    )
    micro.add_argument("--output", type=Path, required=True)
    micro.add_argument("--session-count", type=int, default=8)
    micro.add_argument("--turns-per-session", type=int, default=3)
    micro.add_argument("--initial-input-length", type=int, default=10_000)
    micro.add_argument("--append-input-length", type=int, default=1_000)
    micro.add_argument("--output-length", type=int, default=1_000)
    micro.add_argument("--inter-turn-gap-s", type=float, default=1.0)
    micro.add_argument("--session-stagger-s", type=float, default=0.1)
    benchmark = subparsers.add_parser(
        "benchmark-live",
        help="Launch a real PD stack, sample sessions, and collect live E2E numbers",
    )
    _add_topology_arguments(benchmark)
    benchmark.add_argument("--trace", type=Path, required=True)
    benchmark.add_argument(
        "--policy",
        choices=["default", "sticky", "kv-aware"],
        default="sticky",
    )
    benchmark.add_argument(
        "--mechanism",
        choices=[
            "pd-disaggregation",
            "pd-hybrid",
            "pd-disagg",
            "pd-colo",
            "direct-d",
            "kvcache-centric",
            "turn2+-direct-to-d",
            "pd-with-d-append",
        ],
        default="pd-disaggregation",
    )
    benchmark.add_argument("--output-root", type=Path, default=Path("outputs/live"))
    benchmark.add_argument("--target-duration-s", type=float, default=600.0)
    benchmark.add_argument("--start-time-s", type=float, default=0.0)
    benchmark.add_argument("--session-sample-rate", type=float, default=0.01)
    benchmark.add_argument("--min-turns", type=int, default=1)
    benchmark.add_argument("--time-scale", type=float, default=1.0)
    benchmark.add_argument("--concurrency-limit", type=int, default=32)
    benchmark.add_argument("--timeout-s", type=float, default=1200.0)
    benchmark.add_argument(
        "--no-stream",
        action="store_true",
        help="Use non-streaming OpenAI responses for E2E-only live benchmarking.",
    )
    benchmark.add_argument(
        "--stream-idle-timeout-s",
        type=float,
        default=900.0,
        help="Abort a streaming request if no SSE line arrives within this many seconds.",
    )
    benchmark.add_argument(
        "--kvcache-direct-max-uncached-tokens",
        type=int,
        default=2048,
        help="For kvcache-centric routing, bypass P when the uncached suffix is at most this many tokens.",
    )
    benchmark.add_argument(
        "--kvcache-admission-mode",
        choices=["router", "worker"],
        default="router",
        help=(
            "For kvcache-centric routing, use router shadow-state admission "
            "or query the decode worker on the critical path."
        ),
    )
    benchmark.add_argument(
        "--sample-profile",
        choices=["default", "small-append"],
        default="default",
        help="Optional session-shape filter applied before live replay.",
    )
    benchmark.add_argument("--min-initial-input-tokens", type=int, default=None)
    benchmark.add_argument("--max-initial-input-tokens", type=int, default=None)
    benchmark.add_argument("--max-append-input-tokens", type=int, default=None)
    benchmark.add_argument("--max-output-tokens", type=int, default=None)
    benchmark.add_argument("--min-overlap-ratio", type=float, default=None)
    args = parser.parse_args()
    if args.command == "print-launch":
        topology = _topology_from_args(args)
        plan = build_launch_plan(
            topology,
            prefill_policy=args.prefill_policy,
            decode_policy=args.decode_policy,
            include_router=bool(topology.prefill_workers and topology.decode_workers),
        )
        print(plan.render())
        return
    if args.command == "replay":
        topology = _topology_from_args(args)
        config = ReplayConfig(
            trace_path=args.trace,
            output_path=args.output,
            policy_name=args.policy,
            mechanism_name=_normalize_mechanism_name(args.mechanism),
            topology=topology,
            router_url=args.router_url,
            model_name=args.model,
            pace=not args.no_pace,
            time_scale=args.time_scale,
            request_limit=args.request_limit,
            concurrency_limit=args.concurrency_limit,
            header_mode=args.header_mode,
            timeout_s=args.timeout_s,
            stream=not args.no_stream,
            stream_idle_timeout_s=args.stream_idle_timeout_s,
            kvcache_direct_max_uncached_tokens=args.kvcache_direct_max_uncached_tokens,
            kvcache_admission_mode=args.kvcache_admission_mode,
        )
        results = asyncio.run(replay_trace(config))
        print(
            f"wrote {len(results)} request records to {args.output} and "
            f"{args.output}{'.summary.json'}"
        )
        return
    if args.command == "sample-sessions":
        summary = sample_trace_sessions(
            SessionSampleConfig(
                trace_path=args.trace,
                output_path=args.output,
                target_duration_s=args.target_duration_s,
                start_time_s=args.start_time_s,
                session_sample_rate=args.session_sample_rate,
                min_turns=args.min_turns,
                max_requests=args.max_requests,
                profile=args.profile,
                min_initial_input_tokens=args.min_initial_input_tokens,
                max_initial_input_tokens=args.max_initial_input_tokens,
                max_append_input_tokens=args.max_append_input_tokens,
                max_output_tokens=args.max_output_tokens,
                min_overlap_ratio=args.min_overlap_ratio,
            )
        )
        print(
            f"wrote {summary.request_count} requests from {summary.session_count} sessions "
            f"covering {summary.sampled_duration_s:.3f}s to {args.output}"
        )
        return
    if args.command == "normalize-trace-lengths":
        summary = normalize_trace_lengths(
            NormalizeTraceLengthsConfig(
                trace_path=args.trace,
                output_path=args.output,
                initial_input_length=args.initial_input_length,
                append_input_length=args.append_input_length,
                output_length=args.output_length,
                max_requests=args.max_requests,
            )
        )
        print(
            f"wrote {summary.request_count} normalized requests from "
            f"{summary.session_count} sessions to {args.output}"
        )
        return
    if args.command == "make-small-append-trace":
        summary = write_small_append_trace(
            SmallAppendTraceConfig(
                output_path=args.output,
                session_count=args.session_count,
                turns_per_session=args.turns_per_session,
                initial_input_length=args.initial_input_length,
                append_input_length=args.append_input_length,
                output_length=args.output_length,
                inter_turn_gap_s=args.inter_turn_gap_s,
                session_stagger_s=args.session_stagger_s,
            )
        )
        print(
            f"wrote {summary.request_count} requests across {summary.session_count} sessions "
            f"to {args.output}"
        )
        return
    if args.command == "benchmark-live":
        topology = _topology_from_args(args)
        artifacts = run_live_benchmark(
            BenchmarkConfig(
                trace_path=args.trace,
                output_root=args.output_root,
                topology=topology,
                policy_name=args.policy,
                mechanism_name=_normalize_mechanism_name(args.mechanism),
                target_duration_s=args.target_duration_s,
                start_time_s=args.start_time_s,
                session_sample_rate=args.session_sample_rate,
                min_turns=args.min_turns,
                time_scale=args.time_scale,
                concurrency_limit=args.concurrency_limit,
                timeout_s=args.timeout_s,
                stream=not args.no_stream,
                stream_idle_timeout_s=args.stream_idle_timeout_s,
                kvcache_direct_max_uncached_tokens=args.kvcache_direct_max_uncached_tokens,
                kvcache_admission_mode=args.kvcache_admission_mode,
                sample_profile=args.sample_profile,
                min_initial_input_tokens=args.min_initial_input_tokens,
                max_initial_input_tokens=args.max_initial_input_tokens,
                max_append_input_tokens=args.max_append_input_tokens,
                max_output_tokens=args.max_output_tokens,
                min_overlap_ratio=args.min_overlap_ratio,
                launch_stack=True,
            )
        )
        print(
            f"benchmark artifacts written under {artifacts.run_dir}; "
            f"metrics={artifacts.metrics_path} summary={artifacts.summary_path}"
        )
        return
    raise AssertionError(f"Unhandled command: {args.command}")
 def _add_topology_arguments(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--model-path",
        default="~/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
    )
    parser.add_argument("--prefill-workers", type=int, default=1)
    parser.add_argument("--decode-workers", type=int, default=1)
    parser.add_argument("--direct-workers", type=int, default=0)
    parser.add_argument("--prefill-tp-size", type=int, default=1)
    parser.add_argument("--decode-tp-size", type=int, default=1)
    parser.add_argument("--direct-tp-size", type=int, default=1)
    parser.add_argument("--gpu-budget", type=int, default=8)
    parser.add_argument(
        "--prefill-gpu-ids",
        default=None,
        help="Comma-separated GPU IDs for prefill workers, e.g. 3,4",
    )
    parser.add_argument(
        "--decode-gpu-ids",
        default=None,
        help="Comma-separated GPU IDs for decode workers, e.g. 5",
    )
    parser.add_argument(
        "--direct-gpu-ids",
        default=None,
        help="Comma-separated GPU IDs for direct workers, e.g. 6",
    )
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--router-port", type=int, default=8000)
    parser.add_argument("--prefill-port-base", type=int, default=30000)
    parser.add_argument("--decode-port-base", type=int, default=31000)
    parser.add_argument("--direct-port-base", type=int, default=32000)
    parser.add_argument("--bootstrap-port-base", type=int, default=8998)
    parser.add_argument("--transfer-backend", default="nixl")
    parser.add_argument(
        "--force-rdma",
        action="store_true",
        help=(
            "Force real RDMA transport for PD KV transfer. "
            "Currently this requires Mooncake plus --ib-device."
        ),
    )
    parser.add_argument("--ib-device", default=None)
    parser.add_argument(
        "--no-trust-remote-code",
        action="store_true",
    )
 def _topology_from_args(args: argparse.Namespace):
    transfer_backend = args.transfer_backend
    if args.force_rdma:
        transfer_backend = "mooncake"
    return build_single_node_topology(
        model_path=str(Path(args.model_path).expanduser()),
        prefill_worker_count=args.prefill_workers,
        decode_worker_count=args.decode_workers,
        direct_worker_count=args.direct_workers,
        prefill_tp_size=args.prefill_tp_size,
        decode_tp_size=args.decode_tp_size,
        direct_tp_size=args.direct_tp_size,
        prefill_gpu_ids=_parse_gpu_id_list(args.prefill_gpu_ids),
        decode_gpu_ids=_parse_gpu_id_list(args.decode_gpu_ids),
        direct_gpu_ids=_parse_gpu_id_list(args.direct_gpu_ids),
        total_gpu_budget=args.gpu_budget,
        host=args.host,
        router_port=args.router_port,
        prefill_port_base=args.prefill_port_base,
        decode_port_base=args.decode_port_base,
        direct_port_base=args.direct_port_base,
        bootstrap_port_base=args.bootstrap_port_base,
        transfer_backend=transfer_backend,
        force_rdma=args.force_rdma,
        trust_remote_code=not args.no_trust_remote_code,
        ib_device=args.ib_device,
        direct_extra_server_args=("--enable-streaming-session",),
    )
 if __name__ == "__main__":
    main()
--- a/src/agentic_pd_hybrid/launcher.py
+++ b/src/agentic_pd_hybrid/launcher.py
@@ -0,0 +1,140 @@
 from __future__ import annotations
 import shlex
 import sys
 from dataclasses import dataclass
 from agentic_pd_hybrid.topology import SingleNodeTopology, WorkerSpec
@dataclass(frozen=True)
 class LaunchPlan:
    prefill_commands: tuple[tuple[str, ...], ...]
    decode_commands: tuple[tuple[str, ...], ...]
    direct_commands: tuple[tuple[str, ...], ...]
    router_command: tuple[str, ...] | None
    def render(self) -> str:
        sections: list[str] = []
        for idx, command in enumerate(self.prefill_commands):
            sections.append(_render_named_command(f"prefill-{idx}", command))
        for idx, command in enumerate(self.decode_commands):
            sections.append(_render_named_command(f"decode-{idx}", command))
        for idx, command in enumerate(self.direct_commands):
            sections.append(_render_named_command(f"direct-{idx}", command))
        if self.router_command is not None:
            sections.append(_render_named_command("router", self.router_command))
        return "\n\n".join(sections)
 def build_launch_plan(
    topology: SingleNodeTopology,
    *,
    prefill_policy: str = "round_robin",
    decode_policy: str = "manual",
    include_router: bool = True,
 ) -> LaunchPlan:
    return LaunchPlan(
        prefill_commands=tuple(
            _build_server_command(topology, worker) for worker in topology.prefill_workers
        ),
        decode_commands=tuple(
            _build_server_command(topology, worker) for worker in topology.decode_workers
        ),
        direct_commands=tuple(
            _build_server_command(topology, worker) for worker in topology.direct_workers
        ),
        router_command=(
            _build_router_command(
                topology,
                prefill_policy=prefill_policy,
                decode_policy=decode_policy,
            )
            if include_router and topology.prefill_workers and topology.decode_workers
            else None
        ),
    )
 def _build_server_command(
    topology: SingleNodeTopology,
    worker: WorkerSpec,
 ) -> tuple[str, ...]:
    command = [
        sys.executable,
        "-B",
        "-u",
        "-m",
        "sglang.launch_server",
        "--model-path",
        topology.model_path,
        "--host",
        worker.host,
        "--port",
        str(worker.port),
        "--base-gpu-id",
        str(worker.gpu_id),
        "--disaggregation-mode",
        _disaggregation_mode_for(worker),
        "--disaggregation-transfer-backend",
        topology.transfer_backend,
    ]
    if worker.tp_size > 1:
        command.extend(["--tp-size", str(worker.tp_size)])
    if topology.trust_remote_code:
        command.append("--trust-remote-code")
    command.append("--enable-cache-report")
    if worker.bootstrap_port is not None:
        command.extend(
            ["--disaggregation-bootstrap-port", str(worker.bootstrap_port)]
        )
    if topology.ib_device:
        command.extend(["--disaggregation-ib-device", topology.ib_device])
    command.extend(topology.extra_server_args)
    if worker.role == "prefill":
        command.extend(topology.prefill_extra_server_args)
    elif worker.role == "decode":
        command.extend(topology.decode_extra_server_args)
    else:
        command.extend(topology.direct_extra_server_args)
    return tuple(command)
 def _build_router_command(
    topology: SingleNodeTopology,
    *,
    prefill_policy: str,
    decode_policy: str,
 ) -> tuple[str, ...]:
    command: list[str] = [
        sys.executable,
        "-B",
        "-u",
        "-m",
        "agentic_pd_hybrid.pd_router",
        "--host",
        topology.router_host,
        "--port",
        str(topology.router_port),
        "--prefill-policy",
        prefill_policy,
        "--decode-policy",
        decode_policy,
    ]
    for worker in topology.prefill_workers:
        command.extend(
            ["--prefill", worker.url, str(worker.bootstrap_port or topology.router_port)]
        )
    for worker in topology.decode_workers:
        command.extend(["--decode", worker.url])
    return tuple(command)
 def _render_named_command(name: str, command: tuple[str, ...]) -> str:
    return f"# {name}\n" + " ".join(shlex.quote(part) for part in command)
 def _disaggregation_mode_for(worker: WorkerSpec) -> str:
    if worker.role == "direct":
        return "null"
    return worker.role
--- a/src/agentic_pd_hybrid/metrics.py
+++ b/src/agentic_pd_hybrid/metrics.py
@@ -0,0 +1,165 @@
 from __future__ import annotations
 import json
 import statistics
 from collections import Counter
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any
 from agentic_pd_hybrid.policies import RoutingDecision
 from agentic_pd_hybrid.trace import TraceRequest
@dataclass(frozen=True)
 class RequestMetrics:
    request_id: str
    session_id: str
    turn_id: int
    mechanism_name: str
    execution_mode: str
    trace_timestamp_s: float
    input_length: int
    output_length: int
    request_type: str
    policy_name: str
    assigned_prefill_node: str
    assigned_decode_node: str
    assigned_decode_index: int
    inflight_decode_load_at_assignment: int
    reuse_expected: bool
    reuse_observed: bool
    observed_overlap_blocks: int
    kv_transfer_blocks: int
    actual_kv_transfer_blocks: int
    cached_tokens: int
    re_prefill_required: bool
    effective_input_length: int | None
    session_reused: bool
    session_reset: bool
    latency_s: float | None
    ttft_s: float | None
    tpot_s: float | None
    error: str | None = None
    @classmethod
    def from_decision(
        cls,
        request: TraceRequest,
        decision: RoutingDecision,
        *,
        mechanism_name: str,
        execution_mode: str,
        actual_kv_transfer_blocks: int,
        effective_input_length: int | None,
        cached_tokens: int,
        session_reused: bool,
        session_reset: bool,
        latency_s: float | None,
        ttft_s: float | None,
        tpot_s: float | None,
        error: str | None = None,
    ) -> "RequestMetrics":
        return cls(
            request_id=request.request_id,
            session_id=request.session_id,
            turn_id=request.turn_id,
            mechanism_name=mechanism_name,
            execution_mode=execution_mode,
            trace_timestamp_s=request.timestamp_s,
            input_length=request.input_length,
            output_length=request.output_length,
            request_type=request.request_type,
            policy_name=decision.policy_name,
            assigned_prefill_node=decision.prefill_worker_id,
            assigned_decode_node=decision.decode_worker_id,
            assigned_decode_index=decision.decode_worker_index,
            inflight_decode_load_at_assignment=decision.inflight_decode_load,
            reuse_expected=decision.reuse_expected,
            reuse_observed=decision.observed_reuse,
            observed_overlap_blocks=decision.observed_overlap_blocks,
            kv_transfer_blocks=decision.kv_transfer_blocks,
            actual_kv_transfer_blocks=actual_kv_transfer_blocks,
            cached_tokens=cached_tokens,
            re_prefill_required=decision.re_prefill_required,
            effective_input_length=effective_input_length,
            session_reused=session_reused,
            session_reset=session_reset,
            latency_s=latency_s,
            ttft_s=ttft_s,
            tpot_s=tpot_s,
            error=error,
        )
 def write_metrics_jsonl(path: Path, rows: list[RequestMetrics]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(asdict(row), sort_keys=True) + "\n")
 def write_summary_json(
    path: Path,
    rows: list[RequestMetrics],
    *,
    trace_path: Path,
    router_url: str | None,
 ) -> None:
    latencies = [row.latency_s for row in rows if row.latency_s is not None]
    ttfts = [row.ttft_s for row in rows if row.ttft_s is not None]
    tpots = [row.tpot_s for row in rows if row.tpot_s is not None]
    per_decode_load = Counter(row.assigned_decode_node for row in rows)
    per_prefill_load = Counter(row.assigned_prefill_node for row in rows)
    summary: dict[str, Any] = {
        "trace_path": str(trace_path),
        "router_url": router_url,
        "request_count": len(rows),
        "mechanisms": dict(sorted(Counter(row.mechanism_name for row in rows).items())),
        "execution_modes": dict(sorted(Counter(row.execution_mode for row in rows).items())),
        "latency_stats_s": _stats(latencies),
        "ttft_stats_s": _stats(ttfts),
        "tpot_stats_s": _stats(tpots),
        "reuse_expected_count": sum(1 for row in rows if row.reuse_expected),
        "reuse_observed_count": sum(1 for row in rows if row.reuse_observed),
        "re_prefill_count": sum(1 for row in rows if row.re_prefill_required),
        "cache_hit_request_count": sum(1 for row in rows if row.cached_tokens > 0),
        "total_cached_tokens": sum(row.cached_tokens for row in rows),
        "cached_tokens_stats": _stats([float(row.cached_tokens) for row in rows]),
        "session_reused_count": sum(1 for row in rows if row.session_reused),
        "session_reset_count": sum(1 for row in rows if row.session_reset),
        "total_kv_transfer_blocks": sum(row.kv_transfer_blocks for row in rows),
        "total_actual_kv_transfer_blocks": sum(
            row.actual_kv_transfer_blocks for row in rows
        ),
        "per_decode_load": dict(sorted(per_decode_load.items())),
        "per_prefill_load": dict(sorted(per_prefill_load.items())),
        "error_count": sum(1 for row in rows if row.error is not None),
    }
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, sort_keys=True)
 def _stats(values: list[float | None]) -> dict[str, float] | None:
    clean = [value for value in values if value is not None]
    if not clean:
        return None
    clean.sort()
    return {
        "count": float(len(clean)),
        "mean": statistics.fmean(clean),
        "p50": _percentile(clean, 0.50),
        "p90": _percentile(clean, 0.90),
        "p99": _percentile(clean, 0.99),
    }
 def _percentile(sorted_values: list[float], percentile: float) -> float:
    if not sorted_values:
        raise ValueError("sorted_values must not be empty")
    if len(sorted_values) == 1:
        return sorted_values[0]
    index = round((len(sorted_values) - 1) * percentile)
    return sorted_values[index]
--- a/src/agentic_pd_hybrid/microbench.py
+++ b/src/agentic_pd_hybrid/microbench.py
@@ -0,0 +1,123 @@
 from __future__ import annotations
 import json
 from dataclasses import asdict, dataclass
 from math import ceil
 from pathlib import Path
 BLOCK_TOKEN_BUDGET = 24
@dataclass(frozen=True)
 class SmallAppendTraceConfig:
    output_path: Path
    session_count: int = 8
    turns_per_session: int = 3
    initial_input_length: int = 10_000
    append_input_length: int = 1_000
    output_length: int = 1_000
    inter_turn_gap_s: float = 1.0
    session_stagger_s: float = 0.1
    request_type: str = "coder"
@dataclass(frozen=True)
 class SmallAppendTraceSummary:
    output_path: str
    session_count: int
    turns_per_session: int
    request_count: int
    initial_input_length: int
    append_input_length: int
    output_length: int
    inter_turn_gap_s: float
    session_stagger_s: float
 def write_small_append_trace(config: SmallAppendTraceConfig) -> SmallAppendTraceSummary:
    if config.session_count <= 0:
        raise ValueError("session_count must be > 0")
    if config.turns_per_session <= 0:
        raise ValueError("turns_per_session must be > 0")
    if config.initial_input_length < 0:
        raise ValueError("initial_input_length must be >= 0")
    if config.append_input_length < 0:
        raise ValueError("append_input_length must be >= 0")
    if config.output_length < 0:
        raise ValueError("output_length must be >= 0")
    config.output_path.parent.mkdir(parents=True, exist_ok=True)
    records: list[dict[str, object]] = []
    next_chat_id = 1_000_000
    for session_idx in range(config.session_count):
        root_chat_id = next_chat_id
        previous_chat_id = -1
        session_base_time = session_idx * config.session_stagger_s
        base_block_count = ceil(config.initial_input_length / BLOCK_TOKEN_BUDGET)
        base_hash_ids = [
            _hash_id_for(session_idx=session_idx, block_idx=block_idx)
            for block_idx in range(base_block_count)
        ]
        for turn_idx in range(config.turns_per_session):
            chat_id = root_chat_id if turn_idx == 0 else next_chat_id
            if turn_idx > 0:
                next_chat_id += 1
            input_length = config.initial_input_length + turn_idx * (
                config.append_input_length + config.output_length
            )
            total_block_count = ceil(input_length / BLOCK_TOKEN_BUDGET)
            hash_ids = base_hash_ids + [
                _hash_id_for(
                    session_idx=session_idx,
                    block_idx=base_block_count + append_block_idx,
                )
                for append_block_idx in range(max(0, total_block_count - base_block_count))
            ]
            records.append(
                {
                    "chat_id": chat_id,
                    "parent_chat_id": previous_chat_id,
                    "timestamp": session_base_time
                    + turn_idx * config.inter_turn_gap_s,
                    "input_length": input_length,
                    "output_length": config.output_length,
                    "type": config.request_type,
                    "turn": turn_idx + 1,
                    "hash_ids": hash_ids,
                }
            )
            previous_chat_id = chat_id
        next_chat_id += 1
    records.sort(key=lambda item: float(item["timestamp"]))
    with config.output_path.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, sort_keys=True) + "\n")
    summary = SmallAppendTraceSummary(
        output_path=str(config.output_path),
        session_count=config.session_count,
        turns_per_session=config.turns_per_session,
        request_count=len(records),
        initial_input_length=config.initial_input_length,
        append_input_length=config.append_input_length,
        output_length=config.output_length,
        inter_turn_gap_s=config.inter_turn_gap_s,
        session_stagger_s=config.session_stagger_s,
    )
    summary_path = config.output_path.with_suffix(
        config.output_path.suffix + ".summary.json"
    )
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(asdict(summary), handle, indent=2, sort_keys=True)
    return summary
 def _hash_id_for(*, session_idx: int, block_idx: int) -> int:
    return session_idx * 1_000_000 + block_idx
--- a/src/agentic_pd_hybrid/pd_router.py
+++ b/src/agentic_pd_hybrid/pd_router.py
@@ -0,0 +1,267 @@
 from __future__ import annotations
 import argparse
 import asyncio
 import random
 import urllib.parse
 from dataclasses import dataclass
 from http import HTTPStatus
 from itertools import chain
 from typing import AsyncIterator
 import aiohttp
 import orjson
 import uvicorn
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 _STREAM_CHUNK_SIZE = 1024 * 64
@dataclass
 class RouterConfig:
    host: str
    port: int
    prefill_urls: list[tuple[str, int]]
    decode_urls: list[str]
    prefill_policy: str = "round_robin"
    decode_policy: str = "manual"
    request_timeout_s: float = 1800.0
 class RouterState:
    def __init__(self, config: RouterConfig):
        if not config.prefill_urls:
            raise ValueError("At least one prefill worker is required")
        if not config.decode_urls:
            raise ValueError("At least one decode worker is required")
        self.config = config
        self.prefill_cursor = 0
        self.decode_cursor = 0
        self.sticky_decode_map: dict[str, int] = {}
    def select_pair(self, headers: dict[str, str]) -> tuple[str, int, str]:
        prefill_url, bootstrap_port = self.config.prefill_urls[
            self.prefill_cursor % len(self.config.prefill_urls)
        ]
        self.prefill_cursor += 1
        decode_index = self._select_decode_index(headers)
        return prefill_url, bootstrap_port, self.config.decode_urls[decode_index]
    def _select_decode_index(self, headers: dict[str, str]) -> int:
        target_worker = headers.get("x-smg-target-worker")
        routing_key = headers.get("x-smg-routing-key")
        if (
            self.config.decode_policy == "consistent_hashing"
            and target_worker is not None
        ):
            idx = int(target_worker)
            if 0 <= idx < len(self.config.decode_urls):
                return idx
        if self.config.decode_policy == "manual" and routing_key:
            cached = self.sticky_decode_map.get(routing_key)
            if cached is not None:
                return cached
            idx = self.decode_cursor % len(self.config.decode_urls)
            self.decode_cursor += 1
            self.sticky_decode_map[routing_key] = idx
            return idx
        idx = self.decode_cursor % len(self.config.decode_urls)
        self.decode_cursor += 1
        return idx
 app = FastAPI()
 router_state: RouterState | None = None
@app.get("/health")
 async def health() -> Response:
    return Response(status_code=200)
@app.get("/health_generate")
 async def health_generate() -> Response:
    state = _require_state()
    async with aiohttp.ClientSession() as session:
        tasks = []
        for server in chain(
            (url for url, _ in state.config.prefill_urls),
            state.config.decode_urls,
        ):
            tasks.append(session.get(f"{server}/health_generate"))
        for response in asyncio.as_completed(tasks):
            async with await response:
                pass
    return Response(status_code=200)
@app.get("/v1/models")
 async def models() -> ORJSONResponse:
    state = _require_state()
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{state.config.prefill_urls[0][0]}/v1/models") as response:
            payload = await response.json()
            return ORJSONResponse(payload, status_code=response.status)
@app.post("/v1/chat/completions")
 async def chat_completions(request: Request) -> Response:
    request_data = await request.json()
    headers = {key.lower(): value for key, value in request.headers.items()}
    return await _forward_to_backend(
        request_data=request_data,
        headers=headers,
        endpoint_name="v1/chat/completions",
    )
@app.post("/v1/completions")
 async def completions(request: Request) -> Response:
    request_data = await request.json()
    headers = {key.lower(): value for key, value in request.headers.items()}
    return await _forward_to_backend(
        request_data=request_data,
        headers=headers,
        endpoint_name="v1/completions",
    )
@app.post("/generate")
 async def generate(request: Request) -> Response:
    request_data = await request.json()
    headers = {key.lower(): value for key, value in request.headers.items()}
    return await _forward_to_backend(
        request_data=request_data,
        headers=headers,
        endpoint_name="generate",
    )
 async def _forward_to_backend(
    *,
    request_data: dict,
    headers: dict[str, str],
    endpoint_name: str,
 ) -> Response:
    state = _require_state()
    prefill_server, bootstrap_port, decode_server = state.select_pair(headers)
    modified_request = request_data.copy()
    modified_request.update(_build_bootstrap_payload(prefill_server, bootstrap_port))
    if request_data.get("stream", False):
        return StreamingResponse(
            _stream_generate(
                modified_request=modified_request,
                prefill_server=prefill_server,
                decode_server=decode_server,
                endpoint_name=endpoint_name,
                timeout_s=state.config.request_timeout_s,
            ),
            media_type="text/event-stream",
        )
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=state.config.request_timeout_s)
    ) as session:
        prefill_response, decode_response = await asyncio.gather(
            session.post(f"{prefill_server}/{endpoint_name}", json=modified_request),
            session.post(f"{decode_server}/{endpoint_name}", json=modified_request),
        )
        async with prefill_response:
            await prefill_response.read()
        async with decode_response:
            body = await decode_response.read()
            return Response(
                content=body,
                status_code=decode_response.status,
                media_type=decode_response.content_type,
            )
 async def _stream_generate(
    *,
    modified_request: dict,
    prefill_server: str,
    decode_server: str,
    endpoint_name: str,
    timeout_s: float,
 ) -> AsyncIterator[bytes]:
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=timeout_s)
    ) as session:
        prefill_response, decode_response = await asyncio.gather(
            session.post(f"{prefill_server}/{endpoint_name}", json=modified_request),
            session.post(f"{decode_server}/{endpoint_name}", json=modified_request),
        )
        async with prefill_response, decode_response:
            if decode_response.status != HTTPStatus.OK:
                payload = await decode_response.read()
                yield payload
                return
            async for chunk in decode_response.content.iter_chunked(_STREAM_CHUNK_SIZE):
                yield chunk
 def _build_bootstrap_payload(prefill_server: str, bootstrap_port: int) -> dict[str, object]:
    parsed_url = urllib.parse.urlparse(prefill_server)
    hostname = parsed_url.hostname
    if hostname is None:
        raise HTTPException(
            status_code=500,
            detail=f"Unable to parse prefill hostname from {prefill_server}",
        )
    return {
        "bootstrap_host": hostname,
        "bootstrap_port": bootstrap_port,
        "bootstrap_room": random.randint(0, 2**63 - 1),
    }
 def _require_state() -> RouterState:
    if router_state is None:
        raise HTTPException(status_code=500, detail="router not initialized")
    return router_state
 def main() -> None:
    parser = argparse.ArgumentParser(description="Minimal local PD router")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--prefill",
        nargs=2,
        metavar=("URL", "BOOTSTRAP_PORT"),
        action="append",
        required=True,
    )
    parser.add_argument(
        "--decode",
        action="append",
        required=True,
    )
    parser.add_argument("--prefill-policy", default="round_robin")
    parser.add_argument("--decode-policy", default="manual")
    parser.add_argument("--request-timeout-s", type=float, default=1800.0)
    args = parser.parse_args()
    global router_state
    router_state = RouterState(
        RouterConfig(
            host=args.host,
            port=args.port,
            prefill_urls=[(url, int(port)) for url, port in args.prefill],
            decode_urls=list(args.decode),
            prefill_policy=args.prefill_policy,
            decode_policy=args.decode_policy,
            request_timeout_s=args.request_timeout_s,
        )
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
 if __name__ == "__main__":
    main()
--- a/src/agentic_pd_hybrid/policies.py
+++ b/src/agentic_pd_hybrid/policies.py
@@ -0,0 +1,235 @@
 from __future__ import annotations
 from collections import Counter
 from dataclasses import dataclass, field
 from typing import Protocol
 from agentic_pd_hybrid.topology import SingleNodeTopology
 from agentic_pd_hybrid.trace import TraceRequest
@dataclass
 class SessionRouteState:
    last_decode_worker: str | None = None
@dataclass
 class RoutingDecision:
    policy_name: str
    prefill_worker_id: str
    decode_worker_id: str
    decode_worker_index: int
    reuse_expected: bool
    observed_overlap_blocks: int
    kv_transfer_blocks: int
    inflight_decode_load: int
    session_id: str
    request_id: str
    turn_id: int
    @property
    def observed_reuse(self) -> bool:
        return self.observed_overlap_blocks > 0
    @property
    def re_prefill_required(self) -> bool:
        return self.turn_id > 1 and self.observed_overlap_blocks == 0
@dataclass
 class RoutingState:
    prefill_cursor: int = 0
    decode_cursor: int = 0
    session_state: dict[str, SessionRouteState] = field(default_factory=dict)
    inflight_decode: Counter[str] = field(default_factory=Counter)
    decode_assignment_counts: Counter[str] = field(default_factory=Counter)
    decode_resident_blocks: dict[str, set[int]] = field(default_factory=dict)
    @classmethod
    def create(cls, topology: SingleNodeTopology) -> "RoutingState":
        return cls(
            decode_resident_blocks={
                worker.worker_id: set() for worker in topology.route_workers
            }
        )
    def next_prefill_worker_id(self, topology: SingleNodeTopology) -> str:
        if not topology.prefill_workers:
            return "none"
        worker = topology.prefill_workers[self.prefill_cursor % len(topology.prefill_workers)]
        self.prefill_cursor += 1
        return worker.worker_id
    def next_decode_worker_id(self, topology: SingleNodeTopology) -> str:
        route_workers = topology.route_workers
        worker = route_workers[self.decode_cursor % len(route_workers)]
        self.decode_cursor += 1
        return worker.worker_id
    def finish(self, request: TraceRequest, decision: RoutingDecision) -> None:
        session = self.session_state.setdefault(request.session_id, SessionRouteState())
        session.last_decode_worker = decision.decode_worker_id
        self.decode_resident_blocks[decision.decode_worker_id].update(request.hash_ids)
        self.inflight_decode[decision.decode_worker_id] -= 1
        if self.inflight_decode[decision.decode_worker_id] <= 0:
            del self.inflight_decode[decision.decode_worker_id]
 class RoutingPolicy(Protocol):
    name: str
    def select(
        self,
        request: TraceRequest,
        *,
        topology: SingleNodeTopology,
        state: RoutingState,
    ) -> RoutingDecision:
        ...
@dataclass(frozen=True)
 class DefaultPolicy:
    name: str = "default"
    def select(
        self,
        request: TraceRequest,
        *,
        topology: SingleNodeTopology,
        state: RoutingState,
    ) -> RoutingDecision:
        prefill_worker_id = state.next_prefill_worker_id(topology)
        decode_worker_id = state.next_decode_worker_id(topology)
        return _build_decision(
            policy_name=self.name,
            request=request,
            topology=topology,
            state=state,
            prefill_worker_id=prefill_worker_id,
            decode_worker_id=decode_worker_id,
            reuse_expected=False,
        )
@dataclass(frozen=True)
 class StickyDecodePolicy:
    name: str = "sticky"
    def select(
        self,
        request: TraceRequest,
        *,
        topology: SingleNodeTopology,
        state: RoutingState,
    ) -> RoutingDecision:
        session = state.session_state.get(request.session_id)
        prefill_worker_id = state.next_prefill_worker_id(topology)
        if request.turn_id > 1 and session and session.last_decode_worker is not None:
            decode_worker_id = session.last_decode_worker
            reuse_expected = True
        else:
            decode_worker_id = state.next_decode_worker_id(topology)
            reuse_expected = False
        return _build_decision(
            policy_name=self.name,
            request=request,
            topology=topology,
            state=state,
            prefill_worker_id=prefill_worker_id,
            decode_worker_id=decode_worker_id,
            reuse_expected=reuse_expected,
        )
@dataclass(frozen=True)
 class KvAwarePolicy:
    name: str = "kv-aware"
    sticky_bonus: int = 1
    def select(
        self,
        request: TraceRequest,
        *,
        topology: SingleNodeTopology,
        state: RoutingState,
    ) -> RoutingDecision:
        prefill_worker_id = state.next_prefill_worker_id(topology)
        session = state.session_state.get(request.session_id)
        best_decode_worker_id: str | None = None
        best_score: tuple[int, int, int] | None = None
        for worker in topology.route_workers:
            overlap = _overlap_blocks(request, state, worker.worker_id)
            sticky = int(session is not None and session.last_decode_worker == worker.worker_id)
            inflight_penalty = -state.inflight_decode.get(worker.worker_id, 0)
            assignment_penalty = -state.decode_assignment_counts.get(worker.worker_id, 0)
            score = (
                overlap + sticky * self.sticky_bonus,
                sticky,
                inflight_penalty,
                assignment_penalty,
            )
            if best_score is None or score > best_score:
                best_score = score
                best_decode_worker_id = worker.worker_id
        assert best_decode_worker_id is not None
        reuse_expected = bool(best_score and best_score[0] > 0)
        return _build_decision(
            policy_name=self.name,
            request=request,
            topology=topology,
            state=state,
            prefill_worker_id=prefill_worker_id,
            decode_worker_id=best_decode_worker_id,
            reuse_expected=reuse_expected,
        )
 def create_policy(name: str) -> RoutingPolicy:
    normalized = name.strip().lower()
    if normalized == "default":
        return DefaultPolicy()
    if normalized == "sticky":
        return StickyDecodePolicy()
    if normalized in {"kv-aware", "kv_aware", "kv"}:
        return KvAwarePolicy()
    raise ValueError(f"Unsupported policy: {name}")
 def _build_decision(
    *,
    policy_name: str,
    request: TraceRequest,
    topology: SingleNodeTopology,
    state: RoutingState,
    prefill_worker_id: str,
    decode_worker_id: str,
    reuse_expected: bool,
 ) -> RoutingDecision:
    overlap = _overlap_blocks(request, state, decode_worker_id)
    state.inflight_decode[decode_worker_id] += 1
    state.decode_assignment_counts[decode_worker_id] += 1
    return RoutingDecision(
        policy_name=policy_name,
        prefill_worker_id=prefill_worker_id,
        decode_worker_id=decode_worker_id,
        decode_worker_index=topology.route_index(decode_worker_id),
        reuse_expected=reuse_expected,
        observed_overlap_blocks=overlap,
        kv_transfer_blocks=max(0, len(request.hash_ids) - overlap),
        inflight_decode_load=state.inflight_decode[decode_worker_id],
        session_id=request.session_id,
        request_id=request.request_id,
        turn_id=request.turn_id,
    )
 def _overlap_blocks(
    request: TraceRequest,
    state: RoutingState,
    decode_worker_id: str,
 ) -> int:
    resident = state.decode_resident_blocks.get(decode_worker_id, set())
    return sum(1 for block in request.hash_ids if block in resident)
--- a/src/agentic_pd_hybrid/replay.py
+++ b/src/agentic_pd_hybrid/replay.py
--- a/src/agentic_pd_hybrid/sampling.py
+++ b/src/agentic_pd_hybrid/sampling.py
@@ -0,0 +1,295 @@
 from __future__ import annotations
 import hashlib
 import json
 from collections import defaultdict
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Literal
 from agentic_pd_hybrid.trace import TraceRequest, load_trace
 SampleProfile = Literal["default", "small-append"]
@dataclass(frozen=True)
 class SessionSampleConfig:
    trace_path: Path
    output_path: Path
    target_duration_s: float = 600.0
    start_time_s: float = 0.0
    session_sample_rate: float = 1.0
    min_turns: int = 1
    max_requests: int | None = None
    profile: SampleProfile = "default"
    min_initial_input_tokens: int | None = None
    max_initial_input_tokens: int | None = None
    max_append_input_tokens: int | None = None
    max_output_tokens: int | None = None
    min_overlap_ratio: float | None = None
@dataclass(frozen=True)
 class SessionSampleSummary:
    input_trace_path: str
    output_trace_path: str
    request_count: int
    session_count: int
    multi_turn_session_count: int
    start_time_s: float
    end_time_s: float
    sampled_duration_s: float
    session_sample_rate: float
    min_turns: int
    profile: str
    min_initial_input_tokens: int | None
    max_initial_input_tokens: int | None
    max_append_input_tokens: int | None
    max_output_tokens: int | None
    min_overlap_ratio: float | None
    mean_append_input_tokens: float | None
    mean_turn_overlap_ratio: float | None
 def sample_trace_sessions(config: SessionSampleConfig) -> SessionSampleSummary:
    requests = load_trace(config.trace_path)
    sessions: dict[str, list[TraceRequest]] = defaultdict(list)
    for request in requests:
        sessions[request.session_id].append(request)
    filters = _resolve_filters(config)
    eligible_sessions = {
        session_id: session_requests
        for session_id, session_requests in sessions.items()
        if len(session_requests) >= filters.min_turns
        and _session_matches_filters(session_requests, filters)
        and _keep_session(session_id, config.session_sample_rate)
    }
    ordered_sessions = sorted(
        eligible_sessions.values(),
        key=lambda session_requests: session_requests[0].timestamp_s,
    )
    selected_requests: list[TraceRequest] = []
    sampled_start: float | None = None
    sampled_end: float | None = None
    for session_requests in ordered_sessions:
        session_first = session_requests[0].timestamp_s
        if session_first < config.start_time_s:
            continue
        if sampled_start is None:
            sampled_start = session_first
        selected_requests.extend(session_requests)
        sampled_end = max(request.timestamp_s for request in session_requests)
        if config.max_requests is not None and len(selected_requests) >= config.max_requests:
            break
        if sampled_end - sampled_start >= config.target_duration_s:
            break
    selected_requests.sort(key=lambda request: request.timestamp_s)
    if config.max_requests is not None:
        selected_requests = selected_requests[: config.max_requests]
    if not selected_requests:
        raise ValueError("Sampling produced no requests; adjust the sampling arguments")
    config.output_path.parent.mkdir(parents=True, exist_ok=True)
    with config.output_path.open("w", encoding="utf-8") as handle:
        for request in selected_requests:
            payload = {
                "request_id": request.request_id,
                "session_id": request.session_id,
                "chat_id": request.chat_id,
                "parent_chat_id": request.parent_chat_id,
                "timestamp": request.timestamp_s,
                "input_length": request.input_length,
                "output_length": request.output_length,
                "type": request.request_type,
                "turn": request.turn_id,
                "hash_ids": list(request.hash_ids),
            }
            handle.write(json.dumps(payload, sort_keys=True) + "\n")
    selected_session_ids = {request.session_id for request in selected_requests}
    selected_session_requests = [
        eligible_sessions[session_id] for session_id in selected_session_ids
    ]
    append_lengths = [
        length
        for session_requests in selected_session_requests
        for length in _turn_append_lengths(session_requests)
    ]
    overlap_ratios = [
        ratio
        for session_requests in selected_session_requests
        for ratio in _turn_overlap_ratios(session_requests)
    ]
    summary = SessionSampleSummary(
        input_trace_path=str(config.trace_path),
        output_trace_path=str(config.output_path),
        request_count=len(selected_requests),
        session_count=len(selected_session_ids),
        multi_turn_session_count=sum(
            1
            for session_id in selected_session_ids
            if len(eligible_sessions[session_id]) > 1
        ),
        start_time_s=selected_requests[0].timestamp_s,
        end_time_s=selected_requests[-1].timestamp_s,
        sampled_duration_s=selected_requests[-1].timestamp_s
        - selected_requests[0].timestamp_s,
        session_sample_rate=config.session_sample_rate,
        min_turns=filters.min_turns,
        profile=config.profile,
        min_initial_input_tokens=filters.min_initial_input_tokens,
        max_initial_input_tokens=filters.max_initial_input_tokens,
        max_append_input_tokens=filters.max_append_input_tokens,
        max_output_tokens=filters.max_output_tokens,
        min_overlap_ratio=filters.min_overlap_ratio,
        mean_append_input_tokens=_mean(append_lengths),
        mean_turn_overlap_ratio=_mean(overlap_ratios),
    )
    summary_path = config.output_path.with_suffix(config.output_path.suffix + ".summary.json")
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(asdict(summary), handle, indent=2, sort_keys=True)
    return summary
@dataclass(frozen=True)
 class _ResolvedFilters:
    min_turns: int
    min_initial_input_tokens: int | None
    max_initial_input_tokens: int | None
    max_append_input_tokens: int | None
    max_output_tokens: int | None
    min_overlap_ratio: float | None
 def _resolve_filters(config: SessionSampleConfig) -> _ResolvedFilters:
    if config.profile == "default":
        return _ResolvedFilters(
            min_turns=config.min_turns,
            min_initial_input_tokens=config.min_initial_input_tokens,
            max_initial_input_tokens=config.max_initial_input_tokens,
            max_append_input_tokens=config.max_append_input_tokens,
            max_output_tokens=config.max_output_tokens,
            min_overlap_ratio=config.min_overlap_ratio,
        )
    if config.profile != "small-append":
        raise ValueError(f"Unsupported sample profile: {config.profile}")
    return _ResolvedFilters(
        min_turns=max(config.min_turns, 2),
        min_initial_input_tokens=(
            2048
            if config.min_initial_input_tokens is None
            else config.min_initial_input_tokens
        ),
        max_initial_input_tokens=(
            16000
            if config.max_initial_input_tokens is None
            else config.max_initial_input_tokens
        ),
        max_append_input_tokens=(
            2048
            if config.max_append_input_tokens is None
            else config.max_append_input_tokens
        ),
        max_output_tokens=(
            2048 if config.max_output_tokens is None else config.max_output_tokens
        ),
        min_overlap_ratio=(
            0.75 if config.min_overlap_ratio is None else config.min_overlap_ratio
        ),
    )
 def _session_matches_filters(
    session_requests: list[TraceRequest],
    filters: _ResolvedFilters,
 ) -> bool:
    ordered = sorted(
        session_requests,
        key=lambda request: (request.timestamp_s, request.turn_id, request.chat_id),
    )
    if not ordered:
        return False
    initial = ordered[0]
    if (
        filters.min_initial_input_tokens is not None
        and initial.input_length < filters.min_initial_input_tokens
    ):
        return False
    if (
        filters.max_initial_input_tokens is not None
        and initial.input_length > filters.max_initial_input_tokens
    ):
        return False
    if filters.max_output_tokens is not None and any(
        request.output_length > filters.max_output_tokens for request in ordered
    ):
        return False
    append_lengths = _turn_append_lengths(ordered)
    if filters.max_append_input_tokens is not None and any(
        append_length <= 0 or append_length > filters.max_append_input_tokens
        for append_length in append_lengths
    ):
        return False
    overlap_ratios = _turn_overlap_ratios(ordered)
    if filters.min_overlap_ratio is not None and any(
        overlap_ratio < filters.min_overlap_ratio for overlap_ratio in overlap_ratios
    ):
        return False
    return True
 def _turn_append_lengths(session_requests: list[TraceRequest]) -> list[int]:
    ordered = sorted(
        session_requests,
        key=lambda request: (request.timestamp_s, request.turn_id, request.chat_id),
    )
    return [
        current.input_length - (previous.input_length + previous.output_length)
        for previous, current in zip(ordered, ordered[1:], strict=False)
    ]
 def _turn_overlap_ratios(session_requests: list[TraceRequest]) -> list[float]:
    ordered = sorted(
        session_requests,
        key=lambda request: (request.timestamp_s, request.turn_id, request.chat_id),
    )
    ratios: list[float] = []
    for previous, current in zip(ordered, ordered[1:], strict=False):
        if not current.hash_ids:
            ratios.append(0.0)
            continue
        previous_blocks = set(previous.hash_ids)
        overlap = sum(1 for block in current.hash_ids if block in previous_blocks)
        ratios.append(overlap / len(current.hash_ids))
    return ratios
 def _mean(values: list[int] | list[float]) -> float | None:
    if not values:
        return None
    return sum(values) / len(values)
 def _keep_session(session_id: str, sample_rate: float) -> bool:
    if sample_rate >= 1.0:
        return True
    if sample_rate <= 0.0:
        return False
    digest = hashlib.blake2b(session_id.encode("utf-8"), digest_size=8).digest()
    bucket = int.from_bytes(digest, byteorder="big", signed=False) / 2**64
    return bucket < sample_rate
--- a/src/agentic_pd_hybrid/stack.py
+++ b/src/agentic_pd_hybrid/stack.py
@@ -0,0 +1,222 @@
 from __future__ import annotations
 import os
 import signal
 import subprocess
 import time
 from dataclasses import dataclass
 from pathlib import Path
 import httpx
 from agentic_pd_hybrid.launcher import build_launch_plan
 from agentic_pd_hybrid.topology import SingleNodeTopology
@dataclass
 class ManagedProcess:
    name: str
    command: tuple[str, ...]
    process: subprocess.Popen[bytes]
    log_path: Path
@dataclass
 class ManagedPdStack:
    topology: SingleNodeTopology
    run_dir: Path
    prefill_processes: list[ManagedProcess]
    decode_processes: list[ManagedProcess]
    direct_processes: list[ManagedProcess]
    router_process: ManagedProcess | None
    @property
    def router_url(self) -> str:
        return self.topology.router_url
    def stop(self) -> None:
        processes = (
            ([self.router_process] if self.router_process is not None else [])
            + self.direct_processes
            + self.decode_processes
            + self.prefill_processes
        )
        for managed in processes:
            if managed.process.poll() is None:
                os.killpg(os.getpgid(managed.process.pid), signal.SIGTERM)
        deadline = time.time() + 20
        for managed in processes:
            if managed.process.poll() is not None:
                continue
            remaining = max(0.0, deadline - time.time())
            try:
                managed.process.wait(timeout=remaining)
            except subprocess.TimeoutExpired:
                if managed.process.poll() is None:
                    os.killpg(os.getpgid(managed.process.pid), signal.SIGKILL)
                    managed.process.wait(timeout=5)
 def launch_pd_stack(
    *,
    topology: SingleNodeTopology,
    run_dir: Path,
    prefill_policy: str,
    decode_policy: str,
    timeout_s: float = 1200.0,
    include_router: bool = True,
 ) -> ManagedPdStack:
    run_dir.mkdir(parents=True, exist_ok=True)
    logs_dir = run_dir / "logs"
    logs_dir.mkdir(parents=True, exist_ok=True)
    plan = build_launch_plan(
        topology,
        prefill_policy=prefill_policy,
        decode_policy=decode_policy,
        include_router=include_router,
    )
    prefill_processes = [
        _spawn_process(
            name=f"prefill-{idx}",
            command=command,
            log_path=logs_dir / f"prefill-{idx}.log",
            topology=topology,
        )
        for idx, command in enumerate(plan.prefill_commands)
    ]
    decode_processes = [
        _spawn_process(
            name=f"decode-{idx}",
            command=command,
            log_path=logs_dir / f"decode-{idx}.log",
            topology=topology,
        )
        for idx, command in enumerate(plan.decode_commands)
    ]
    direct_processes = [
        _spawn_process(
            name=f"direct-{idx}",
            command=command,
            log_path=logs_dir / f"direct-{idx}.log",
            topology=topology,
        )
        for idx, command in enumerate(plan.direct_commands)
    ]
    router_process: ManagedProcess | None = None
    try:
        for worker in topology.prefill_workers:
            _wait_for_ready_endpoint(f"{worker.url}/v1/models", timeout_s=timeout_s)
        for worker in topology.decode_workers:
            _wait_for_ready_endpoint(f"{worker.url}/v1/models", timeout_s=timeout_s)
        for worker in topology.direct_workers:
            _wait_for_ready_endpoint(f"{worker.url}/v1/models", timeout_s=timeout_s)
        if plan.router_command is not None:
            router_process = _spawn_process(
                name="router",
                command=plan.router_command,
                log_path=logs_dir / "router.log",
                topology=topology,
            )
            _wait_for_ready_endpoint(f"{topology.router_url}/health", timeout_s=timeout_s)
    except Exception:
        stack = ManagedPdStack(
            topology=topology,
            run_dir=run_dir,
            prefill_processes=prefill_processes,
            decode_processes=decode_processes,
            direct_processes=direct_processes,
            router_process=router_process,
        )
        stack.stop()
        raise
    return ManagedPdStack(
        topology=topology,
        run_dir=run_dir,
        prefill_processes=prefill_processes,
        decode_processes=decode_processes,
        direct_processes=direct_processes,
        router_process=router_process,
    )
 def _spawn_process(
    *,
    name: str,
    command: tuple[str, ...],
    log_path: Path,
    topology: SingleNodeTopology,
 ) -> ManagedProcess:
    log_handle = log_path.open("wb")
    env = _build_process_env(topology)
    process = subprocess.Popen(
        command,
        stdout=log_handle,
        stderr=subprocess.STDOUT,
        env=env,
        preexec_fn=os.setsid,
    )
    return ManagedProcess(
        name=name,
        command=command,
        process=process,
        log_path=log_path,
    )
 def _build_process_env(topology: SingleNodeTopology) -> dict[str, str]:
    env = os.environ.copy()
    env["PYTHONDONTWRITEBYTECODE"] = "1"
    env["PYTHONUNBUFFERED"] = "1"
    # SGLang's PD bootstrap path uses `requests`; force localhost traffic to stay local.
    for key in (
        "http_proxy",
        "https_proxy",
        "all_proxy",
        "HTTP_PROXY",
        "HTTPS_PROXY",
        "ALL_PROXY",
    ):
        env.pop(key, None)
    env["NO_PROXY"] = "*"
    env["no_proxy"] = "*"
    env.setdefault("SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", "600")
    env.setdefault("SGLANG_DISAGGREGATION_WAITING_TIMEOUT", "60")
    env.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1")
    if topology.force_rdma:
        env["MOONCAKE_PROTOCOL"] = "rdma"
        env["MC_MS_AUTO_DISC"] = "0"
        if topology.ib_device:
            env["MOONCAKE_DEVICE"] = topology.ib_device
    repo_root = Path(__file__).resolve().parents[2]
    python_paths = [
        str(repo_root / "src"),
        str(repo_root / "third_party" / "sglang" / "python"),
    ]
    existing_pythonpath = env.get("PYTHONPATH")
    if existing_pythonpath:
        python_paths.append(existing_pythonpath)
    env["PYTHONPATH"] = os.pathsep.join(python_paths)
    return env
 def _wait_for_ready_endpoint(url: str, *, timeout_s: float) -> None:
    start = time.perf_counter()
    last_error: str | None = None
    with httpx.Client(timeout=5.0, trust_env=False) as client:
        while time.perf_counter() - start < timeout_s:
            try:
                response = client.get(url)
                if response.status_code == 200:
                    return
                last_error = f"status={response.status_code}"
            except Exception as exc:  # pragma: no cover
                last_error = f"{type(exc).__name__}: {exc}"
            time.sleep(1.0)
    raise TimeoutError(f"Timed out waiting for {url} ({last_error})")
--- a/src/agentic_pd_hybrid/topology.py
+++ b/src/agentic_pd_hybrid/topology.py
@@ -0,0 +1,245 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal
 WorkerRole = Literal["prefill", "decode", "direct"]
@dataclass(frozen=True)
 class WorkerSpec:
    role: WorkerRole
    ordinal: int
    gpu_ids: tuple[int, ...]
    host: str
    port: int
    bootstrap_port: int | None = None
    @property
    def worker_id(self) -> str:
        return f"{self.role}-{self.ordinal}"
    @property
    def url(self) -> str:
        return f"http://{self.host}:{self.port}"
    @property
    def gpu_id(self) -> int:
        return self.gpu_ids[0]
    @property
    def tp_size(self) -> int:
        return len(self.gpu_ids)
@dataclass(frozen=True)
 class SingleNodeTopology:
    model_path: str
    prefill_workers: tuple[WorkerSpec, ...]
    decode_workers: tuple[WorkerSpec, ...]
    direct_workers: tuple[WorkerSpec, ...]
    router_host: str
    router_port: int
    transfer_backend: str
    trust_remote_code: bool
    force_rdma: bool = False
    ib_device: str | None = None
    extra_server_args: tuple[str, ...] = ()
    prefill_extra_server_args: tuple[str, ...] = ()
    decode_extra_server_args: tuple[str, ...] = ()
    direct_extra_server_args: tuple[str, ...] = ()
    @property
    def model_name(self) -> str:
        return Path(self.model_path).name
    @property
    def router_url(self) -> str:
        return f"http://{self.router_host}:{self.router_port}"
    @property
    def route_workers(self) -> tuple[WorkerSpec, ...]:
        if self.decode_workers:
            return self.decode_workers
        return self.direct_workers
    def route_index(self, worker_id: str) -> int:
        for idx, worker in enumerate(self.route_workers):
            if worker.worker_id == worker_id:
                return idx
        raise KeyError(f"Unknown route worker: {worker_id}")
 def build_single_node_topology(
    *,
    model_path: str,
    prefill_worker_count: int,
    decode_worker_count: int,
    direct_worker_count: int = 0,
    prefill_tp_size: int = 1,
    decode_tp_size: int = 1,
    direct_tp_size: int = 1,
    prefill_gpu_ids: tuple[int, ...] | None = None,
    decode_gpu_ids: tuple[int, ...] | None = None,
    direct_gpu_ids: tuple[int, ...] | None = None,
    total_gpu_budget: int = 8,
    host: str = "127.0.0.1",
    router_port: int = 8000,
    prefill_port_base: int = 30000,
    decode_port_base: int = 31000,
    direct_port_base: int = 32000,
    bootstrap_port_base: int = 8998,
    transfer_backend: str = "nixl",
    force_rdma: bool = False,
    trust_remote_code: bool = True,
    ib_device: str | None = None,
    extra_server_args: tuple[str, ...] = (),
    prefill_extra_server_args: tuple[str, ...] = (),
    decode_extra_server_args: tuple[str, ...] = (),
    direct_extra_server_args: tuple[str, ...] = (),
 ) -> SingleNodeTopology:
    if prefill_worker_count < 0:
        raise ValueError("prefill_worker_count must be >= 0")
    if decode_worker_count < 0:
        raise ValueError("decode_worker_count must be >= 0")
    if direct_worker_count < 0:
        raise ValueError("direct_worker_count must be >= 0")
    if (
        prefill_worker_count == 0
        and decode_worker_count == 0
        and direct_worker_count == 0
    ):
        raise ValueError("At least one worker must be configured")
    if prefill_tp_size <= 0:
        raise ValueError("prefill_tp_size must be >= 1")
    if decode_tp_size <= 0:
        raise ValueError("decode_tp_size must be >= 1")
    if direct_tp_size <= 0:
        raise ValueError("direct_tp_size must be >= 1")
    if force_rdma and not ib_device:
        raise ValueError("force_rdma requires --ib-device to be set")
    if force_rdma and transfer_backend != "mooncake":
        raise ValueError(
            "force_rdma currently requires transfer_backend='mooncake' "
            "to guarantee an RDMA path"
        )
    total_gpus_required = (
        prefill_worker_count * prefill_tp_size
        + decode_worker_count * decode_tp_size
        + direct_worker_count * direct_tp_size
    )
    if total_gpus_required > total_gpu_budget:
        raise ValueError(
            "Single-node GPU budget exceeded: "
            f"{prefill_worker_count} prefill x tp={prefill_tp_size} + "
            f"{decode_worker_count} decode x tp={decode_tp_size} + "
            f"{direct_worker_count} direct x tp={direct_tp_size} > "
            f"{total_gpu_budget} GPUs"
        )
    if prefill_gpu_ids is None:
        prefill_gpu_ids = tuple(range(prefill_worker_count * prefill_tp_size))
    if decode_gpu_ids is None:
        decode_gpu_ids = tuple(
            range(
                len(prefill_gpu_ids),
                len(prefill_gpu_ids) + decode_worker_count * decode_tp_size,
            )
        )
    if direct_gpu_ids is None:
        direct_gpu_ids = tuple(
            range(
                len(prefill_gpu_ids) + len(decode_gpu_ids),
                len(prefill_gpu_ids)
                + len(decode_gpu_ids)
                + direct_worker_count * direct_tp_size,
            )
        )
    if len(prefill_gpu_ids) != prefill_worker_count * prefill_tp_size:
        raise ValueError(
            "prefill_gpu_ids length must equal prefill_worker_count * prefill_tp_size: "
            f"{len(prefill_gpu_ids)} != {prefill_worker_count * prefill_tp_size}"
        )
    if len(decode_gpu_ids) != decode_worker_count * decode_tp_size:
        raise ValueError(
            "decode_gpu_ids length must equal decode_worker_count * decode_tp_size: "
            f"{len(decode_gpu_ids)} != {decode_worker_count * decode_tp_size}"
        )
    if len(direct_gpu_ids) != direct_worker_count * direct_tp_size:
        raise ValueError(
            "direct_gpu_ids length must equal direct_worker_count * direct_tp_size: "
            f"{len(direct_gpu_ids)} != {direct_worker_count * direct_tp_size}"
        )
    assigned_gpu_ids = prefill_gpu_ids + decode_gpu_ids + direct_gpu_ids
    if len(set(assigned_gpu_ids)) != len(assigned_gpu_ids):
        raise ValueError("prefill/decode/direct GPU IDs must be unique")
    if any(gpu_id < 0 or gpu_id >= total_gpu_budget for gpu_id in assigned_gpu_ids):
        raise ValueError(
            "GPU IDs must fall within the single-node budget range "
            f"[0, {total_gpu_budget - 1}]"
        )
    prefill_workers = tuple(
        WorkerSpec(
            role="prefill",
            ordinal=idx,
            gpu_ids=tuple(
                prefill_gpu_ids[
                    idx * prefill_tp_size : (idx + 1) * prefill_tp_size
                ]
            ),
            host=host,
            port=prefill_port_base + idx,
            bootstrap_port=bootstrap_port_base + idx,
        )
        for idx in range(prefill_worker_count)
    )
    decode_workers = tuple(
        WorkerSpec(
            role="decode",
            ordinal=idx,
            gpu_ids=tuple(
                decode_gpu_ids[
                    idx * decode_tp_size : (idx + 1) * decode_tp_size
                ]
            ),
            host=host,
            port=decode_port_base + idx,
        )
        for idx in range(decode_worker_count)
    )
    direct_workers = tuple(
        WorkerSpec(
            role="direct",
            ordinal=idx,
            gpu_ids=tuple(
                direct_gpu_ids[
                    idx * direct_tp_size : (idx + 1) * direct_tp_size
                ]
            ),
            host=host,
            port=direct_port_base + idx,
        )
        for idx in range(direct_worker_count)
    )
    return SingleNodeTopology(
        model_path=model_path,
        prefill_workers=prefill_workers,
        decode_workers=decode_workers,
        direct_workers=direct_workers,
        router_host=host,
        router_port=router_port,
        transfer_backend=transfer_backend,
        trust_remote_code=trust_remote_code,
        force_rdma=force_rdma,
        ib_device=ib_device,
        extra_server_args=extra_server_args,
        prefill_extra_server_args=prefill_extra_server_args,
        decode_extra_server_args=decode_extra_server_args,
        direct_extra_server_args=direct_extra_server_args,
    )
--- a/src/agentic_pd_hybrid/trace.py
+++ b/src/agentic_pd_hybrid/trace.py
@@ -0,0 +1,106 @@
 from __future__ import annotations
 import json
 from dataclasses import dataclass
 from pathlib import Path
@dataclass(frozen=True)
 class TraceRequest:
    request_id: str
    session_id: str
    chat_id: int
    parent_chat_id: int
    timestamp_s: float
    input_length: int
    output_length: int
    request_type: str
    turn_id: int
    hash_ids: tuple[int, ...]
 def load_trace(path: Path, *, request_limit: int | None = None) -> list[TraceRequest]:
    chat_to_session: dict[int, str] = {}
    requests: list[TraceRequest] = []
    with path.open("r", encoding="utf-8") as handle:
        for index, line in enumerate(handle):
            if request_limit is not None and len(requests) >= request_limit:
                break
            payload = json.loads(line)
            chat_id = int(payload["chat_id"])
            parent_chat_id = int(payload["parent_chat_id"])
            session_id = _resolve_session_id(
                chat_id=chat_id,
                parent_chat_id=parent_chat_id,
                chat_to_session=chat_to_session,
            )
            turn_id = int(payload["turn"])
            request_id = f"{session_id}:{turn_id}:{chat_id}:{index}"
            requests.append(
                TraceRequest(
                    request_id=request_id,
                    session_id=session_id,
                    chat_id=chat_id,
                    parent_chat_id=parent_chat_id,
                    timestamp_s=float(payload["timestamp"]),
                    input_length=int(payload["input_length"]),
                    output_length=int(payload["output_length"]),
                    request_type=str(payload["type"]),
                    turn_id=turn_id,
                    hash_ids=tuple(int(item) for item in payload.get("hash_ids", [])),
                )
            )
    return requests
 def build_synthetic_prompt(
    request: TraceRequest,
    *,
    block_token_budget: int = 24,
 ) -> str:
    return " ".join(build_synthetic_prompt_tokens(request, block_token_budget=block_token_budget))
 def build_synthetic_prompt_tokens(
    request: TraceRequest,
    *,
    block_token_budget: int = 24,
 ) -> list[str]:
    tokens: list[str] = []
    for hash_id in request.hash_ids:
        for offset in range(block_token_budget):
            tokens.append(f"blk{hash_id}_{offset}")
    while len(tokens) < request.input_length:
        tokens.append(f"fill_{len(tokens) % 64}")
    return tokens[: request.input_length]
 def build_synthetic_append_chunk(
    request: TraceRequest,
    append_length: int,
 ) -> str:
    if append_length <= 0:
        return ""
    return " ".join(
        f"turn{request.turn_id}_append_{request.chat_id}_{offset}"
        for offset in range(append_length)
    )
 def _resolve_session_id(
    *,
    chat_id: int,
    parent_chat_id: int,
    chat_to_session: dict[int, str],
 ) -> str:
    if parent_chat_id < 0:
        session_id = str(chat_id)
    else:
        session_id = chat_to_session.get(parent_chat_id, str(parent_chat_id))
    chat_to_session[chat_id] = session_id
    return session_id
--- a/src/agentic_pd_hybrid/trace_profiles.py
+++ b/src/agentic_pd_hybrid/trace_profiles.py
@@ -0,0 +1,127 @@
 from __future__ import annotations
 import json
 from collections import defaultdict
 from dataclasses import asdict, dataclass
 from math import ceil
 from pathlib import Path
 from agentic_pd_hybrid.trace import TraceRequest, load_trace
 BLOCK_TOKEN_BUDGET = 24
@dataclass(frozen=True)
 class NormalizeTraceLengthsConfig:
    trace_path: Path
    output_path: Path
    initial_input_length: int = 10_000
    append_input_length: int = 1_000
    output_length: int = 1_000
    max_requests: int | None = None
@dataclass(frozen=True)
 class NormalizeTraceLengthsSummary:
    input_trace_path: str
    output_trace_path: str
    request_count: int
    session_count: int
    multi_turn_session_count: int
    initial_input_length: int
    append_input_length: int
    output_length: int
    max_turns_per_session: int
    max_input_length: int
 def normalize_trace_lengths(
    config: NormalizeTraceLengthsConfig,
 ) -> NormalizeTraceLengthsSummary:
    if config.initial_input_length < 0:
        raise ValueError("initial_input_length must be >= 0")
    if config.append_input_length < 0:
        raise ValueError("append_input_length must be >= 0")
    if config.output_length < 0:
        raise ValueError("output_length must be >= 0")
    requests = load_trace(config.trace_path, request_limit=config.max_requests)
    sessions: dict[str, list[TraceRequest]] = defaultdict(list)
    for request in requests:
        sessions[request.session_id].append(request)
    normalized_records: list[dict[str, object]] = []
    max_turns_per_session = 0
    max_input_length = 0
    for session_idx, session_id in enumerate(sorted(sessions, key=_session_sort_key)):
        session_requests = sorted(
            sessions[session_id],
            key=lambda request: (request.timestamp_s, request.turn_id, request.chat_id),
        )
        max_turns_per_session = max(max_turns_per_session, len(session_requests))
        base_block_count = ceil(config.initial_input_length / BLOCK_TOKEN_BUDGET)
        base_hash_ids = [
            _hash_id_for(session_idx=session_idx, block_idx=block_idx)
            for block_idx in range(base_block_count)
        ]
        for turn_idx, request in enumerate(session_requests):
            input_length = config.initial_input_length + turn_idx * (
                config.append_input_length + config.output_length
            )
            total_block_count = ceil(input_length / BLOCK_TOKEN_BUDGET)
            hash_ids = base_hash_ids + [
                _hash_id_for(
                    session_idx=session_idx,
                    block_idx=base_block_count + append_block_idx,
                )
                for append_block_idx in range(max(0, total_block_count - base_block_count))
            ]
            max_input_length = max(max_input_length, input_length)
            normalized_records.append(
                {
                    "chat_id": request.chat_id,
                    "parent_chat_id": request.parent_chat_id,
                    "timestamp": request.timestamp_s,
                    "input_length": input_length,
                    "output_length": config.output_length,
                    "type": request.request_type,
                    "turn": request.turn_id,
                    "hash_ids": hash_ids,
                }
            )
    normalized_records.sort(key=lambda item: float(item["timestamp"]))
    config.output_path.parent.mkdir(parents=True, exist_ok=True)
    with config.output_path.open("w", encoding="utf-8") as handle:
        for record in normalized_records:
            handle.write(json.dumps(record, sort_keys=True) + "\n")
    summary = NormalizeTraceLengthsSummary(
        input_trace_path=str(config.trace_path),
        output_trace_path=str(config.output_path),
        request_count=len(normalized_records),
        session_count=len(sessions),
        multi_turn_session_count=sum(
            1 for session_requests in sessions.values() if len(session_requests) > 1
        ),
        initial_input_length=config.initial_input_length,
        append_input_length=config.append_input_length,
        output_length=config.output_length,
        max_turns_per_session=max_turns_per_session,
        max_input_length=max_input_length,
    )
    summary_path = config.output_path.with_suffix(config.output_path.suffix + ".summary.json")
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(asdict(summary), handle, indent=2, sort_keys=True)
    return summary
 def _hash_id_for(*, session_idx: int, block_idx: int) -> int:
    return session_idx * 1_000_000 + block_idx
 def _session_sort_key(session_id: str) -> tuple[int, str]:
    return (0, session_id) if session_id.isdigit() else (1, session_id)
--- a/uv.lock
+++ b/uv.lock