{ "suite_id": "rs3_tiny_smoke", "sim": "frontier_patched", "frontier": { "root": "/tmp/replayserve-frontier-rs1b", "mode": "patched_scratch", "patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch" }, "fixtures": [ "coder_100" ], "defaults": { "simulation_mode": "online", "sys_arch": "co-location", "cluster_scheduler": "sticky_round_robin", "replica_scheduler": "vllm_v1", "model_name": "Qwen/Qwen3-32B", "device": "a800", "network_device": "a800_dgx", "attn_tensor_parallel_size": 2, "attn_data_parallel_size": 1, "moe_tensor_parallel_size": 1, "moe_expert_parallel_size": 1, "num_pipeline_stages": 1, "num_replicas": 1, "batch_size_cap": 128, "max_tokens_in_batch": 32768, "block_size": 16, "enable_prefix_caching": true, "enable_chunked_prefill": true, "long_prefill_token_threshold": 64, "trace_max_tokens": 32768, "num_blocks_mode": "memory_planner", "gpu_memory_utilization": 0.9, "non_kv_cache_overhead_bytes": 0, "dummy_execution_time_ms": 1.0 }, "configs": [ { "id": "fixed_prefix_on", "description": "RS1 fixed config on patched Frontier scratch." }, { "id": "prefix_cache_off", "description": "Diagnosis/control config with prefix cache disabled; all other fixed scheduler knobs unchanged.", "overrides": { "enable_prefix_caching": false } } ] }