replaysim/configs/rs3_tiny_sweep.json

{
  "suite_id": "rs3_tiny_smoke",
  "sim": "frontier_patched",
  "frontier": {
    "root": "/tmp/replayserve-frontier-rs1b",
    "mode": "patched_scratch",
    "patch": "patches/frontier-vllm-v1-prefix-cache-chunked-prefill.patch"
  },
  "fixtures": [
    "coder_100"
  ],
  "defaults": {
    "simulation_mode": "online",
    "sys_arch": "co-location",
    "cluster_scheduler": "sticky_round_robin",
    "replica_scheduler": "vllm_v1",
    "model_name": "Qwen/Qwen3-32B",
    "device": "a800",
    "network_device": "a800_dgx",
    "attn_tensor_parallel_size": 2,
    "attn_data_parallel_size": 1,
    "moe_tensor_parallel_size": 1,
    "moe_expert_parallel_size": 1,
    "num_pipeline_stages": 1,
    "num_replicas": 1,
    "batch_size_cap": 128,
    "max_tokens_in_batch": 32768,
    "block_size": 16,
    "enable_prefix_caching": true,
    "enable_chunked_prefill": true,
    "long_prefill_token_threshold": 64,
    "trace_max_tokens": 32768,
    "num_blocks_mode": "memory_planner",
    "gpu_memory_utilization": 0.9,
    "non_kv_cache_overhead_bytes": 0,
    "dummy_execution_time_ms": 1.0
  },
  "configs": [
    {
      "id": "fixed_prefix_on",
      "description": "RS1 fixed config on patched Frontier scratch; suite defaults apply unchanged, so prefix caching stays enabled."
    },
    {
      "id": "prefix_cache_off",
      "description": "Diagnosis/control config with prefix cache disabled; all other fixed scheduler knobs unchanged.",
      "overrides": {
        "enable_prefix_caching": false
      }
    }
  ]
}