{
  "study_id": "example-chat-window",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": ["dash0", "dash1"]
  },
  "model": {
    "model_id": "qwen3-30b",
    "served_model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.x",
    "exec_path": "/usr/local/bin/vllm",
    "cwd": ".",
    "host": "127.0.0.1",
    "port": 8000,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 600,
    "request_timeout_s": 600,
    "launch_args": ["serve", "/path/to/model"],
    "base_envs": {},
    "base_flags": {
      "host": "127.0.0.1",
      "port": 8000,
      "served-model-name": "Qwen/Qwen3-30B-A3B-Instruct-2507"
    },
    "tunable_envs": [
      "VLLM_ATTENTION_BACKEND",
      "CUDA_GRAPH_MAX_BATCH_SIZE"
    ],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "pipeline-parallel-size",
      "max-num-seqs",
      "max-num-batched-tokens",
      "gpu-memory-utilization",
      "enable-prefix-caching",
      "block-size"
    ],
    "python_executable": "python3"
  },
  "trace": {
    "windows_path": "trace_windows/windows.json",
    "window_id": "chat_w_example_peak_0001",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        { "max_input_tokens": 4096, "threshold_ms": 2000 },
        { "max_input_tokens": 16384, "threshold_ms": 4000 },
        { "threshold_ms": 8000 }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 120
    }
  },
  "search": {
    "low": 0.0,
    "high": 1.0,
    "tolerance": 0.01,
    "max_probes": 8,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target.",
    "max_history_trials": 8,
    "endpoint": {
      "base_url": "https://example-openai-compatible-endpoint",
      "model": "gpt-4.1-mini",
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 120
    }
  },
  "capability_profile_path": "capability.example.json"
}