Tighten LLM proposal schema
This commit is contained in:
103
configs/examples/dash0_llm_10min_study_run1f.json
Normal file
103
configs/examples/dash0_llm_10min_study_run1f.json
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
{
|
||||||
|
"study_id": "dash0-qwen30b-chat-10min-llm-run1f",
|
||||||
|
"hardware": {
|
||||||
|
"gpu_count": 4,
|
||||||
|
"gpu_model": "H20",
|
||||||
|
"host_candidates": [
|
||||||
|
"dash0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"model_id": "qwen3-30b-a3b",
|
||||||
|
"served_model_name": "qwen3-30b-smoke"
|
||||||
|
},
|
||||||
|
"engine": {
|
||||||
|
"engine_name": "vllm",
|
||||||
|
"engine_version": "0.13.0rc2.dev2111+gb44b43f43.d20260309",
|
||||||
|
"exec_path": "/usr/local/bin/vllm",
|
||||||
|
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 18081,
|
||||||
|
"healthcheck_path": "/v1/models",
|
||||||
|
"ready_timeout_s": 900,
|
||||||
|
"request_timeout_s": 900,
|
||||||
|
"launch_args": [
|
||||||
|
"serve",
|
||||||
|
"/home/admin/resource/model/464482ce.qwen3-30b-a3b/1m-instruct-0726-fp4"
|
||||||
|
],
|
||||||
|
"base_envs": {
|
||||||
|
"CUDA_VISIBLE_DEVICES": "4,5,6,7",
|
||||||
|
"VLLM_FP8_USE_BLADNN": "1",
|
||||||
|
"VLLM_MOE_USE_BLADNN": "1"
|
||||||
|
},
|
||||||
|
"base_flags": {
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 18081,
|
||||||
|
"served-model-name": "qwen3-30b-smoke",
|
||||||
|
"max-model-len": 65536,
|
||||||
|
"disable-log-requests": true,
|
||||||
|
"trust-remote-code": true
|
||||||
|
},
|
||||||
|
"tunable_envs": [
|
||||||
|
"VLLM_ATTENTION_BACKEND"
|
||||||
|
],
|
||||||
|
"tunable_flags": [
|
||||||
|
"tensor-parallel-size",
|
||||||
|
"max-num-seqs",
|
||||||
|
"max-num-batched-tokens",
|
||||||
|
"gpu-memory-utilization",
|
||||||
|
"block-size"
|
||||||
|
],
|
||||||
|
"python_executable": "python3"
|
||||||
|
},
|
||||||
|
"trace": {
|
||||||
|
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
|
||||||
|
"window_id": "chat_w20260311_1000",
|
||||||
|
"u_field": "sampling_u",
|
||||||
|
"timestamp_field": "timestamp",
|
||||||
|
"max_concurrency": 64,
|
||||||
|
"replay_time_scale": 1.0,
|
||||||
|
"early_stop_max_lag_s": 120.0,
|
||||||
|
"early_stop_max_elapsed_s": 900.0
|
||||||
|
},
|
||||||
|
"slo": {
|
||||||
|
"target_pass_rate": 0.95,
|
||||||
|
"ttft_rule": {
|
||||||
|
"kind": "step_ms",
|
||||||
|
"buckets": [
|
||||||
|
{
|
||||||
|
"max_input_tokens": 4096,
|
||||||
|
"threshold_ms": 15000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"max_input_tokens": 16384,
|
||||||
|
"threshold_ms": 30000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"threshold_ms": 45000
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"tpot_rule": {
|
||||||
|
"kind": "fixed_ms",
|
||||||
|
"threshold_ms": 1500
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"search": {
|
||||||
|
"low": 0.0,
|
||||||
|
"high": 1.0,
|
||||||
|
"tolerance": 0.1,
|
||||||
|
"max_probes": 4,
|
||||||
|
"sample_seed": 20260325
|
||||||
|
},
|
||||||
|
"llm": {
|
||||||
|
"system_prompt": "Propose a single engine config patch that increases the maximum feasible sampling_u under the SLO target.",
|
||||||
|
"max_history_trials": 8,
|
||||||
|
"endpoint": {
|
||||||
|
"base_url": "http://tianx.ipads-lab.se.sjtu.edu.cn:8317/v1",
|
||||||
|
"model": "gpt-5.4",
|
||||||
|
"api_key_env": "OPENAI_API_KEY",
|
||||||
|
"timeout_s": 180
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -31,7 +31,9 @@ def build_prompt(
|
|||||||
"You are tuning an OpenAI-compatible serving engine.",
|
"You are tuning an OpenAI-compatible serving engine.",
|
||||||
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures.",
|
"Return exactly one JSON object with keys: observation, diagnosis, config_patch, expected_effects, why_not_previous_failures.",
|
||||||
"config_patch must contain env_patch and flag_patch.",
|
"config_patch must contain env_patch and flag_patch.",
|
||||||
|
"expected_effects must be a JSON array of short strings, not an object.",
|
||||||
"Only use allowed tunable env keys and allowed tunable flag keys.",
|
"Only use allowed tunable env keys and allowed tunable flag keys.",
|
||||||
|
"Do not wrap the JSON in markdown fences or any extra text.",
|
||||||
"",
|
"",
|
||||||
"Study stack:",
|
"Study stack:",
|
||||||
json.dumps(
|
json.dumps(
|
||||||
|
|||||||
@@ -398,6 +398,17 @@ class Proposal:
|
|||||||
expected_effects = data.get("expected_effects")
|
expected_effects = data.get("expected_effects")
|
||||||
if isinstance(expected_effects, str):
|
if isinstance(expected_effects, str):
|
||||||
expected_effects_value = [expected_effects.strip()] if expected_effects.strip() else []
|
expected_effects_value = [expected_effects.strip()] if expected_effects.strip() else []
|
||||||
|
elif isinstance(expected_effects, Mapping):
|
||||||
|
expected_effects_value = []
|
||||||
|
for key, value in expected_effects.items():
|
||||||
|
key_text = str(key).strip()
|
||||||
|
value_text = str(value).strip()
|
||||||
|
if key_text and value_text:
|
||||||
|
expected_effects_value.append(f"{key_text}: {value_text}")
|
||||||
|
elif key_text:
|
||||||
|
expected_effects_value.append(key_text)
|
||||||
|
elif value_text:
|
||||||
|
expected_effects_value.append(value_text)
|
||||||
else:
|
else:
|
||||||
expected_effects_value = _coerce_str_list(
|
expected_effects_value = _coerce_str_list(
|
||||||
expected_effects, context="proposal.expected_effects"
|
expected_effects, context="proposal.expected_effects"
|
||||||
|
|||||||
@@ -682,6 +682,23 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(proposal.expected_effects, ["higher throughput"])
|
self.assertEqual(proposal.expected_effects, ["higher throughput"])
|
||||||
|
|
||||||
|
def test_proposal_expected_effects_accepts_object(self) -> None:
    """Proposal.from_dict flattens a mapping-valued expected_effects.

    The schema documents expected_effects as an array of short strings,
    but an LLM may emit a JSON object instead; each entry must be coerced
    to a "key: value" string, preserving insertion order.
    """
    # expected_effects deliberately uses the object shape, not a list.
    raw = {
        "observation": "obs",
        "diagnosis": "diag",
        "config_patch": {"env_patch": {}, "flag_patch": {}},
        "expected_effects": {
            "throughput": "higher",
            "ttft": "lower",
        },
    }
    proposal = Proposal.from_dict(raw)
    # Each mapping entry is flattened to "key: value" in source order.
    expected = ["throughput: higher", "ttft: lower"]
    self.assertEqual(proposal.expected_effects, expected)
|
||||||
|
|
||||||
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
|
def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
|
||||||
requests = [
|
requests = [
|
||||||
TraceRequest(
|
TraceRequest(
|
||||||
|
|||||||
Reference in New Issue
Block a user