diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json index 1e6fa48..74c2440 100644 --- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json @@ -1,5 +1,5 @@ { - "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-harness", + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-harness", "hardware": { "gpu_count": 8, "gpu_model": "H20", @@ -64,6 +64,7 @@ "trace": { "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", "window_id": "chat_w20260311_1000", + "completion_tokens_override": 128, "u_field": "sampling_u", "timestamp_field": "timestamp", "max_concurrency": 64, @@ -103,7 +104,7 @@ "low": 0.0, "high": 0.125, "tolerance": 0.001, - "max_probes": 6, + "max_probes": 4, "sample_seed": 20260325 }, "llm": { diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json index 8008fb8..ef0933d 100644 --- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json @@ -1,5 +1,5 @@ { - "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-noharness", + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-noharness", "hardware": { "gpu_count": 8, "gpu_model": "H20", @@ -64,6 +64,7 @@ "trace": { "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", "window_id": "chat_w20260311_1000", + "completion_tokens_override": 128, "u_field": "sampling_u", "timestamp_field": "timestamp", "max_concurrency": 64, @@ -103,7 +104,7 @@ "low": 0.0, "high": 0.125, "tolerance": 0.001, - "max_probes": 6, + "max_probes": 4, "sample_seed": 20260325 }, "llm": { diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md index 3349f82..839373e 100644 --- a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md +++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md @@ -38,15 +38,18 @@ The experiment reuses the 0-8k chat window that has already been used for qwen27 | window | `chat_w20260311_1000` | | source rows | 32606 | | input filter | 0 to 8192 tokens | +| completion tokens | fixed 128 via `trace.completion_tokens_override` | | max requests per probe | 512 | | target pass rate | 0.95 | | TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above | | TPOT SLO | 50ms | | search high | 0.125 sampling_u | -| max probes per trial | 6 | +| max probes per trial | 4 | The `max_requests_per_probe=512` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. A trace-only count check gives 31 to 65 selected requests across the six binary-search thresholds, avoiding the invalid low-cap case where early thresholds can select zero requests. +The first full-output attempt showed why a bounded replay is needed for a 12-iteration ablation: at the first threshold (`0.0625`), 31 selected requests contained 14,849 output tokens with `out_max=2981`. That makes one probe too slow to finish a full no-harness/harness pair. The active ablation therefore fixes output length at 128 tokens and limits each trial to four binary-search probes. This changes the decode mix, so the result should be interpreted as a community-vLLM harness convergence test under a bounded chat replay, not as a full-output production benchmark. + ## Harness Update Under Test This run tests a stricter early-stop harness: @@ -88,8 +91,8 @@ Pending dash0 runs: | Variant | tmux session | Log | Study root | | --- | --- | --- | --- | -| no-harness | `qwen30b_vllm020_noharness_probe512_20260502` | `logs/qwen30b_vllm020_noharness_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-noharness` | -| harness | `qwen30b_vllm020_harness_probe512_20260502` | `logs/qwen30b_vllm020_harness_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-probe512-harness` | +| no-harness | `qwen30b_vllm020_noharness_out128_probe512_20260502` | `logs/qwen30b_vllm020_noharness_out128_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-noharness` | +| harness | `qwen30b_vllm020_harness_out128_probe512_20260502` | `logs/qwen30b_vllm020_harness_out128_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-harness` | The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.