From ccbf24ac47a142b525cc4a90a791302e36c8d54a Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sat, 2 May 2026 10:03:59 +0800
Subject: [PATCH] Use time-compressed community vllm ablation

---
 .../dash0_qwen30b_a3b_community_vllm020_harness.json       | 4 ++--
 .../dash0_qwen30b_a3b_community_vllm020_noharness.json     | 4 ++--
 .../harness-early-stop-ablation-20260502.md                | 7 ++++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
index 74c2440..d98d7b0 100644
--- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
+++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
@@ -1,5 +1,5 @@
 {
-  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-harness",
+  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-harness",
   "hardware": {
     "gpu_count": 8,
     "gpu_model": "H20",
@@ -73,7 +73,7 @@
       "max_input_tokens": 8192
     },
     "max_requests_per_probe": 512,
-    "replay_time_scale": 1.0,
+    "replay_time_scale": 0.1,
     "early_stop_max_lag_s": 120.0,
     "early_stop_max_elapsed_s": 900.0
   },
diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
index ef0933d..92c2be0 100644
--- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
+++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
@@ -1,5 +1,5 @@
 {
-  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-noharness",
+  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-noharness",
   "hardware": {
     "gpu_count": 8,
     "gpu_model": "H20",
@@ -73,7 +73,7 @@
       "max_input_tokens": 8192
     },
     "max_requests_per_probe": 512,
-    "replay_time_scale": 1.0,
+    "replay_time_scale": 0.1,
     "early_stop_max_lag_s": 120.0,
     "early_stop_max_elapsed_s": 900.0
   },
diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
index 839373e..16b0787 100644
--- a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
+++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
@@ -40,6 +40,7 @@ The experiment reuses the 0-8k chat window that has already been used for qwen27
 | input filter | 0 to 8192 tokens |
 | completion tokens | fixed 128 via `trace.completion_tokens_override` |
 | max requests per probe | 512 |
+| replay time scale | 0.1 |
 | target pass rate | 0.95 |
 | TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above |
 | TPOT SLO | 50ms |
@@ -48,7 +49,7 @@ The experiment reuses the 0-8k chat window that has already been used for qwen27
 
 The `max_requests_per_probe=512` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. A trace-only count check gives 31 to 65 selected requests across the six binary-search thresholds, avoiding the invalid low-cap case where early thresholds can select zero requests.
 
-The first full-output attempt showed why a bounded replay is needed for a 12-iteration ablation: at the first threshold (`0.0625`), 31 selected requests contained 14,849 output tokens with `out_max=2981`. That makes one probe too slow to finish a full no-harness/harness pair. The active ablation therefore fixes output length at 128 tokens and limits each trial to four binary-search probes. This changes the decode mix, so the result should be interpreted as a community-vLLM harness convergence test under a bounded chat replay, not as a full-output production benchmark.
+The first full-output attempt showed why a bounded replay is needed for a 12-iteration ablation: at the first threshold (`0.0625`), 31 selected requests contained 14,849 output tokens with `out_max=2981`. That makes one probe too slow to finish a full no-harness/harness pair. The first out128 attempt with `replay_time_scale=1.0` was still paced by real window time, so each probe took close to the original window duration no matter how quickly the requests completed. The active ablation therefore fixes output length at 128 tokens, uses `replay_time_scale=0.1`, and limits each trial to four binary-search probes. `load_trace_requests` scales both request arrivals and the window duration, so reported request rates are the actual compressed replay request rates. This changes the load/decode mix, so the result should be interpreted as a community-vLLM harness convergence test under a bounded, time-compressed chat replay, not as a full-output production benchmark.
 
 ## Harness Update Under Test
 
@@ -91,8 +92,8 @@ Pending dash0 runs:
 
 | Variant | tmux session | Log | Study root |
 | --- | --- | --- | --- |
-| no-harness | `qwen30b_vllm020_noharness_out128_probe512_20260502` | `logs/qwen30b_vllm020_noharness_out128_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-noharness` |
-| harness | `qwen30b_vllm020_harness_out128_probe512_20260502` | `logs/qwen30b_vllm020_harness_out128_probe512_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-probe512-harness` |
+| no-harness | `qwen30b_vllm020_noharness_out128_scale01_20260502` | `logs/qwen30b_vllm020_noharness_out128_scale01_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-noharness` |
+| harness | `qwen30b_vllm020_harness_out128_scale01_20260502` | `logs/qwen30b_vllm020_harness_out128_scale01_20260502.log` | `.aituner-community-vllm020/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-out128-scale01-harness` |
 
 The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.