diff --git a/docs/harness-tuning-progress.md b/docs/harness-tuning-progress.md
index d9b1e2b..d489de4 100644
--- a/docs/harness-tuning-progress.md
+++ b/docs/harness-tuning-progress.md
@@ -61,6 +61,21 @@ Improve AITuner convergence for the `dash0` internal vLLM + Qwen3.5-27B 0-8k cha
 - Remote `compileall` passed.
 - Remote `unittest discover` initially exposed two pre-existing path-sensitive tests that hardcoded `/home/gahow/phd/aituner`; fixed them to derive `REPO_ROOT` from the test file path.
 
+### 2026-04-25 16:38-16:58 CST
+
+- Started a real run in tmux session `aituner_harness_qwen27b_0_8k_20260425`.
+- Store root: `.aituner/harness-studies-20260425`.
+- First proposal followed the harness:
+  - proposal: `tensor-parallel-size: 2`;
+  - rationale: the L profile is prefill-sensitive, prefix reuse is low, and arrivals are smooth, so probe adjacent TP before runtime batching knobs.
+- First high-load probe at `sampling_u=0.03125` was infeasible:
+  - request rate 0.895 req/s;
+  - pass rate 0.145;
+  - p95 TTFT 4063 ms and p95 TPOT 113 ms;
+  - failure reasons included `tpot_ms>50.0` and `slo_pass_rate_unrecoverable`.
+- Important implementation issue found: after an early-stopped probe, the worker returned while in-flight HTTP requests could continue occupying the engine, stalling or polluting the next binary-search probe.
+- Action: stopped the run and freed the GPUs. Updating `worker._replay_requests` to drain in-flight requests after an early stop, before the next probe starts.
+
 Remaining next steps:
 
 1. Start a real harness-guided Qwen3.5-27B 0-8k chat tuning run from `configs/examples/dash0_qwen27b_tight_slo_run4_0_8k.json`.
diff --git a/src/aituner/worker.py b/src/aituner/worker.py
index 49c5916..3e7bba9 100644
--- a/src/aituner/worker.py
+++ b/src/aituner/worker.py
@@ -135,6 +135,7 @@
     max_lag_s: float | None,
     max_elapsed_s: float | None,
     evaluate_outcome: Callable[[RequestOutcome], Any],
+    drain_inflight_on_early_stop: bool = True,
 ) -> tuple[list[RequestOutcome], bool, str]:
     outcomes_by_id: dict[str, RequestOutcome] = {}
     lock = threading.Lock()
@@ -209,18 +210,49 @@
         if sleep_for > 0:
             time.sleep(min(sleep_for, 0.1))
     if early_stopped:
-        for future in list(futures_by_request):
-            future.cancel()
-        for request in futures_by_request.values():
-            outcomes_by_id[request.row_id] = RequestOutcome(
-                request_id=request.row_id,
-                success=False,
-                ttft_ms=None,
-                tpot_ms=None,
-                prompt_tokens=request.prompt_tokens_hint,
-                completion_tokens=request.completion_tokens_hint,
-                error=early_stop_reason or "probe_early_stopped",
-            )
+        if drain_inflight_on_early_stop and futures_by_request:
+            # Wait for in-flight requests so they cannot keep occupying the
+            # engine (and skewing measurements) when the next probe starts.
+            done, not_done = wait(list(futures_by_request), timeout=timeout_s)
+            for future in done:
+                request = futures_by_request[future]
+                try:
+                    outcomes_by_id[request.row_id] = future.result(timeout=0)
+                except Exception:  # noqa: BLE001
+                    outcomes_by_id[request.row_id] = RequestOutcome(
+                        request_id=request.row_id,
+                        success=False,
+                        ttft_ms=None,
+                        tpot_ms=None,
+                        prompt_tokens=request.prompt_tokens_hint,
+                        completion_tokens=request.completion_tokens_hint,
+                        error=early_stop_reason or "probe_early_stopped",
+                    )
+            for future in not_done:
+                future.cancel()
+                request = futures_by_request[future]
+                outcomes_by_id[request.row_id] = RequestOutcome(
+                    request_id=request.row_id,
+                    success=False,
+                    ttft_ms=None,
+                    tpot_ms=None,
+                    prompt_tokens=request.prompt_tokens_hint,
+                    completion_tokens=request.completion_tokens_hint,
+                    error=early_stop_reason or "probe_early_stopped",
+                )
+        else:
+            for future in list(futures_by_request):
+                future.cancel()
+            for request in futures_by_request.values():
+                outcomes_by_id[request.row_id] = RequestOutcome(
+                    request_id=request.row_id,
+                    success=False,
+                    ttft_ms=None,
+                    tpot_ms=None,
+                    prompt_tokens=request.prompt_tokens_hint,
+                    completion_tokens=request.completion_tokens_hint,
+                    error=early_stop_reason or "probe_early_stopped",
+                )
     for request in requests:
         if request.row_id in submitted_ids:
             continue
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 71affe5..0f1c591 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -2191,6 +2191,7 @@ class CoreFlowTests(unittest.TestCase):
             max_lag_s=None,
             max_elapsed_s=None,
             evaluate_outcome=fake_evaluate,
+            drain_inflight_on_early_stop=False,
         )
 
         self.assertEqual(submitted, ["r0", "r1"])
@@ -2254,6 +2255,7 @@
             max_lag_s=None,
             max_elapsed_s=1.0,
             evaluate_outcome=fake_evaluate,
+            drain_inflight_on_early_stop=False,
        )
 
         self.assertEqual(submitted, ["r0"])
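
Why the old cancel-only path could leave the engine busy: `concurrent.futures.Future.cancel()` only succeeds for tasks that have not started executing, so a request already streaming from the server keeps running after `_replay_requests` returns. A minimal standalone check of that semantics (none of this is repo code):

```python
import time
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=1) as pool:
    running = pool.submit(time.sleep, 0.5)  # picked up by the worker thread
    queued = pool.submit(time.sleep, 0.5)   # waits behind it in the queue
    time.sleep(0.1)                         # give the first task time to start
    print(running.cancel())  # False: already running, keeps going to completion
    print(queued.cancel())   # True: still pending, removed before it ever runs
```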
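And a minimal sketch of the drain pattern the `worker.py` hunk adopts, reduced to its essentials. Everything here (`slow_request`, `replay_probe`, the pool size, the 5-second drain timeout) is illustrative rather than repo code; the real `_replay_requests` additionally records a `RequestOutcome` per request and bounds the drain with its `timeout_s`.

```python
import threading
import time
from concurrent.futures import ThreadPoolExecutor, wait


def slow_request(request_id: str, stop: threading.Event) -> str:
    """Stand-in for a streaming HTTP request against the serving engine."""
    for _ in range(50):
        if stop.is_set():
            return f"{request_id}: aborted"
        time.sleep(0.05)
    return f"{request_id}: completed"


def replay_probe(drain_inflight_on_early_stop: bool = True) -> None:
    stop = threading.Event()
    pool = ThreadPoolExecutor(max_workers=4)
    futures = [pool.submit(slow_request, f"r{i}", stop) for i in range(4)]

    time.sleep(0.1)  # the probe runs briefly...
    stop.set()       # ...then an SLO breach triggers the early stop

    if drain_inflight_on_early_stop:
        # New behavior: block until in-flight requests actually finish, so
        # nothing is still occupying the engine when the next probe starts.
        done, not_done = wait(futures, timeout=5.0)
        for future in not_done:
            future.cancel()  # only never-started work is truly cancellable
        pool.shutdown(wait=True)
        print(f"drained: {len(done)} finished, {len(not_done)} cancelled")
    else:
        # Old behavior: cancel and return immediately; running requests are
        # not interrupted and can pollute the next probe's measurements.
        for future in futures:
            future.cancel()
        pool.shutdown(wait=False)
        print("returned without draining; in-flight requests may linger")


if __name__ == "__main__":
    replay_probe(drain_inflight_on_early_stop=True)
```

Keeping the cancel-only branch behind `drain_inflight_on_early_stop=False` preserves the previous early-stop semantics for the two existing `test_core_flow.py` call sites, which is why both now pass `False` explicitly.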