diff --git a/docs/qwen235b-thinking-decode/harness-20260428.md b/docs/qwen235b-thinking-decode/harness-20260428.md index b3abb77..192e960 100644 --- a/docs/qwen235b-thinking-decode/harness-20260428.md +++ b/docs/qwen235b-thinking-decode/harness-20260428.md @@ -50,3 +50,5 @@ The active run is now seeded from the real run5 baseline and continues from `tri ## Follow-up Fix The seeded prompt exposed a generic diagnosis issue: if the best feasible probe had no latency failures, the harness could miss the prior infeasible probe that showed the real bottleneck at higher load. The harness now scans the probe sequence backward and uses the nearest non-trivial bottleneck before falling back to the best feasible probe. This keeps decode-only runs focused on `decode_tpot` after a feasible low-load point, without adding testcase thresholds. + +A second generic diagnosis bug was fixed: when the TTFT, TPOT, and request-failure counts are all zero, a probe whose only failure reasons are non-SLO bookkeeping entries such as `probe_elapsed_s>...` is now diagnosed as `none_obvious` instead of collapsing to `ttft_prefill`. 
diff --git a/src/aituner/harness.py b/src/aituner/harness.py index fa829de..8f9b10d 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -341,6 +341,8 @@ def _active_bottleneck(probe: dict[str, Any] | None) -> str: and not str(k).startswith("tpot") and not str(k).startswith("probe_elapsed_s>") ) + if ttft_count == 0 and tpot_count == 0 and request_failed_count == 0: + return "none_obvious" if ttft_count >= max(tpot_count, request_failed_count): return "ttft_prefill" if tpot_count >= max(ttft_count, request_failed_count): diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index ef83fc5..68fd1bc 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -576,7 +576,10 @@ class CoreFlowTests(unittest.TestCase): "payload": { "request_rate": 1.0, "pass_rate": 1.0, - "latency_summary": {"failed_reason_counts": {}}, + "early_stop_reason": "probe_elapsed_s>1200.0", + "latency_summary": { + "failed_reason_counts": {"probe_elapsed_s>1200.0": 1} + }, }, }, ],