Add linear_ms SLO rule (length-aware TTFT budget)

The rule computes threshold_ms = intercept_ms + per_token_ms * input_tokens, which lets the TTFT target scale with prefill work. For example, a budget of "4s + L_in/8k" (L_in = input tokens) maps to intercept_ms=4000, per_token_ms=0.125, i.e. a 4s base plus 1s per 8k input tokens. Changes cover slo, spec, and test. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 20:35:23 +08:00
parent 77af4ded2a
commit ed2bbe0323
3 changed files with 46 additions and 0 deletions
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -44,6 +44,7 @@ from aituner.spec import (
    ConfigPatch,
    LLMEndpointSpec,
    Proposal,
+    SloSpec,
    SpecError,
    StudyState,
    TrialSummary,
@@ -531,6 +532,34 @@ class CoreFlowTests(unittest.TestCase):
            )
        )

+    def test_linear_ms_ttft_rule_scales_with_input_length(self) -> None:
+        """Exercise a linear_ms TTFT rule at two prompt lengths (8000 and 0 tokens)."""
+        ttft_rule = {"kind": "linear_ms", "intercept_ms": 4000, "per_token_ms": 0.125}
+        tpot_rule = {"kind": "fixed_ms", "threshold_ms": 50}
+        spec = SloSpec.from_dict(
+            {"target_pass_rate": 0.95, "ttft_rule": ttft_rule, "tpot_rule": tpot_rule}
+        )
+
+        def passes(prompt_tokens: int, ttft_ms: float):
+            # tpot_ms stays at 10.0 (tpot_rule threshold: 50); only TTFT inputs vary.
+            outcome = RequestOutcome(
+                request_id="r",
+                success=True,
+                ttft_ms=ttft_ms,
+                tpot_ms=10.0,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=8,
+            )
+            return evaluate_request(outcome, spec).passed
+
+        # Budget is 4000 + 0.125 * prompt_tokens ms: 5000 ms at 8000 tokens,
+        # 4000 ms at 0 tokens. Probe 100 ms on either side of each budget.
+        self.assertTrue(passes(8000, 4900))
+        self.assertFalse(passes(8000, 5100))
+        self.assertTrue(passes(0, 3900))
+        self.assertFalse(passes(0, 4100))
+
+
    def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
        window = WindowRecord(
            window_id="base",