Add trace length bucket tuning support

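The headline change is an optional trace.input_length_filter block in the study spec. As the tests below pin down, trace rows whose prompt token count falls outside the configured bounds are dropped before replay, the loader raises SpecError when min_input_tokens exceeds max_input_tokens, and the active filter is echoed into the LLM prompt. A minimal sketch of the filtering semantics implied by the tests (the helper name and the inclusive bounds are assumptions; the shipped logic lives in aituner.trace and aituner.spec):

    # Hypothetical sketch, not the shipped implementation: keep a trace row
    # only when its prompt length lies inside the configured bounds.
    def keep_row(prompt_tokens: int, flt: dict[str, int] | None) -> bool:
        if flt is None:  # no filter configured: keep everything
            return True
        # Bounds assumed inclusive; the fixture keeps rows with 1000 and
        # 5000 prompt tokens under {"min_input_tokens": 0, "max_input_tokens": 8192}.
        return flt["min_input_tokens"] <= prompt_tokens <= flt["max_input_tokens"]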
2026-04-07 11:03:16 +08:00
parent e9b5e9b957
commit 46ed688ace
12 changed files with 922 additions and 14 deletions
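Alongside the bucket work, trial materialization now seeds the sampling_u binary search from the incumbent: test_materialize_trial_uses_incumbent_sampling_u_as_search_floor expects search.low == state.best_sampling_u and search.high == 1.0. A sketch of that floor selection under assumed names (search_bounds and DEFAULT_LOW are hypothetical):

    # Hypothetical sketch: start the feasibility search at the incumbent's
    # sampling_u so a new trial never probes below a point already proven.
    DEFAULT_LOW = 0.0

    def search_bounds(best_sampling_u: float | None) -> tuple[float, float]:
        low = best_sampling_u if best_sampling_u is not None else DEFAULT_LOW
        return low, 1.0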


@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
 from aituner.llm import build_prompt, parse_proposal_text
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
 from aituner.trace import TraceRequest
 
 
-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
+    tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
+) -> Path:
     trace_dir = tmp_path / "trace_windows" / "traces"
     trace_dir.mkdir(parents=True)
     trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
     )
     study_path = tmp_path / "study.json"
 
+    trace_payload: dict[str, object] = {
+        "windows_path": str(windows_path),
+        "window_id": "chat_w1",
+        "u_field": "sampling_u",
+        "timestamp_field": "timestamp",
+        "max_concurrency": 4,
+    }
+    if trace_overrides:
+        trace_payload.update(trace_overrides)
+
     study_payload = {
         "study_id": "study-1",
         "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
             "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
             "python_executable": "python3"
         },
-        "trace": {
-            "windows_path": str(windows_path),
-            "window_id": "chat_w1",
-            "u_field": "sampling_u",
-            "timestamp_field": "timestamp",
-            "max_concurrency": 4
-        },
+        "trace": trace_payload,
         "slo": {
             "target_pass_rate": 0.95,
             "ttft_rule": {
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
         )
         self.assertIn("allowed_flag_keys", prompt)
         self.assertIn("study-1", prompt)
+        self.assertIn('"current_best"', prompt)
+        self.assertIn("queueing_knee_by_bucket", prompt)
         self.assertTrue(study_root.exists())
 
+    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 0,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            summary = summarize_window(requests, window)
+            self.assertEqual(len(requests), 2)
+            self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
+            self.assertEqual(summary["request_count"], 2)
+            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summary,
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertIn('"input_length_filter"', prompt)
+            self.assertIn('"max_input_tokens": 8192', prompt)
+
+    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 8193,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
+                load_study_spec(study_path)
+
     def test_prompt_includes_failed_trial_context(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
             )
             next_state = store.ingest_trial_results(study.study_id)
             self.assertEqual(next_state.best_trial_id, trial.trial_id)
+            self.assertEqual(next_state.best_sampling_u, 0.75)
             self.assertEqual(next_state.best_request_rate, 12.5)
 
+    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_sampling_u=0.375,
+                best_request_rate=3.0,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.375)
+            self.assertEqual(trial.search.high, 1.0)
+
     def test_ingest_trial_results_records_failure_reason(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
             store = StudyStore(store_root)
             state = store.load_state("study-1")
             self.assertEqual(state.best_trial_id, "trial-0002")
+            self.assertEqual(state.best_sampling_u, 0.75)
             self.assertEqual(state.best_request_rate, 2.0)
             self.assertEqual(state.next_trial_index, 3)
@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
             ["throughput: higher", "ttft: lower"],
         )
 
+    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            proposal = parse_proposal_text(
+                """Here is the proposal:
+```json
+{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
+```""",
+                study,
+            )
+            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
+
     def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
         requests = [
             TraceRequest(
@@ -929,6 +1022,71 @@ class CoreFlowTests(unittest.TestCase):
         self.assertEqual(len(replayed), 2)
         self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
 
+    def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
+        requests = [
+            TraceRequest(
+                row_id="r0",
+                arrival_s=0.0,
+                sampling_u=0.1,
+                body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
+                prompt_tokens_hint=8,
+                completion_tokens_hint=4,
+            )
+        ]
+
+        class FakeFuture:
+            def result(self, timeout=None):
+                raise AssertionError("future should not be awaited after elapsed early stop")
+
+            def cancel(self):
+                return True
+
+        submitted = []
+
+        class FakeExecutor:
+            def __init__(self, max_workers):
+                self.max_workers = max_workers
+
+            def submit(self, fn, request, **kwargs):
+                submitted.append(request.row_id)
+                return FakeFuture()
+
+            def shutdown(self, wait=False, cancel_futures=True):
+                return None
+
+        wait_timeouts: list[float] = []
+
+        def fake_wait(futures, timeout=None, return_when=None):
+            wait_timeouts.append(timeout)
+            return set(), set(futures)
+
+        def fake_evaluate(outcome: RequestOutcome):
+            return type("Eval", (), {"passed": outcome.success})()
+
+        monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
+        with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
+            with mock.patch("aituner.worker.wait", side_effect=fake_wait):
+                with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
+                    replayed, early_stopped, reason = _replay_requests(
+                        requests,
+                        base_url="http://127.0.0.1:8000",
+                        timeout_s=30.0,
+                        max_concurrency=1,
+                        target_pass_rate=0.95,
+                        max_lag_s=None,
+                        max_elapsed_s=1.0,
+                        evaluate_outcome=fake_evaluate,
+                    )
+
+        self.assertEqual(submitted, ["r0"])
+        self.assertTrue(early_stopped)
+        self.assertEqual(reason, "probe_elapsed_s>1.0")
+        self.assertEqual(len(replayed), 1)
+        self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
+        self.assertTrue(wait_timeouts)
+        self.assertLessEqual(wait_timeouts[0], 0.5)
+
     def test_latency_summary_reports_quantiles_and_slo(self) -> None:
         study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
         outcomes = [