Add trace length bucket tuning support
@@ -13,7 +13,7 @@ from aituner.job import append_job, build_trial_job
 from aituner.llm import build_prompt, parse_proposal_text
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
-from aituner.spec import Proposal, StudyState, TrialSummary, load_study_spec
+from aituner.spec import Proposal, SpecError, StudyState, TrialSummary, load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
@@ -25,7 +25,9 @@ from aituner.worker import (
 from aituner.trace import TraceRequest


-def _write_study_assets(tmp_path: Path) -> Path:
+def _write_study_assets(
+    tmp_path: Path, *, trace_overrides: dict[str, object] | None = None
+) -> Path:
     trace_dir = tmp_path / "trace_windows" / "traces"
     trace_dir.mkdir(parents=True)
     trace_path = trace_dir / "chat_w1.jsonl"
@@ -81,6 +83,16 @@ def _write_study_assets(tmp_path: Path) -> Path:
     )

     study_path = tmp_path / "study.json"
+    trace_payload: dict[str, object] = {
+        "windows_path": str(windows_path),
+        "window_id": "chat_w1",
+        "u_field": "sampling_u",
+        "timestamp_field": "timestamp",
+        "max_concurrency": 4,
+    }
+    if trace_overrides:
+        trace_payload.update(trace_overrides)
+
     study_payload = {
         "study_id": "study-1",
         "hardware": {"gpu_count": 8, "gpu_model": "H20", "host_candidates": ["dash0"]},
@@ -105,13 +117,7 @@ def _write_study_assets(tmp_path: Path) -> Path:
             "tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
             "python_executable": "python3"
         },
-        "trace": {
-            "windows_path": str(windows_path),
-            "window_id": "chat_w1",
-            "u_field": "sampling_u",
-            "timestamp_field": "timestamp",
-            "max_concurrency": 4
-        },
+        "trace": trace_payload,
         "slo": {
             "target_pass_rate": 0.95,
             "ttft_rule": {
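Note: the two hunks above replace the inline `"trace"` literal with a shared `trace_payload` that individual tests can extend via `trace_overrides`. Because the merge uses `dict.update`, it is shallow: an override adds or replaces a top-level key wholesale rather than deep-merging into it. A minimal sketch of that behavior (the values here are illustrative, not from the diff):

```python
# Shallow-merge semantics of trace_payload.update(trace_overrides):
trace_payload = {"window_id": "chat_w1", "max_concurrency": 4}

# New top-level keys are added...
trace_payload.update({"input_length_filter": {"max_input_tokens": 8192}})
assert trace_payload["input_length_filter"] == {"max_input_tokens": 8192}

# ...and existing top-level keys are replaced wholesale.
trace_payload.update({"max_concurrency": 8})
assert trace_payload["max_concurrency"] == 8
```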
@@ -161,9 +167,53 @@ class CoreFlowTests(unittest.TestCase):
         )
         self.assertIn("allowed_flag_keys", prompt)
         self.assertIn("study-1", prompt)
+        self.assertIn('"current_best"', prompt)
         self.assertIn("queueing_knee_by_bucket", prompt)
         self.assertTrue(study_root.exists())

+    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 0,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            summary = summarize_window(requests, window)
+            self.assertEqual(len(requests), 2)
+            self.assertEqual([item.prompt_tokens_hint for item in requests], [1000, 5000])
+            self.assertEqual(summary["request_count"], 2)
+            self.assertEqual(summary["prompt_tokens_p95"], 5000.0)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summary,
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertIn('"input_length_filter"', prompt)
+            self.assertIn('"max_input_tokens": 8192', prompt)
+
+    def test_trace_input_length_filter_rejects_invalid_bounds(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(
+                tmp_path,
+                trace_overrides={
+                    "input_length_filter": {
+                        "min_input_tokens": 8193,
+                        "max_input_tokens": 8192,
+                    }
+                },
+            )
+            with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
+                load_study_spec(study_path)
+
     def test_prompt_includes_failed_trial_context(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
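The second test above pins the validation error message rather than just the exception type. The spec-side check itself is not part of this diff; a minimal sketch of a validator consistent with the asserted `SpecError` text, assuming the field names shown above:

```python
def validate_input_length_filter(raw: dict[str, object]) -> None:
    # Hypothetical helper: the real check lives in aituner.spec, outside this diff.
    lo = int(raw["min_input_tokens"])  # type: ignore[arg-type]
    hi = int(raw["max_input_tokens"])  # type: ignore[arg-type]
    if lo > hi:
        # Matches the regex asserted in the test: "min_input_tokens must be <="
        raise SpecError(f"min_input_tokens must be <= max_input_tokens ({lo} > {hi})")
```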
@@ -652,8 +702,36 @@ class CoreFlowTests(unittest.TestCase):
         )
         next_state = store.ingest_trial_results(study.study_id)
         self.assertEqual(next_state.best_trial_id, trial.trial_id)
+        self.assertEqual(next_state.best_sampling_u, 0.75)
         self.assertEqual(next_state.best_request_rate, 12.5)

+    def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0001",
+                best_sampling_u=0.375,
+                best_request_rate=3.0,
+                next_trial_index=2,
+                trials=[],
+            )
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "Obs",
+                    "diagnosis": "Diag",
+                    "config_patch": {"env_patch": {}, "flag_patch": {"tensor-parallel-size": 4}},
+                    "expected_effects": ["raise rate"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+            self.assertEqual(trial.search.low, 0.375)
+            self.assertEqual(trial.search.high, 1.0)
+
     def test_ingest_trial_results_records_failure_reason(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
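The new test asserts that a fresh trial's binary search starts at the incumbent's `best_sampling_u` (0.375) rather than 0.0. A sketch of the seeding this implies, assuming `materialize_trial` internals that this diff does not show:

```python
def initial_search_bounds(state: StudyState) -> tuple[float, float]:
    # Hypothetical: seed the feasibility search at the incumbent so a new
    # config is only accepted if it sustains at least the current best load.
    low = state.best_sampling_u if state.best_sampling_u is not None else 0.0
    return low, 1.0
```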
@@ -764,6 +842,7 @@ class CoreFlowTests(unittest.TestCase):
         store = StudyStore(store_root)
         state = store.load_state("study-1")
         self.assertEqual(state.best_trial_id, "trial-0002")
+        self.assertEqual(state.best_sampling_u, 0.75)
         self.assertEqual(state.best_request_rate, 2.0)
         self.assertEqual(state.next_trial_index, 3)

@@ -795,6 +874,20 @@ class CoreFlowTests(unittest.TestCase):
             ["throughput: higher", "ttft: lower"],
         )

+    def test_parse_proposal_text_accepts_wrapped_json(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            proposal = parse_proposal_text(
+                """Here is the proposal:
+```json
+{"observation":"obs","diagnosis":"diag","config_patch":{"env_patch":{},"flag_patch":{"max-num-seqs":32}},"expected_effects":["higher throughput"],"why_not_previous_failures":"keeps supported knobs"}
+```""",
+                study,
+            )
+            self.assertEqual(proposal.config_patch.flag_patch["max-num-seqs"], 32)
+
     def test_replay_requests_early_stops_when_slo_is_unrecoverable(self) -> None:
         requests = [
             TraceRequest(
|
||||
self.assertEqual(len(replayed), 2)
|
||||
self.assertEqual(replayed[1].error, "slo_pass_rate_unrecoverable")
|
||||
|
||||
def test_replay_requests_respects_max_elapsed_while_waiting_for_inflight(self) -> None:
|
||||
requests = [
|
||||
TraceRequest(
|
||||
row_id="r0",
|
||||
arrival_s=0.0,
|
||||
sampling_u=0.1,
|
||||
body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
|
||||
prompt_tokens_hint=8,
|
||||
completion_tokens_hint=4,
|
||||
)
|
||||
]
|
||||
|
||||
class FakeFuture:
|
||||
def result(self, timeout=None):
|
||||
raise AssertionError("future should not be awaited after elapsed early stop")
|
||||
|
||||
def cancel(self):
|
||||
return True
|
||||
|
||||
submitted = []
|
||||
|
||||
class FakeExecutor:
|
||||
def __init__(self, max_workers):
|
||||
self.max_workers = max_workers
|
||||
|
||||
def submit(self, fn, request, **kwargs):
|
||||
submitted.append(request.row_id)
|
||||
return FakeFuture()
|
||||
|
||||
def shutdown(self, wait=False, cancel_futures=True):
|
||||
return None
|
||||
|
||||
wait_timeouts: list[float] = []
|
||||
|
||||
def fake_wait(futures, timeout=None, return_when=None):
|
||||
wait_timeouts.append(timeout)
|
||||
return set(), set(futures)
|
||||
|
||||
def fake_evaluate(outcome: RequestOutcome):
|
||||
return type("Eval", (), {"passed": outcome.success})()
|
||||
|
||||
monotonic_values = iter([0.0, 0.0, 0.4, 1.2])
|
||||
|
||||
with mock.patch("aituner.worker.ThreadPoolExecutor", FakeExecutor):
|
||||
with mock.patch("aituner.worker.wait", side_effect=fake_wait):
|
||||
with mock.patch("aituner.worker.time.monotonic", side_effect=lambda: next(monotonic_values)):
|
||||
replayed, early_stopped, reason = _replay_requests(
|
||||
requests,
|
||||
base_url="http://127.0.0.1:8000",
|
||||
timeout_s=30.0,
|
||||
max_concurrency=1,
|
||||
target_pass_rate=0.95,
|
||||
max_lag_s=None,
|
||||
max_elapsed_s=1.0,
|
||||
evaluate_outcome=fake_evaluate,
|
||||
)
|
||||
|
||||
self.assertEqual(submitted, ["r0"])
|
||||
self.assertTrue(early_stopped)
|
||||
self.assertEqual(reason, "probe_elapsed_s>1.0")
|
||||
self.assertEqual(len(replayed), 1)
|
||||
self.assertEqual(replayed[0].error, "probe_elapsed_s>1.0")
|
||||
self.assertTrue(wait_timeouts)
|
||||
self.assertLessEqual(wait_timeouts[0], 0.5)
|
||||
|
||||
def test_latency_summary_reports_quantiles_and_slo(self) -> None:
|
||||
study = load_study_spec(_write_study_assets(Path(tempfile.mkdtemp())))
|
||||
outcomes = [
|
||||
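The `monotonic_values` iterator drives the elapsed-time guard: the first read (0.0) is the probe start, the later reads (0.4, then 1.2) are per-iteration checks, and 1.2 s exceeds `max_elapsed_s=1.0`, so the in-flight future is abandoned with reason `probe_elapsed_s>1.0` instead of being awaited. The test also pins the poll interval (`wait_timeouts[0] <= 0.5`). A sketch of a guard loop consistent with those assertions, assuming worker internals this diff does not show:

```python
from concurrent.futures import FIRST_COMPLETED, wait
import time

def wait_for_inflight(pending, max_elapsed_s: float) -> str | None:
    # Hypothetical shape of the loop the fakes above exercise.
    start = time.monotonic()
    while pending:
        remaining = max_elapsed_s - (time.monotonic() - start)
        if remaining <= 0:
            return f"probe_elapsed_s>{max_elapsed_s}"  # early-stop reason
        # Poll in short slices so the elapsed guard is re-checked promptly.
        _done, pending = wait(pending, timeout=min(0.5, remaining), return_when=FIRST_COMPLETED)
    return None
```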