Harden trial measurement accounting
@@ -9,9 +9,9 @@ from pathlib import Path
 from unittest import mock
 
 from aituner.cli import main as cli_main
-from aituner.compare import load_compare_spec, run_compare
+from aituner.compare import _aggregate_summary, load_compare_spec, run_compare
 from aituner.engine import build_launch_recipe
-from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
+from aituner.http_client import StreamMetrics, _auth_headers, _openai_url, _should_bypass_proxy
 from aituner.job import append_job, build_trial_job
 from aituner.harness import (
     build_harness_context,
@@ -34,9 +34,11 @@ from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
 from aituner.worker import (
     _latency_summary,
+    _run_one_request,
     _replay_requests,
     _terminate_process_tree,
     _wait_for_server_or_exit,
     run_trial,
 )
+from aituner.trace import TraceRequest
 
@@ -863,6 +865,24 @@ class CoreFlowTests(unittest.TestCase):
             with self.assertRaisesRegex(SpecError, "min_input_tokens must be <="):
                 load_study_spec(study_path)
 
+    def test_trace_rejects_non_positive_max_requests_per_probe(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            study_path = _write_study_assets(
+                Path(tmp),
+                trace_overrides={"max_requests_per_probe": 0},
+            )
+            with self.assertRaisesRegex(SpecError, "max_requests_per_probe must be > 0"):
+                load_study_spec(study_path)
+
+    def test_trace_rejects_invalid_replay_time_scale(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            study_path = _write_study_assets(
+                Path(tmp),
+                trace_overrides={"replay_time_scale": 0.0},
+            )
+            with self.assertRaisesRegex(SpecError, "replay_time_scale must be > 0"):
+                load_study_spec(study_path)
+
     def test_decode_only_mode_is_loaded_and_prompt_mentions_it(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
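
These two tests pin down spec-load validation for the trace replay knobs. For reference, a minimal sketch of the guard they imply; load_study_spec itself is not shown in this diff, so the helper name and dict access are assumptions, with only the error strings taken from the tests:

def _validate_trace_section(trace: dict) -> None:
    # Hypothetical sketch; only the messages are confirmed by the tests above.
    if trace.get("max_requests_per_probe", 1) <= 0:
        raise SpecError("max_requests_per_probe must be > 0")
    if float(trace.get("replay_time_scale", 1.0)) <= 0:
        raise SpecError("replay_time_scale must be > 0")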
@@ -1456,6 +1476,34 @@ class CoreFlowTests(unittest.TestCase):
         self.assertEqual(requests[2].body["min_tokens"], 1)
         self.assertEqual(requests[2].body["max_tokens"], 1)
 
+    def test_run_one_request_fails_fixed_length_completion_mismatch(self) -> None:
+        request = TraceRequest(
+            row_id="r1",
+            arrival_s=0.0,
+            sampling_u=0.1,
+            body={"model": "m", "messages": [{"role": "user", "content": "x"}]},
+            prompt_tokens_hint=8,
+            completion_tokens_hint=2,
+        )
+
+        with mock.patch(
+            "aituner.worker.stream_chat_completion",
+            return_value=StreamMetrics(
+                ttft_ms=10.0,
+                tpot_ms=5.0,
+                completion_tokens=1,
+            ),
+        ):
+            outcome = _run_one_request(
+                request,
+                base_url="http://127.0.0.1:8000",
+                timeout_s=1.0,
+            )
+
+        self.assertFalse(outcome.success)
+        self.assertEqual(outcome.error, "completion_tokens_mismatch expected=2 actual=1")
+        self.assertEqual(outcome.completion_tokens, 1)
+
     def test_build_prompt_mentions_completion_tokens_override(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             study_path = _write_study_assets(
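
The mismatch test fixes the exact error format, which implies _run_one_request compares the streamed completion token count against the trace's completion_tokens_hint and fails the request on any inequality. A sketch of that check, assuming the hint is optional; the helper name is hypothetical:

def _check_fixed_length(expected: int | None, actual: int) -> str | None:
    # Returns an error string in the exact format asserted above, or None.
    if expected is not None and actual != expected:
        return f"completion_tokens_mismatch expected={expected} actual={actual}"
    return None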
@@ -1950,6 +1998,86 @@ class CoreFlowTests(unittest.TestCase):
             3.125,
         )
 
+    def test_run_trial_persists_probe_request_details(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["search"]["max_probes"] = 1
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = store.load_state(study.study_id)
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "baseline",
+                    "diagnosis": "baseline",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": ["measure"],
+                }
+            )
+            trial, _ = store.materialize_trial(study=study, state=state, proposal=proposal)
+
+            def fake_replay(requests, **kwargs):
+                return (
+                    [
+                        RequestOutcome(
+                            request_id=request.row_id,
+                            success=True,
+                            ttft_ms=10.0,
+                            tpot_ms=5.0,
+                            prompt_tokens=request.prompt_tokens_hint,
+                            completion_tokens=request.completion_tokens_hint,
+                        )
+                        for request in requests
+                    ],
+                    False,
+                    "",
+                )
+
+            process = mock.Mock()
+            process.poll.return_value = 0
+            with mock.patch("aituner.worker.subprocess.Popen", return_value=process):
+                with mock.patch("aituner.worker._wait_for_server_or_exit", return_value=None):
+                    with mock.patch("aituner.worker._terminate_process_tree", return_value=None):
+                        with mock.patch("aituner.worker._replay_requests", side_effect=fake_replay):
+                            result = run_trial(Path(trial.artifact_dir) / "trial_spec.json")
+
+            self.assertEqual(result["status"], "completed")
+            details_path = Path(trial.artifact_dir) / "probe_details.jsonl"
+            self.assertTrue(details_path.exists())
+            rows = [
+                json.loads(line)
+                for line in details_path.read_text(encoding="utf-8").splitlines()
+            ]
+            self.assertEqual(len(rows), 1)
+            self.assertEqual(rows[0]["threshold"], 0.5)
+            self.assertEqual(rows[0]["outcomes"][0]["request_id"], "r1")
+            self.assertEqual(rows[0]["outcomes"][0]["sampling_u"], 0.1)
+
+    def test_materialize_trial_does_not_mutate_input_state_trials(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store = StudyStore(tmp_path / ".aituner" / "studies")
+            store.init_study(spec_path=study_path, study=study)
+            state = store.load_state(study.study_id)
+            proposal = Proposal.from_dict(
+                {
+                    "observation": "baseline",
+                    "diagnosis": "baseline",
+                    "config_patch": {"env_patch": {}, "flag_patch": {}},
+                    "expected_effects": ["measure"],
+                }
+            )
+
+            _, next_state = store.materialize_trial(study=study, state=state, proposal=proposal)
+
+            self.assertEqual(state.trials, [])
+            self.assertEqual(len(next_state.trials), 1)
+
     def test_materialize_trial_uses_incumbent_sampling_u_as_search_floor(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
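
The persistence test implies run_trial appends one JSONL row per probe to probe_details.jsonl, carrying the probe threshold plus per-request outcomes, with sampling_u joined back in from the originating TraceRequest (the fake RequestOutcome above does not carry it). A sketch of such a writer; everything beyond the asserted keys is an assumption:

import json

def _append_probe_details(path, threshold, requests, outcomes):
    # One JSON object per probe; "threshold", "outcomes", "request_id" and
    # "sampling_u" match the test's assertions, the join logic is assumed.
    by_id = {request.row_id: request for request in requests}
    row = {
        "threshold": threshold,
        "outcomes": [
            {
                "request_id": outcome.request_id,
                "sampling_u": by_id[outcome.request_id].sampling_u,
                "success": outcome.success,
                "ttft_ms": outcome.ttft_ms,
                "tpot_ms": outcome.tpot_ms,
                "completion_tokens": outcome.completion_tokens,
            }
            for outcome in outcomes
        ],
    }
    with open(path, "a", encoding="utf-8") as handle:
        handle.write(json.dumps(row) + "\n")

The companion immutability test suggests materialize_trial builds next_state functionally, for example via something like dataclasses.replace(state, trials=[*state.trials, trial]), rather than appending to the caller's list.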
@@ -2969,6 +3097,44 @@ class CoreFlowTests(unittest.TestCase):
             self.assertTrue((tmp_path / ".compare" / "summary.json").exists())
             self.assertTrue((tmp_path / ".compare" / "report.md").exists())
 
+    def test_compare_aggregate_counts_failed_and_no_feasible_windows(self) -> None:
+        summary = _aggregate_summary(
+            [
+                {
+                    "baseline": {
+                        "status": "completed",
+                        "best_request_rate": 1.0,
+                        "best_request_rate_per_gpu": 1.0,
+                    },
+                    "tuned": {
+                        "status": "completed",
+                        "best_request_rate": None,
+                        "best_request_rate_per_gpu": None,
+                    },
+                    "delta": {"winner": "baseline"},
+                },
+                {
+                    "baseline": {
+                        "status": "failed",
+                        "best_request_rate": None,
+                        "best_request_rate_per_gpu": None,
+                    },
+                    "tuned": {
+                        "status": "completed",
+                        "best_request_rate": 2.0,
+                        "best_request_rate_per_gpu": 2.0,
+                    },
+                    "delta": {"winner": "tuned"},
+                },
+            ]
+        )
+        self.assertEqual(summary["baseline_completed_window_count"], 1)
+        self.assertEqual(summary["baseline_failed_window_count"], 1)
+        self.assertEqual(summary["baseline_no_feasible_window_count"], 1)
+        self.assertEqual(summary["tuned_completed_window_count"], 2)
+        self.assertEqual(summary["tuned_failed_window_count"], 0)
+        self.assertEqual(summary["tuned_no_feasible_window_count"], 1)
+
     def test_run_compare_resolves_trial_ref_candidate(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
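
The expected counts decode the aggregation rule: completed and failed windows are tallied from each side's status, while a "no feasible window" is any window whose best_request_rate is None regardless of status (note the tuned side completes both windows yet still counts one as infeasible). A counting sketch under that reading; the helper name is hypothetical:

def _count_side(windows: list[dict], side: str) -> dict:
    # Mirrors the summary keys asserted above for one side ("baseline"/"tuned").
    results = [window[side] for window in windows]
    return {
        f"{side}_completed_window_count": sum(r["status"] == "completed" for r in results),
        f"{side}_failed_window_count": sum(r["status"] == "failed" for r in results),
        f"{side}_no_feasible_window_count": sum(r["best_request_rate"] is None for r in results),
    }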