Relaunch engine after early-stopped probes
This commit is contained in:
@@ -341,7 +341,8 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
artifact_dir.mkdir(parents=True, exist_ok=True)
|
||||
engine_log_path = Path(trial.engine_log_path)
|
||||
with engine_log_path.open("w", encoding="utf-8") as engine_log:
|
||||
process = subprocess.Popen( # noqa: S603
|
||||
def launch_process() -> subprocess.Popen[str]:
|
||||
return subprocess.Popen( # noqa: S603
|
||||
recipe.argv,
|
||||
cwd=recipe.cwd,
|
||||
env=recipe.env,
|
||||
@@ -350,6 +351,8 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
text=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
|
||||
process = launch_process()
|
||||
probe_history: list[dict[str, Any]] = []
|
||||
failure_stage = "engine_launch"
|
||||
try:
|
||||
@@ -362,6 +365,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
failure_stage = "probe_search"
|
||||
|
||||
def evaluator(threshold: float) -> ThresholdProbe[ProbePayload]:
|
||||
nonlocal process
|
||||
selected = select_requests_for_threshold(requests, threshold=threshold)
|
||||
outcomes, early_stopped, early_stop_reason = _replay_requests(
|
||||
selected,
|
||||
@@ -372,6 +376,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
max_lag_s=study.trace.early_stop_max_lag_s,
|
||||
max_elapsed_s=study.trace.early_stop_max_elapsed_s,
|
||||
evaluate_outcome=lambda outcome: evaluate_request(outcome, study.slo),
|
||||
drain_inflight_on_early_stop=False,
|
||||
)
|
||||
evaluations, summary = summarize_evaluations(outcomes, study.slo)
|
||||
request_rate = (
|
||||
@@ -418,6 +423,15 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
}
|
||||
probe_history.append(probe_record)
|
||||
StudyStore.write_json(Path(trial.probe_log_path), probe_history)
|
||||
if early_stopped:
|
||||
_terminate_process_tree(process, timeout_s=30.0)
|
||||
process = launch_process()
|
||||
_wait_for_server_or_exit(
|
||||
process,
|
||||
base_url=recipe.base_url,
|
||||
healthcheck_path=recipe.healthcheck_path,
|
||||
ready_timeout_s=recipe.ready_timeout_s,
|
||||
)
|
||||
return ThresholdProbe(
|
||||
threshold=threshold,
|
||||
feasible=payload.feasible,
|
||||
|
||||
Reference in New Issue
Block a user