diff --git a/src/aituner/worker.py b/src/aituner/worker.py index 71075ce..c3312d5 100644 --- a/src/aituner/worker.py +++ b/src/aituner/worker.py @@ -210,6 +210,50 @@ def _probe_outcome_details( } +_SIGTERM_NOT_INSTALLED = object() + + +def _install_sigterm_as_keyboardinterrupt() -> Any: + """Make SIGTERM raise KeyboardInterrupt so the engine-teardown finally runs. + + When `study tune` is killed, a default SIGTERM skips the finally blocks and + orphans the vLLM engine (and its EngineCore workers) on the GPUs. Converting + SIGTERM to KeyboardInterrupt lets _terminate_process_tree run. Only installable + from the main thread; returns the previous handler (or a sentinel). + """ + if threading.current_thread() is not threading.main_thread(): + return _SIGTERM_NOT_INSTALLED + + def _handler(signum: int, frame: Any) -> None: + raise KeyboardInterrupt() + + try: + return signal.signal(signal.SIGTERM, _handler) + except (ValueError, OSError): + return _SIGTERM_NOT_INSTALLED + + +def _restore_sigterm(previous: Any) -> None: + if previous is _SIGTERM_NOT_INSTALLED: + return + if threading.current_thread() is not threading.main_thread(): + return + try: + signal.signal(signal.SIGTERM, previous) + except (ValueError, OSError): + pass + + +def _ignore_sigterm_if_main() -> None: + """Ignore SIGTERM during teardown so a second signal cannot orphan the engine.""" + if threading.current_thread() is not threading.main_thread(): + return + try: + signal.signal(signal.SIGTERM, signal.SIG_IGN) + except (ValueError, OSError): + pass + + def _adaptive_replay_set( selected: list[TraceRequest], *, @@ -568,6 +612,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]: ) process = launch_process() + previous_sigterm = _install_sigterm_as_keyboardinterrupt() probe_history: list[dict[str, Any]] = [] failure_stage = "engine_launch" try: @@ -826,4 +871,6 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]: StudyStore.write_json(Path(trial.result_path), result) return result finally: + _ignore_sigterm_if_main() _terminate_process_tree(process, timeout_s=30.0, marker_env=trial_marker_env) + _restore_sigterm(previous_sigterm) diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index 2772159..646c0be 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -55,6 +55,8 @@ from aituner.store import StudyStore from aituner.trace import load_trace_requests, summarize_window from aituner.worker import ( _adaptive_replay_set, + _install_sigterm_as_keyboardinterrupt, + _restore_sigterm, _should_extend_on_boundary, _best_feasible_probe_record, _latency_summary, @@ -589,6 +591,18 @@ class CoreFlowTests(unittest.TestCase): self.assertFalse(outcome.success) self.assertIn("timed out", outcome.error) + def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None: + # So a killed `study tune` runs the engine-teardown finally instead of + # orphaning the vLLM EngineCore workers on the GPUs. + import signal as _signal + + previous = _install_sigterm_as_keyboardinterrupt() + try: + with self.assertRaises(KeyboardInterrupt): + _signal.raise_signal(_signal.SIGTERM) + finally: + _restore_sigterm(previous) + def test_lca_similarity_matrix_separates_different_profiles(self) -> None: window = WindowRecord( window_id="base",