Tear down the engine on SIGTERM instead of orphaning it
Killing `study tune` with a default SIGTERM skipped the finally blocks, leaving the vLLM engine and its EngineCore workers (which inherit the AITUNER_* marker env) alive on the GPUs — twice leaking GPU memory that needed a root reset. Install a SIGTERM handler in run_trial that raises KeyboardInterrupt so _terminate_process_tree runs, ignore SIGTERM during teardown so a second signal can't re-orphan it, and restore the prior handler afterward. Main-thread-guarded; unit-tested. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -210,6 +210,50 @@ def _probe_outcome_details(
|
||||
}
|
||||
|
||||
|
||||
_SIGTERM_NOT_INSTALLED = object()
|
||||
|
||||
|
||||
def _install_sigterm_as_keyboardinterrupt() -> Any:
|
||||
"""Make SIGTERM raise KeyboardInterrupt so the engine-teardown finally runs.
|
||||
|
||||
When `study tune` is killed, a default SIGTERM skips the finally blocks and
|
||||
orphans the vLLM engine (and its EngineCore workers) on the GPUs. Converting
|
||||
SIGTERM to KeyboardInterrupt lets _terminate_process_tree run. Only installable
|
||||
from the main thread; returns the previous handler (or a sentinel).
|
||||
"""
|
||||
if threading.current_thread() is not threading.main_thread():
|
||||
return _SIGTERM_NOT_INSTALLED
|
||||
|
||||
def _handler(signum: int, frame: Any) -> None:
|
||||
raise KeyboardInterrupt()
|
||||
|
||||
try:
|
||||
return signal.signal(signal.SIGTERM, _handler)
|
||||
except (ValueError, OSError):
|
||||
return _SIGTERM_NOT_INSTALLED
|
||||
|
||||
|
||||
def _restore_sigterm(previous: Any) -> None:
|
||||
if previous is _SIGTERM_NOT_INSTALLED:
|
||||
return
|
||||
if threading.current_thread() is not threading.main_thread():
|
||||
return
|
||||
try:
|
||||
signal.signal(signal.SIGTERM, previous)
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
def _ignore_sigterm_if_main() -> None:
|
||||
"""Ignore SIGTERM during teardown so a second signal cannot orphan the engine."""
|
||||
if threading.current_thread() is not threading.main_thread():
|
||||
return
|
||||
try:
|
||||
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
def _adaptive_replay_set(
|
||||
selected: list[TraceRequest],
|
||||
*,
|
||||
@@ -568,6 +612,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
)
|
||||
|
||||
process = launch_process()
|
||||
previous_sigterm = _install_sigterm_as_keyboardinterrupt()
|
||||
probe_history: list[dict[str, Any]] = []
|
||||
failure_stage = "engine_launch"
|
||||
try:
|
||||
@@ -826,4 +871,6 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
|
||||
StudyStore.write_json(Path(trial.result_path), result)
|
||||
return result
|
||||
finally:
|
||||
_ignore_sigterm_if_main()
|
||||
_terminate_process_tree(process, timeout_s=30.0, marker_env=trial_marker_env)
|
||||
_restore_sigterm(previous_sigterm)
|
||||
|
||||
@@ -55,6 +55,8 @@ from aituner.store import StudyStore
|
||||
from aituner.trace import load_trace_requests, summarize_window
|
||||
from aituner.worker import (
|
||||
_adaptive_replay_set,
|
||||
_install_sigterm_as_keyboardinterrupt,
|
||||
_restore_sigterm,
|
||||
_should_extend_on_boundary,
|
||||
_best_feasible_probe_record,
|
||||
_latency_summary,
|
||||
@@ -589,6 +591,18 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertFalse(outcome.success)
|
||||
self.assertIn("timed out", outcome.error)
|
||||
|
||||
def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
|
||||
# So a killed `study tune` runs the engine-teardown finally instead of
|
||||
# orphaning the vLLM EngineCore workers on the GPUs.
|
||||
import signal as _signal
|
||||
|
||||
previous = _install_sigterm_as_keyboardinterrupt()
|
||||
try:
|
||||
with self.assertRaises(KeyboardInterrupt):
|
||||
_signal.raise_signal(_signal.SIGTERM)
|
||||
finally:
|
||||
_restore_sigterm(previous)
|
||||
|
||||
def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
|
||||
window = WindowRecord(
|
||||
window_id="base",
|
||||
|
||||
Reference in New Issue
Block a user