Tear down the engine on SIGTERM instead of orphaning it

Killing `study tune` with a default SIGTERM skipped the finally blocks, leaving the
vLLM engine and its EngineCore workers (which inherit the AITUNER_* marker env) alive
on the GPUs — twice leaking GPU memory that needed a root reset. Install a SIGTERM
handler in run_trial that raises KeyboardInterrupt so _terminate_process_tree runs,
ignore SIGTERM during teardown so a second signal can't re-orphan it, and restore the
prior handler afterward. Main-thread-guarded; unit-tested.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 09:08:06 +08:00
parent 93ce339d61
commit b17b213575
2 changed files with 61 additions and 0 deletions

View File

@@ -210,6 +210,50 @@ def _probe_outcome_details(
}
_SIGTERM_NOT_INSTALLED = object()
def _install_sigterm_as_keyboardinterrupt() -> Any:
"""Make SIGTERM raise KeyboardInterrupt so the engine-teardown finally runs.
When `study tune` is killed, a default SIGTERM skips the finally blocks and
orphans the vLLM engine (and its EngineCore workers) on the GPUs. Converting
SIGTERM to KeyboardInterrupt lets _terminate_process_tree run. Only installable
from the main thread; returns the previous handler (or a sentinel).
"""
if threading.current_thread() is not threading.main_thread():
return _SIGTERM_NOT_INSTALLED
def _handler(signum: int, frame: Any) -> None:
raise KeyboardInterrupt()
try:
return signal.signal(signal.SIGTERM, _handler)
except (ValueError, OSError):
return _SIGTERM_NOT_INSTALLED
def _restore_sigterm(previous: Any) -> None:
if previous is _SIGTERM_NOT_INSTALLED:
return
if threading.current_thread() is not threading.main_thread():
return
try:
signal.signal(signal.SIGTERM, previous)
except (ValueError, OSError):
pass
def _ignore_sigterm_if_main() -> None:
"""Ignore SIGTERM during teardown so a second signal cannot orphan the engine."""
if threading.current_thread() is not threading.main_thread():
return
try:
signal.signal(signal.SIGTERM, signal.SIG_IGN)
except (ValueError, OSError):
pass
def _adaptive_replay_set(
selected: list[TraceRequest],
*,
@@ -568,6 +612,7 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
)
process = launch_process()
previous_sigterm = _install_sigterm_as_keyboardinterrupt()
probe_history: list[dict[str, Any]] = []
failure_stage = "engine_launch"
try:
@@ -826,4 +871,6 @@ def run_trial(trial_spec_path: Path) -> dict[str, Any]:
StudyStore.write_json(Path(trial.result_path), result)
return result
finally:
_ignore_sigterm_if_main()
_terminate_process_tree(process, timeout_s=30.0, marker_env=trial_marker_env)
_restore_sigterm(previous_sigterm)

View File

@@ -55,6 +55,8 @@ from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
_adaptive_replay_set,
_install_sigterm_as_keyboardinterrupt,
_restore_sigterm,
_should_extend_on_boundary,
_best_feasible_probe_record,
_latency_summary,
@@ -589,6 +591,18 @@ class CoreFlowTests(unittest.TestCase):
self.assertFalse(outcome.success)
self.assertIn("timed out", outcome.error)
def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
# So a killed `study tune` runs the engine-teardown finally instead of
# orphaning the vLLM EngineCore workers on the GPUs.
import signal as _signal
previous = _install_sigterm_as_keyboardinterrupt()
try:
with self.assertRaises(KeyboardInterrupt):
_signal.raise_signal(_signal.SIGTERM)
finally:
_restore_sigterm(previous)
def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
window = WindowRecord(
window_id="base",