Tear down the engine on SIGTERM instead of orphaning it

Killing `study tune` with the default SIGTERM terminated the process without running
its finally blocks, leaving the vLLM engine and its EngineCore workers (which inherit
the AITUNER_* marker env) alive on the GPUs. This has twice leaked GPU memory that
needed a root reset to reclaim. Install a SIGTERM handler in run_trial that raises
KeyboardInterrupt so _terminate_process_tree runs, ignore SIGTERM during teardown so a
second signal can't re-orphan the engine, and restore the prior handler afterward.
The handler is main-thread-guarded and unit-tested.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 09:08:06 +08:00
parent 93ce339d61
commit b17b213575
2 changed files with 61 additions and 0 deletions

View File

@@ -55,6 +55,8 @@ from aituner.store import StudyStore
from aituner.trace import load_trace_requests, summarize_window
from aituner.worker import (
_adaptive_replay_set,
_install_sigterm_as_keyboardinterrupt,
_restore_sigterm,
_should_extend_on_boundary,
_best_feasible_probe_record,
_latency_summary,
@@ -589,6 +591,18 @@ class CoreFlowTests(unittest.TestCase):
self.assertFalse(outcome.success)
self.assertIn("timed out", outcome.error)
def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
    """SIGTERM must surface as KeyboardInterrupt once the handler is installed.

    That conversion is what lets a killed `study tune` reach its
    engine-teardown finally block instead of orphaning the vLLM EngineCore
    workers on the GPUs.
    """
    import signal as _signal

    prior_handler = _install_sigterm_as_keyboardinterrupt()
    try:
        # Send a real SIGTERM to this process; the test passes only if it
        # comes back as a KeyboardInterrupt.
        self.assertRaises(
            KeyboardInterrupt, _signal.raise_signal, _signal.SIGTERM
        )
    finally:
        # Always hand the value returned by the installer back to the restore
        # helper, even if the assertion above failed.
        _restore_sigterm(prior_handler)
def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
window = WindowRecord(
window_id="base",