Tear down the engine on SIGTERM instead of orphaning it
Killing `study tune` with a default SIGTERM skipped the finally blocks, leaving the vLLM engine and its EngineCore workers (which inherit the AITUNER_* marker env) alive on the GPUs. This has twice leaked GPU memory that needed a root reset to reclaim.

Install a SIGTERM handler in run_trial that raises KeyboardInterrupt so _terminate_process_tree runs, ignore SIGTERM during teardown so a second signal can't re-orphan the engine, and restore the prior handler afterward. Main-thread-guarded; unit-tested.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -55,6 +55,8 @@ from aituner.store import StudyStore
|
||||
from aituner.trace import load_trace_requests, summarize_window
|
||||
from aituner.worker import (
|
||||
_adaptive_replay_set,
|
||||
_install_sigterm_as_keyboardinterrupt,
|
||||
_restore_sigterm,
|
||||
_should_extend_on_boundary,
|
||||
_best_feasible_probe_record,
|
||||
_latency_summary,
|
||||
@@ -589,6 +591,18 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertFalse(outcome.success)
|
||||
self.assertIn("timed out", outcome.error)
|
||||
|
||||
def test_sigterm_is_converted_to_keyboardinterrupt(self) -> None:
    # With the handler installed, SIGTERM must surface as KeyboardInterrupt.
    # That is what lets a killed `study tune` reach its engine-teardown
    # finally block rather than orphaning the vLLM EngineCore workers on
    # the GPUs.
    import signal as _signal

    saved_handler = _install_sigterm_as_keyboardinterrupt()
    try:
        # Deliver SIGTERM to this very process; the installed handler is
        # expected to raise out of the raise_signal call.
        self.assertRaises(
            KeyboardInterrupt, _signal.raise_signal, _signal.SIGTERM
        )
    finally:
        # Hand the value returned by the install helper back to the restore
        # helper, whether or not the assertion above passed.
        _restore_sigterm(saved_handler)
|
||||
|
||||
def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
|
||||
window = WindowRecord(
|
||||
window_id="base",
|
||||
|
||||
Reference in New Issue
Block a user