Integrate descriptor runtime candidates into harness
This commit is contained in:
@@ -2594,6 +2594,119 @@ class CoreFlowTests(unittest.TestCase):
|
||||
)
|
||||
self.assertNotIn("tensor-parallel-size", proposal.config_patch.flag_patch)
|
||||
|
||||
def test_descriptor_candidates_expose_bad_runtime_recovery_without_preempting_topology(
|
||||
self,
|
||||
) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
study_path = _write_study_assets(
|
||||
tmp_path,
|
||||
slo_overrides={
|
||||
"ttft_rule": {"kind": "fixed_ms", "threshold_ms": 4000},
|
||||
"tpot_rule": {"kind": "fixed_ms", "threshold_ms": 50},
|
||||
},
|
||||
engine_overrides={
|
||||
"base_flags": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 8000,
|
||||
"tensor-parallel-size": 2,
|
||||
"data-parallel-size": 1,
|
||||
"gpu-memory-utilization": 0.5,
|
||||
"max-num-seqs": 8,
|
||||
},
|
||||
"tunable_flags": [
|
||||
"tensor-parallel-size",
|
||||
"data-parallel-size",
|
||||
"gpu-memory-utilization",
|
||||
"max-num-seqs",
|
||||
],
|
||||
"topology_constraints": {
|
||||
"allowed_tensor_parallel_sizes": [2, 4, 8],
|
||||
"allowed_data_parallel_sizes": [1],
|
||||
"allowed_tp_dp_products": [2, 4, 8],
|
||||
},
|
||||
},
|
||||
)
|
||||
study = load_study_spec(study_path)
|
||||
result_path = tmp_path / "trial-0001.json"
|
||||
result_path.write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"status": "completed",
|
||||
"best_sampling_u": 0.05,
|
||||
"best_request_rate": 3.4667,
|
||||
"best_pass_rate": 0.9663,
|
||||
"probes": [
|
||||
{
|
||||
"threshold": 0.05,
|
||||
"feasible": True,
|
||||
"payload": {
|
||||
"request_rate": 3.4667,
|
||||
"pass_rate": 0.9663,
|
||||
"latency_summary": {"failed_reason_counts": {}},
|
||||
},
|
||||
},
|
||||
{
|
||||
"threshold": 0.08,
|
||||
"feasible": False,
|
||||
"payload": {
|
||||
"request_rate": 4.0,
|
||||
"pass_rate": 0.5,
|
||||
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||
"latency_summary": {
|
||||
"failed_reason_counts": {"ttft_ms>4000.0": 120}
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
state = StudyState(
|
||||
study_id=study.study_id,
|
||||
best_trial_id="trial-0001",
|
||||
best_request_rate=3.4667,
|
||||
best_request_rate_per_gpu=1.73335,
|
||||
trials=[
|
||||
TrialSummary(
|
||||
trial_id="trial-0001",
|
||||
status="completed",
|
||||
parallel_size=2,
|
||||
best_request_rate=3.4667,
|
||||
best_request_rate_per_gpu=1.73335,
|
||||
result_path=str(result_path),
|
||||
config_patch={"env_patch": {}, "flag_patch": {}},
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
context = build_harness_context(
|
||||
study=study,
|
||||
window_summary={
|
||||
"prompt_tokens_p95": 6500,
|
||||
"prompt_tail_ratio_p95_p50": 3.0,
|
||||
},
|
||||
state=state,
|
||||
)
|
||||
|
||||
next_action = context["experiment_plan"]["next_action"]
|
||||
self.assertEqual(next_action["knob_family"], "topology")
|
||||
descriptor_patches = [
|
||||
action["config_patch"]["flag_patch"]
|
||||
for action in context["experiment_plan"]["candidate_actions"]
|
||||
if str(action["knob_family"]).startswith("descriptor:")
|
||||
]
|
||||
self.assertTrue(
|
||||
any(patch.get("max-num-seqs") == 24 for patch in descriptor_patches)
|
||||
)
|
||||
self.assertTrue(
|
||||
any(
|
||||
patch.get("gpu-memory-utilization") == 0.9
|
||||
for patch in descriptor_patches
|
||||
)
|
||||
)
|
||||
|
||||
def test_harness_stops_gpu_mem_util_climb_after_tied_same_topology_probe(self) -> None:
|
||||
"""A same-topology gpu-memory-utilization probe must improve per-GPU rate before
|
||||
the hill-climb continues; launch success alone is not evidence to keep climbing."""
|
||||
|
||||
@@ -4,7 +4,10 @@ import unittest
|
||||
|
||||
from aituner.engine_adapters.vllm import default_vllm_descriptors
|
||||
from aituner.knob_descriptor import KnobConstraints, KnobDescriptor
|
||||
from aituner.mechanism_planner import coordinate_line_search_candidates
|
||||
from aituner.mechanism_planner import (
|
||||
CoordinateSearchPolicy,
|
||||
coordinate_line_search_candidates,
|
||||
)
|
||||
|
||||
|
||||
class MechanismPlannerTests(unittest.TestCase):
|
||||
@@ -56,12 +59,26 @@ class MechanismPlannerTests(unittest.TestCase):
|
||||
descriptor = default_vllm_descriptors(tunable_flags=("gpu-memory-utilization",))[0]
|
||||
|
||||
candidates = coordinate_line_search_candidates(
|
||||
current_config={"gpu-memory-utilization": 0.98},
|
||||
current_config={"gpu-memory-utilization": 0.96},
|
||||
descriptors=(descriptor,),
|
||||
evidence_weights={"kv_memory_capacity": 0.8},
|
||||
)
|
||||
|
||||
self.assertEqual(candidates[0].patch, {"gpu-memory-utilization": 1.0})
|
||||
self.assertEqual(candidates[0].patch, {"gpu-memory-utilization": 0.97})
|
||||
|
||||
def test_coordinate_search_can_emit_larger_same_operator_steps(self) -> None:
|
||||
descriptor = default_vllm_descriptors(tunable_flags=("max-num-seqs",))[0]
|
||||
|
||||
candidates = coordinate_line_search_candidates(
|
||||
current_config={"max-num-seqs": 8},
|
||||
descriptors=(descriptor,),
|
||||
evidence_weights={"admission_capacity": 0.9},
|
||||
policy=CoordinateSearchPolicy(step_multipliers=(1.0, 2.0)),
|
||||
)
|
||||
|
||||
patches = [candidate.patch for candidate in candidates]
|
||||
self.assertIn({"max-num-seqs": 16}, patches)
|
||||
self.assertIn({"max-num-seqs": 24}, patches)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user