Add profile-driven harness planner
This commit is contained in:
@@ -0,0 +1,73 @@
|
|||||||
|
# Profile-Driven Harness Implementation Log
|
||||||
|
|
||||||
|
Date: 2026-05-12
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
The harness should accelerate AITuner as a general tuning system, not as a collection of case-specific rules. The current implementation moves the harness toward a performance-engineering loop:
|
||||||
|
|
||||||
|
1. extract a compact profile from each measured trial;
|
||||||
|
2. rank bottleneck hypotheses from workload and probe evidence;
|
||||||
|
3. generate generic candidate actions from a knob-effect model;
|
||||||
|
4. score candidates by expected bottleneck relief, information gain, launch safety, and regression risk;
|
||||||
|
5. block early stop while a high-value untested candidate remains.
|
||||||
|
|
||||||
|
This is intended to apply across qwen3.5-27b chat, qwen3-235b prefill-only, qwen3-235b decode-only, and different SLOs without encoding model names, SLO constants, or known winning configs.
|
||||||
|
|
||||||
|
## Code Changes
|
||||||
|
|
||||||
|
- `src/aituner/harness.py`
|
||||||
|
- Added `trial_profiles` to normalize trial topology, performance, probe failures, latency quantiles, and launch failure evidence.
|
||||||
|
- Added `bottleneck_hypotheses`, a ranked list instead of a single active bottleneck label.
|
||||||
|
- Added `candidate_actions`, generated from topology and runtime knob families.
|
||||||
|
- Added `experiment_plan`, which selects the next high-score candidate or declares stop readiness.
|
||||||
|
- Updated harness proposal generation to prefer the profile-driven next action before falling back to legacy deterministic proposal code.
|
||||||
|
- Updated harness stop logic so convergence/validation stop is blocked when the planner still has a high-value untested candidate.
|
||||||
|
|
||||||
|
- `tests/test_core_flow.py`
|
||||||
|
- Added coverage that a strong TP=2 incumbent with TTFT pressure still selects an unmeasured TP=4 topology candidate.
|
||||||
|
- Added coverage that decode-only TPOT pressure at max TP can prefer lowering `max-num-seqs` instead of blindly lowering TP.
|
||||||
|
|
||||||
|
## Current Scoring Model
|
||||||
|
|
||||||
|
The candidate score is intentionally generic:
|
||||||
|
|
||||||
|
```text
|
||||||
|
score = expected_bottleneck_relief * bottleneck_confidence
|
||||||
|
+ information_gain
|
||||||
|
+ launch_safety
|
||||||
|
- regression_risk
|
||||||
|
```
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
- TTFT/prefill bottleneck: increasing TP and prefill batching candidates receive relief score.
|
||||||
|
- Decode TPOT bottleneck: increasing TP is useful if a higher legal TP exists; if already at high TP, lowering decode concurrency can become the higher-value candidate.
|
||||||
|
- Admission/queueing bottleneck: more DP or higher safe concurrency receives relief score.
|
||||||
|
|
||||||
|
The scores are not tied to qwen27b/qwen235b or a fixed TPOT/TTFT threshold. They are tied to the measured bottleneck class and legal tunable space.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
Local:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m compileall -q src tests
|
||||||
|
PYTHONPATH=src python3 -m unittest tests.test_core_flow
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: `93` tests passed.
|
||||||
|
|
||||||
|
## Next Experiment
|
||||||
|
|
||||||
|
Run the same qwen3.5-27b chat 0-8k setup as the current ablation baseline:
|
||||||
|
|
||||||
|
- workload: chat, input length 0-8k
|
||||||
|
- SLO: TTFT p95 <= 4000ms, TPOT p95 <= 25ms, target pass rate 0.95
|
||||||
|
- search: full range, `inherit_incumbent_floor=false`
|
||||||
|
- budget: 12 total tuning iterations
|
||||||
|
- LLM model: `gpt-5.4`
|
||||||
|
- variant: harness enabled with profile-driven planner
|
||||||
|
|
||||||
|
The no-harness min-prompt baseline is already available and only needs to be reused for comparison unless the setup changes.
|
||||||
|
|
||||||
@@ -32,21 +32,45 @@ def build_harness_context(
|
|||||||
state: StudyState,
|
state: StudyState,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
recent_diagnostics = _recent_trial_diagnostics(state)
|
recent_diagnostics = _recent_trial_diagnostics(state)
|
||||||
|
trial_profiles = _trial_profiles(study, recent_diagnostics)
|
||||||
|
bottleneck_hypotheses = _rank_bottleneck_hypotheses(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
trial_profiles,
|
||||||
|
)
|
||||||
|
experiment_plan = _experiment_plan(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
trial_profiles,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"paper_alignment": {
|
"paper_alignment": {
|
||||||
"goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
|
"goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
|
||||||
"feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.",
|
"feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.",
|
||||||
"trial_policy": "Map the active bottleneck to one knob family, apply guard conditions, and stop when the incumbent has converged.",
|
"trial_policy": "Profile measured trials, rank bottleneck hypotheses, score generic candidate actions, and stop only when no useful measured hypothesis remains.",
|
||||||
},
|
},
|
||||||
"workload_lca_profile": _workload_lca_profile(window_summary),
|
"workload_lca_profile": _workload_lca_profile(window_summary),
|
||||||
"recent_trial_diagnostics": recent_diagnostics,
|
"recent_trial_diagnostics": recent_diagnostics,
|
||||||
|
"trial_profiles": trial_profiles,
|
||||||
|
"bottleneck_hypotheses": bottleneck_hypotheses,
|
||||||
|
"candidate_actions": experiment_plan["candidate_actions"],
|
||||||
|
"experiment_plan": experiment_plan,
|
||||||
"convergence_guard": _convergence_guard(state, recent_diagnostics),
|
"convergence_guard": _convergence_guard(state, recent_diagnostics),
|
||||||
"harness_stop": _harness_stop_decision(study, state, recent_diagnostics),
|
"harness_stop": _harness_stop_decision(
|
||||||
|
study,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
experiment_plan=experiment_plan,
|
||||||
|
),
|
||||||
"harness_proposal": _harness_proposal_decision(
|
"harness_proposal": _harness_proposal_decision(
|
||||||
study,
|
study,
|
||||||
window_summary,
|
window_summary,
|
||||||
state,
|
state,
|
||||||
recent_diagnostics,
|
recent_diagnostics,
|
||||||
|
experiment_plan=experiment_plan,
|
||||||
),
|
),
|
||||||
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
|
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
|
||||||
"proposal_rules": _proposal_rules(),
|
"proposal_rules": _proposal_rules(),
|
||||||
@@ -348,6 +372,164 @@ def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
|
|||||||
return diagnostics
|
return diagnostics
|
||||||
|
|
||||||
|
|
||||||
|
def _trial_profiles(
|
||||||
|
study: StudySpec,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
profiles: list[dict[str, Any]] = []
|
||||||
|
for item in recent_diagnostics:
|
||||||
|
flags = _effective_flags_for_item(study, item)
|
||||||
|
probe_summary = item.get("probe_summary")
|
||||||
|
best_probe = None
|
||||||
|
last_probe = None
|
||||||
|
all_infeasible = None
|
||||||
|
if isinstance(probe_summary, dict):
|
||||||
|
best_probe = probe_summary.get("best_feasible_probe")
|
||||||
|
last_probe = probe_summary.get("last_probe")
|
||||||
|
all_infeasible = probe_summary.get("all_infeasible")
|
||||||
|
limiting_probe = (
|
||||||
|
last_probe
|
||||||
|
if isinstance(last_probe, dict)
|
||||||
|
else all_infeasible
|
||||||
|
if isinstance(all_infeasible, dict)
|
||||||
|
else best_probe
|
||||||
|
if isinstance(best_probe, dict)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
latency = limiting_probe.get("latency_summary") if isinstance(limiting_probe, dict) else {}
|
||||||
|
if not isinstance(latency, dict):
|
||||||
|
latency = {}
|
||||||
|
failed_counts = latency.get("failed_reason_counts")
|
||||||
|
if not isinstance(failed_counts, dict):
|
||||||
|
failed_counts = {}
|
||||||
|
profile = {
|
||||||
|
"trial_id": item.get("trial_id"),
|
||||||
|
"status": item.get("status"),
|
||||||
|
"config_patch": item.get("config_patch") if isinstance(item.get("config_patch"), dict) else {},
|
||||||
|
"topology": {
|
||||||
|
"tensor_parallel_size": _parse_int_like(
|
||||||
|
flags.get("tensor-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"data_parallel_size": _parse_int_like(
|
||||||
|
flags.get("data-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"expert_parallel_size": _parse_int_like(
|
||||||
|
flags.get("expert-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"enable_expert_parallel": bool(flags.get("enable-expert-parallel", False)),
|
||||||
|
},
|
||||||
|
"performance": {
|
||||||
|
"best_request_rate": item.get("best_request_rate"),
|
||||||
|
"best_request_rate_per_gpu": item.get("best_request_rate_per_gpu"),
|
||||||
|
"best_pass_rate": item.get("best_pass_rate"),
|
||||||
|
},
|
||||||
|
"probe_profile": {
|
||||||
|
"best_feasible_probe": best_probe,
|
||||||
|
"limiting_probe": limiting_probe,
|
||||||
|
"active_bottleneck": item.get("active_bottleneck"),
|
||||||
|
"failed_reason_counts": failed_counts,
|
||||||
|
"latency_quantiles": {
|
||||||
|
"ttft_ms": latency.get("ttft_ms") if isinstance(latency.get("ttft_ms"), dict) else {},
|
||||||
|
"tpot_ms": latency.get("tpot_ms") if isinstance(latency.get("tpot_ms"), dict) else {},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"failure_profile": {
|
||||||
|
"failure_stage": item.get("failure_stage"),
|
||||||
|
"failure_reason": item.get("failure_reason"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
profiles.append(profile)
|
||||||
|
return profiles
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_bottleneck_hypotheses(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
scores = {
|
||||||
|
"ttft_prefill": 0.0,
|
||||||
|
"decode_tpot": 0.0,
|
||||||
|
"admission_or_queueing": 0.0,
|
||||||
|
"launch_or_memory": 0.0,
|
||||||
|
}
|
||||||
|
evidence: dict[str, list[str]] = {name: [] for name in scores}
|
||||||
|
|
||||||
|
default = _workload_default_bottleneck(study, window_summary)
|
||||||
|
if default in scores:
|
||||||
|
scores[default] += 0.18
|
||||||
|
evidence[default].append(f"workload default bottleneck is {default}")
|
||||||
|
|
||||||
|
if study.trace.request_mode == "decode_only" and study.slo.tpot_rule is not None:
|
||||||
|
scores["decode_tpot"] += 0.22
|
||||||
|
evidence["decode_tpot"].append("decode_only study with configured TPOT SLO")
|
||||||
|
if study.slo.ttft_rule is not None:
|
||||||
|
prompt_p95 = _as_float(window_summary.get("prompt_tokens_p95"))
|
||||||
|
if prompt_p95 >= 4096:
|
||||||
|
scores["ttft_prefill"] += 0.14
|
||||||
|
evidence["ttft_prefill"].append("long prompt p95 makes TTFT/prefill plausible")
|
||||||
|
|
||||||
|
for profile in trial_profiles[-stateful_history_limit() :]:
|
||||||
|
active = str(profile.get("probe_profile", {}).get("active_bottleneck") or "")
|
||||||
|
if active in scores:
|
||||||
|
weight = 0.34 if profile is trial_profiles[-1] else 0.18
|
||||||
|
scores[active] += weight
|
||||||
|
evidence[active].append(
|
||||||
|
f"{profile.get('trial_id')} probe diagnosis is {active}"
|
||||||
|
)
|
||||||
|
failed = profile.get("probe_profile", {}).get("failed_reason_counts")
|
||||||
|
if not isinstance(failed, dict):
|
||||||
|
failed = {}
|
||||||
|
ttft_count = sum(int(v) for k, v in failed.items() if str(k).startswith("ttft"))
|
||||||
|
tpot_count = sum(int(v) for k, v in failed.items() if str(k).startswith("tpot"))
|
||||||
|
elapsed_count = sum(
|
||||||
|
int(v)
|
||||||
|
for k, v in failed.items()
|
||||||
|
if str(k).startswith("probe_elapsed_s>")
|
||||||
|
or str(k).startswith("arrival_lag_s>")
|
||||||
|
)
|
||||||
|
total = max(ttft_count + tpot_count + elapsed_count, 1)
|
||||||
|
if ttft_count:
|
||||||
|
scores["ttft_prefill"] += min(0.24, 0.24 * ttft_count / total)
|
||||||
|
evidence["ttft_prefill"].append(
|
||||||
|
f"{profile.get('trial_id')} TTFT failures={ttft_count}"
|
||||||
|
)
|
||||||
|
if tpot_count:
|
||||||
|
scores["decode_tpot"] += min(0.24, 0.24 * tpot_count / total)
|
||||||
|
evidence["decode_tpot"].append(
|
||||||
|
f"{profile.get('trial_id')} TPOT failures={tpot_count}"
|
||||||
|
)
|
||||||
|
if elapsed_count:
|
||||||
|
scores["admission_or_queueing"] += min(0.18, 0.18 * elapsed_count / total)
|
||||||
|
evidence["admission_or_queueing"].append(
|
||||||
|
f"{profile.get('trial_id')} queue/elapsed failures={elapsed_count}"
|
||||||
|
)
|
||||||
|
failure_stage = str(profile.get("failure_profile", {}).get("failure_stage") or "")
|
||||||
|
failure_reason = str(profile.get("failure_profile", {}).get("failure_reason") or "")
|
||||||
|
if failure_stage == "engine_launch" or "out of memory" in failure_reason.lower():
|
||||||
|
scores["launch_or_memory"] += 0.4
|
||||||
|
evidence["launch_or_memory"].append(
|
||||||
|
f"{profile.get('trial_id')} launch or memory failure"
|
||||||
|
)
|
||||||
|
|
||||||
|
ranked = []
|
||||||
|
for name, score in scores.items():
|
||||||
|
if score <= 0:
|
||||||
|
continue
|
||||||
|
ranked.append(
|
||||||
|
{
|
||||||
|
"name": name,
|
||||||
|
"confidence": min(0.99, round(score, 4)),
|
||||||
|
"evidence": evidence[name][:6],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
ranked.sort(key=lambda item: item["confidence"], reverse=True)
|
||||||
|
return ranked
|
||||||
|
|
||||||
|
|
||||||
def stateful_history_limit() -> int:
|
def stateful_history_limit() -> int:
|
||||||
return 8
|
return 8
|
||||||
|
|
||||||
@@ -513,6 +695,8 @@ def _harness_stop_decision(
|
|||||||
study: StudySpec,
|
study: StudySpec,
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
recent_diagnostics: list[dict[str, Any]],
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
experiment_plan: dict[str, Any] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
high_saturation = _search_high_saturation_guard(study, state, recent_diagnostics)
|
high_saturation = _search_high_saturation_guard(study, state, recent_diagnostics)
|
||||||
if high_saturation["saturated"]:
|
if high_saturation["saturated"]:
|
||||||
@@ -528,6 +712,17 @@ def _harness_stop_decision(
|
|||||||
"reason": "topology_frontier_requires_probe",
|
"reason": "topology_frontier_requires_probe",
|
||||||
"evidence": topology_frontier,
|
"evidence": topology_frontier,
|
||||||
}
|
}
|
||||||
|
if experiment_plan is not None and experiment_plan.get("next_action"):
|
||||||
|
action = experiment_plan["next_action"]
|
||||||
|
if isinstance(action, dict) and _as_float(action.get("score")) >= 0.35:
|
||||||
|
return {
|
||||||
|
"should_stop": False,
|
||||||
|
"reason": "experiment_plan_has_high_value_candidate",
|
||||||
|
"evidence": {
|
||||||
|
"summary": "The profile-driven planner still has a useful measured hypothesis to test.",
|
||||||
|
"next_action": action,
|
||||||
|
},
|
||||||
|
}
|
||||||
guard = _convergence_guard(state, recent_diagnostics)
|
guard = _convergence_guard(state, recent_diagnostics)
|
||||||
if guard["deterministic_stop"]:
|
if guard["deterministic_stop"]:
|
||||||
return {
|
return {
|
||||||
@@ -562,6 +757,8 @@ def _harness_proposal_decision(
|
|||||||
window_summary: dict[str, Any],
|
window_summary: dict[str, Any],
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
recent_diagnostics: list[dict[str, Any]],
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
experiment_plan: dict[str, Any] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
default = {
|
default = {
|
||||||
"should_propose": False,
|
"should_propose": False,
|
||||||
@@ -575,6 +772,26 @@ def _harness_proposal_decision(
|
|||||||
for item in recent_diagnostics
|
for item in recent_diagnostics
|
||||||
}
|
}
|
||||||
tested_signatures.update(_state_tested_signatures(state))
|
tested_signatures.update(_state_tested_signatures(state))
|
||||||
|
if experiment_plan is not None:
|
||||||
|
next_action = experiment_plan.get("next_action")
|
||||||
|
if isinstance(next_action, dict) and _as_float(next_action.get("score")) >= 0.35:
|
||||||
|
patch = next_action.get("config_patch")
|
||||||
|
if isinstance(patch, dict):
|
||||||
|
signature = _config_signature(patch)
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
return {
|
||||||
|
"should_propose": True,
|
||||||
|
"reason": str(next_action.get("action_id") or "profile_driven_candidate"),
|
||||||
|
"diagnosis": str(next_action.get("hypothesis") or "Profile-driven harness candidate."),
|
||||||
|
"config_patch": patch,
|
||||||
|
"expected_effects": [
|
||||||
|
str(item)
|
||||||
|
for item in next_action.get("expected_effects", [])
|
||||||
|
if isinstance(item, str)
|
||||||
|
],
|
||||||
|
"candidate_score": next_action.get("score"),
|
||||||
|
"bottleneck_hypotheses": experiment_plan.get("bottleneck_hypotheses", []),
|
||||||
|
}
|
||||||
baseline = recent_diagnostics[0] if recent_diagnostics else {}
|
baseline = recent_diagnostics[0] if recent_diagnostics else {}
|
||||||
topology_frontier = _topology_frontier_proposal(
|
topology_frontier = _topology_frontier_proposal(
|
||||||
study,
|
study,
|
||||||
@@ -691,6 +908,526 @@ def _topology_frontier_proposal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _experiment_plan(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
tested_signatures = {
|
||||||
|
_config_signature(item.get("config_patch") if isinstance(item, dict) else None)
|
||||||
|
for item in recent_diagnostics
|
||||||
|
}
|
||||||
|
tested_signatures.update(_state_tested_signatures(state))
|
||||||
|
candidates = _candidate_actions(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
trial_profiles,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures=tested_signatures,
|
||||||
|
)
|
||||||
|
candidates.sort(key=lambda item: _as_float(item.get("score")), reverse=True)
|
||||||
|
next_action = candidates[0] if candidates else None
|
||||||
|
return {
|
||||||
|
"planner_version": "profile-driven-v1",
|
||||||
|
"bottleneck_hypotheses": bottleneck_hypotheses,
|
||||||
|
"candidate_actions": candidates[:8],
|
||||||
|
"next_action": next_action,
|
||||||
|
"stop_ready": next_action is None or _as_float(next_action.get("score")) < 0.35,
|
||||||
|
"stop_rationale": (
|
||||||
|
"no untested high-value candidate remains"
|
||||||
|
if not candidates or _as_float(candidates[0].get("score")) < 0.35
|
||||||
|
else "continue with the highest-scoring measured hypothesis"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
if not recent_diagnostics:
|
||||||
|
return []
|
||||||
|
anchor = _anchor_profile(study, state, recent_diagnostics, trial_profiles)
|
||||||
|
if anchor is None:
|
||||||
|
return []
|
||||||
|
top_bottleneck = (
|
||||||
|
str(bottleneck_hypotheses[0]["name"])
|
||||||
|
if bottleneck_hypotheses
|
||||||
|
else str(anchor.get("probe_profile", {}).get("active_bottleneck") or "")
|
||||||
|
)
|
||||||
|
candidates: list[dict[str, Any]] = []
|
||||||
|
candidates.extend(
|
||||||
|
_topology_candidate_actions(
|
||||||
|
study,
|
||||||
|
anchor,
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
candidates.extend(
|
||||||
|
_runtime_candidate_actions(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
anchor,
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _anchor_profile(
|
||||||
|
study: StudySpec,
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
) -> dict[str, Any] | None:
|
||||||
|
if state.best_trial_id:
|
||||||
|
for profile in reversed(trial_profiles):
|
||||||
|
if profile.get("trial_id") == state.best_trial_id:
|
||||||
|
return profile
|
||||||
|
for profile in reversed(trial_profiles):
|
||||||
|
if profile.get("status") == "completed":
|
||||||
|
return profile
|
||||||
|
return trial_profiles[-1] if trial_profiles else None
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
anchor: dict[str, Any],
|
||||||
|
top_bottleneck: str,
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
if not ({"tensor-parallel-size", "data-parallel-size"} & set(study.engine.tunable_flags)):
|
||||||
|
return []
|
||||||
|
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||||
|
current_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=1)
|
||||||
|
current_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=1)
|
||||||
|
current_ep = _parse_int_like(anchor_flags.get("expert-parallel-size"), default=1)
|
||||||
|
current_enable_ep = bool(anchor_flags.get("enable-expert-parallel", False))
|
||||||
|
legal = _legal_topology_points(
|
||||||
|
study,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
current_ep=current_ep,
|
||||||
|
current_enable_ep=current_enable_ep,
|
||||||
|
)
|
||||||
|
actions: list[dict[str, Any]] = []
|
||||||
|
for point in legal:
|
||||||
|
if point["tensor-parallel-size"] == current_tp and point["data-parallel-size"] == current_dp:
|
||||||
|
continue
|
||||||
|
patch = _topology_patch(study, point)
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
score, factors = _score_topology_candidate(
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
candidate_tp=point["tensor-parallel-size"],
|
||||||
|
candidate_dp=point["data-parallel-size"],
|
||||||
|
)
|
||||||
|
if score <= 0:
|
||||||
|
continue
|
||||||
|
action_id = _topology_action_id(current_tp, current_dp, point)
|
||||||
|
actions.append(
|
||||||
|
{
|
||||||
|
"action_id": action_id,
|
||||||
|
"knob_family": "topology",
|
||||||
|
"score": round(score, 4),
|
||||||
|
"score_factors": factors,
|
||||||
|
"config_patch": {"env_patch": {}, "flag_patch": patch},
|
||||||
|
"hypothesis": _topology_hypothesis(
|
||||||
|
top_bottleneck,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
candidate_tp=point["tensor-parallel-size"],
|
||||||
|
candidate_dp=point["data-parallel-size"],
|
||||||
|
),
|
||||||
|
"expected_effects": [
|
||||||
|
"measure whether topology changes relieve the ranked bottleneck",
|
||||||
|
"compare request_rate_per_gpu under the configured SLO, not raw throughput alone",
|
||||||
|
"reject this hypothesis if latency improves but per-GPU throughput regresses materially",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return actions
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
anchor: dict[str, Any],
|
||||||
|
top_bottleneck: str,
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
tunable = set(study.engine.tunable_flags)
|
||||||
|
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||||
|
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
||||||
|
actions: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
if "max-num-batched-tokens" in tunable:
|
||||||
|
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||||
|
mbt_targets: list[tuple[str, int]] = []
|
||||||
|
if top_bottleneck == "ttft_prefill":
|
||||||
|
target = (
|
||||||
|
_initial_mbt_from_window(window_summary)
|
||||||
|
if current_mbt <= 0
|
||||||
|
else _next_mbt_step(current_mbt)
|
||||||
|
)
|
||||||
|
if target is not None:
|
||||||
|
mbt_targets.append(("raise_mbt", target))
|
||||||
|
elif top_bottleneck == "decode_tpot" and current_mbt > 8192:
|
||||||
|
mbt_targets.append(("lower_mbt", max(8192, current_mbt // 2)))
|
||||||
|
for action_id, target in mbt_targets:
|
||||||
|
patch = {**topology_patch, "max-num-batched-tokens": target}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
relief = 0.24 if top_bottleneck == "ttft_prefill" else 0.14
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id=action_id,
|
||||||
|
knob_family="max-num-batched-tokens",
|
||||||
|
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Adjust max-num-batched-tokens to test whether batching, not topology, "
|
||||||
|
"is limiting the active latency objective."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"change prefill/decode batching pressure on the incumbent topology",
|
||||||
|
"confirm if the latency knee moves without requiring another topology change",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if "max-num-seqs" in tunable:
|
||||||
|
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||||
|
mns_targets: list[tuple[str, int]] = []
|
||||||
|
if top_bottleneck == "admission_or_queueing":
|
||||||
|
target = max(8, int(current_mns * 1.5)) if current_mns > 0 else 64
|
||||||
|
mns_targets.append(("raise_max_num_seqs", _round_up_to_multiple(target, 8)))
|
||||||
|
elif top_bottleneck == "decode_tpot" and current_mns > 8:
|
||||||
|
mns_targets.append(("lower_max_num_seqs", max(8, current_mns // 2)))
|
||||||
|
for action_id, target in mns_targets:
|
||||||
|
patch = {**topology_patch, "max-num-seqs": target}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
relief = 0.25 if top_bottleneck in {"decode_tpot", "admission_or_queueing"} else 0.08
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id=action_id,
|
||||||
|
knob_family="max-num-seqs",
|
||||||
|
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Adjust max-num-seqs to test whether concurrency pressure is the "
|
||||||
|
"limiting factor under the configured SLO."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"change decode/admission concurrency on the incumbent topology",
|
||||||
|
"confirm if TPOT or queueing pressure is caused by sequence concurrency",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
|
||||||
|
current = bool(anchor_flags.get("enable-chunked-prefill", False))
|
||||||
|
if not current:
|
||||||
|
patch = {**topology_patch, "enable-chunked-prefill": True}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id="enable_chunked_prefill",
|
||||||
|
knob_family="enable-chunked-prefill",
|
||||||
|
score=0.2 + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Enable chunked prefill to test whether long-prefill head-of-line "
|
||||||
|
"blocking is driving TTFT failures."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"reduce long-prefill interference for mixed-length chat windows",
|
||||||
|
"reject if chunking overhead worsens request_rate_per_gpu",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return actions
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_action(
|
||||||
|
*,
|
||||||
|
action_id: str,
|
||||||
|
knob_family: str,
|
||||||
|
score: float,
|
||||||
|
patch: dict[str, Any],
|
||||||
|
hypothesis: str,
|
||||||
|
expected_effects: list[str],
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"action_id": action_id,
|
||||||
|
"knob_family": knob_family,
|
||||||
|
"score": round(score, 4),
|
||||||
|
"score_factors": {
|
||||||
|
"expected_bottleneck_relief": round(max(score - 0.1, 0.0), 4),
|
||||||
|
"information_gain": 0.1,
|
||||||
|
"launch_safety": 0.05,
|
||||||
|
"regression_risk": 0.05,
|
||||||
|
},
|
||||||
|
"config_patch": {"env_patch": {}, "flag_patch": patch},
|
||||||
|
"hypothesis": hypothesis,
|
||||||
|
"expected_effects": expected_effects,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _legal_topology_points(
|
||||||
|
study: StudySpec,
|
||||||
|
*,
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
current_ep: int,
|
||||||
|
current_enable_ep: bool,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
constraints = study.engine.topology_constraints
|
||||||
|
tunable = set(study.engine.tunable_flags)
|
||||||
|
if constraints is not None and constraints.allowed_tensor_parallel_sizes:
|
||||||
|
tp_values = sorted(set(constraints.allowed_tensor_parallel_sizes))
|
||||||
|
elif "tensor-parallel-size" in tunable:
|
||||||
|
tp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
|
||||||
|
else:
|
||||||
|
tp_values = [current_tp]
|
||||||
|
|
||||||
|
if constraints is not None and constraints.allowed_data_parallel_sizes:
|
||||||
|
dp_values = sorted(set(constraints.allowed_data_parallel_sizes))
|
||||||
|
elif "data-parallel-size" in tunable:
|
||||||
|
dp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
|
||||||
|
else:
|
||||||
|
dp_values = [current_dp]
|
||||||
|
|
||||||
|
if constraints is not None and constraints.allowed_expert_parallel_sizes:
|
||||||
|
ep_values = sorted(set(constraints.allowed_expert_parallel_sizes))
|
||||||
|
elif "expert-parallel-size" in tunable:
|
||||||
|
ep_values = sorted({1, current_ep})
|
||||||
|
else:
|
||||||
|
ep_values = [current_ep]
|
||||||
|
|
||||||
|
points: list[dict[str, Any]] = []
|
||||||
|
for tp in tp_values:
|
||||||
|
for dp in dp_values:
|
||||||
|
tp_dp_product = tp * dp
|
||||||
|
if constraints is not None:
|
||||||
|
if (
|
||||||
|
constraints.allowed_tp_dp_products
|
||||||
|
and tp_dp_product not in constraints.allowed_tp_dp_products
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
constraints.require_tp_dp_product_equals_gpu_count
|
||||||
|
and tp_dp_product != study.hardware.gpu_count
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
elif tp_dp_product > study.hardware.gpu_count:
|
||||||
|
continue
|
||||||
|
if constraints is not None and not constraints.allowed_tp_dp_products:
|
||||||
|
if tp_dp_product > study.hardware.gpu_count:
|
||||||
|
continue
|
||||||
|
for ep in ep_values:
|
||||||
|
enable_ep = current_enable_ep or ep > 1
|
||||||
|
if constraints is not None:
|
||||||
|
if constraints.allowed_expert_parallel_sizes and ep not in constraints.allowed_expert_parallel_sizes:
|
||||||
|
continue
|
||||||
|
if constraints.require_ep_size_leq_tp_dp_product and ep > tp_dp_product:
|
||||||
|
continue
|
||||||
|
if constraints.require_ep_size_divides_tp_dp_product and tp_dp_product % ep != 0:
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
constraints.require_enable_expert_parallel_when_ep_gt_one
|
||||||
|
and ep > 1
|
||||||
|
and not enable_ep
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
points.append(
|
||||||
|
{
|
||||||
|
"tensor-parallel-size": tp,
|
||||||
|
"data-parallel-size": dp,
|
||||||
|
"expert-parallel-size": ep,
|
||||||
|
"enable-expert-parallel": enable_ep,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return points
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_patch(study: StudySpec, point: dict[str, Any]) -> dict[str, Any]:
    """Build a flag patch describing how *point* diverges from the base topology.

    Only keys listed in the study's tunable flags are eligible; a key is
    emitted when the candidate point's value differs from the normalized
    base-flag value, so an empty patch means "same topology as the base".
    """
    tunable_keys = set(study.engine.tunable_flags)
    base_topology = _normalized_topology_flags(study.engine.base_flags)
    topology_keys = (
        "tensor-parallel-size",
        "data-parallel-size",
        "expert-parallel-size",
        "enable-expert-parallel",
    )
    return {
        key: point[key]
        for key in topology_keys
        if key in tunable_keys and key in point and point[key] != base_topology.get(key)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _preserve_topology_patch(study: StudySpec, flags: dict[str, Any]) -> dict[str, Any]:
    """Carry forward non-default topology flags from an existing trial config.

    Normalizes *flags*, then keeps every tunable topology key whose normalized
    value differs from the normalized base flags, so applying the patch on top
    of the study baseline reproduces the trial's topology.
    """
    tunable_keys = set(study.engine.tunable_flags)
    base_topology = _normalized_topology_flags(study.engine.base_flags)
    trial_topology = _normalized_topology_flags(flags)
    topology_keys = (
        "tensor-parallel-size",
        "data-parallel-size",
        "expert-parallel-size",
        "enable-expert-parallel",
    )
    return {
        key: trial_topology[key]
        for key in topology_keys
        if key in tunable_keys
        and key in trial_topology
        and trial_topology.get(key) != base_topology.get(key)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
    """Return topology flags coerced to canonical types.

    The three parallel-size flags are parsed as ints (absent or unparsable
    values default to 1 via ``_parse_int_like``); the enable flag is coerced
    to a plain bool, defaulting to False when absent.
    """
    normalized: dict[str, Any] = {
        size_key: _parse_int_like(flags.get(size_key), default=1)
        for size_key in (
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        )
    }
    normalized["enable-expert-parallel"] = bool(flags.get("enable-expert-parallel", False))
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def _score_topology_candidate(
    top_bottleneck: str,
    bottleneck_hypotheses: list[dict[str, Any]],
    *,
    current_tp: int,
    current_dp: int,
    candidate_tp: int,
    candidate_dp: int,
) -> tuple[float, dict[str, float]]:
    """Score a topology candidate against the ranked bottleneck hypotheses.

    The score combines expected bottleneck relief (weighted by the hypothesis
    confidence, floored at 0.35 so low-evidence states still explore), a flat
    information-gain credit for any topology movement, a launch-safety credit
    that favors not growing the GPU footprint, and a regression-risk penalty
    that grows with the power-of-two distance moved.

    Returns ``(score, components)`` where each component is rounded to 4
    decimal places for reporting.
    """
    tp_step = candidate_tp - current_tp
    dp_step = candidate_dp - current_dp
    confidence = _hypothesis_confidence(bottleneck_hypotheses, top_bottleneck)

    # Relief heuristic: each named bottleneck pairs the delta that helps it
    # with (relief-when-helped, relief-otherwise); unknown labels get ~0.
    relief_table = {
        "ttft_prefill": (tp_step > 0, 0.42, 0.05),
        "decode_tpot": (tp_step > 0, 0.34, 0.02),
        "admission_or_queueing": (dp_step > 0, 0.34, 0.08),
    }
    if top_bottleneck in relief_table:
        helped, relief_hit, relief_miss = relief_table[top_bottleneck]
        relief = relief_hit if helped else relief_miss
    else:
        relief = 0.04

    moved = abs(tp_step) + abs(dp_step) > 0
    info_gain = 0.2 if moved else 0.0
    footprint_not_growing = candidate_tp * candidate_dp <= max(current_tp * current_dp, 1)
    launch_safety = 0.08 if footprint_not_growing else 0.04
    step_distance = abs(_log2_ratio(candidate_tp, current_tp)) + abs(_log2_ratio(candidate_dp, current_dp))
    regression_risk = min(0.28, 0.06 * step_distance)

    score = relief * max(confidence, 0.35) + info_gain + launch_safety - regression_risk
    components = {
        "expected_bottleneck_relief": round(relief, 4),
        "bottleneck_confidence": round(confidence, 4),
        "information_gain": round(info_gain, 4),
        "launch_safety": round(launch_safety, 4),
        "regression_risk": round(regression_risk, 4),
    }
    return score, components
|
||||||
|
|
||||||
|
|
||||||
|
def _information_gain(bottleneck_hypotheses: list[dict[str, Any]], family: str) -> float:
    """Return a baseline information-gain credit, larger when evidence is weak.

    With no hypotheses the floor of 0.08 is returned. Otherwise up to 0.12 is
    added in proportion to the uncertainty (1 - confidence) of the top-ranked
    hypothesis. *family* is currently unused — presumably reserved for
    family-specific weighting; kept to preserve the call signature.
    """
    if not bottleneck_hypotheses:
        return 0.08
    leading_confidence = _as_float(bottleneck_hypotheses[0].get("confidence"))
    residual_uncertainty = max(0.0, 1.0 - leading_confidence)
    return 0.08 + min(0.12, residual_uncertainty * 0.12)
|
||||||
|
|
||||||
|
|
||||||
|
def _hypothesis_confidence(
    bottleneck_hypotheses: list[dict[str, Any]],
    name: str,
) -> float:
    """Look up the confidence for hypothesis *name*; 0.0 when not present."""
    matches = (entry for entry in bottleneck_hypotheses if entry.get("name") == name)
    hit = next(matches, None)
    if hit is None:
        return 0.0
    return _as_float(hit.get("confidence"))
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_action_id(
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
point: dict[str, Any],
|
||||||
|
) -> str:
|
||||||
|
candidate_tp = int(point["tensor-parallel-size"])
|
||||||
|
candidate_dp = int(point["data-parallel-size"])
|
||||||
|
if candidate_tp > current_tp:
|
||||||
|
return "topology_frontier_probe_for_slo_pressure"
|
||||||
|
if candidate_dp > current_dp:
|
||||||
|
return "increase_data_parallel_probe"
|
||||||
|
if candidate_tp < current_tp:
|
||||||
|
return "decrease_tensor_parallel_probe"
|
||||||
|
return "redistribute_topology_probe"
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_hypothesis(
|
||||||
|
top_bottleneck: str,
|
||||||
|
*,
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
candidate_tp: int,
|
||||||
|
candidate_dp: int,
|
||||||
|
) -> str:
|
||||||
|
return (
|
||||||
|
f"Ranked bottleneck is {top_bottleneck}. Test topology "
|
||||||
|
f"TP={candidate_tp}, DP={candidate_dp} against incumbent TP={current_tp}, "
|
||||||
|
f"DP={current_dp}; this distinguishes compute-latency relief from "
|
||||||
|
"replica/admission effects under the configured SLO."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _log2_ratio(new: int, old: int) -> float:
|
||||||
|
if new <= 0 or old <= 0:
|
||||||
|
return 0.0
|
||||||
|
ratio = new / old
|
||||||
|
steps = 0.0
|
||||||
|
while ratio >= 2.0:
|
||||||
|
steps += 1.0
|
||||||
|
ratio /= 2.0
|
||||||
|
while ratio <= 0.5:
|
||||||
|
steps += 1.0
|
||||||
|
ratio *= 2.0
|
||||||
|
return steps
|
||||||
|
|
||||||
|
|
||||||
def _topology_frontier_status(
|
def _topology_frontier_status(
|
||||||
study: StudySpec,
|
study: StudySpec,
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
|
|||||||
@@ -920,6 +920,170 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
"topology_frontier_probe_for_slo_pressure",
|
"topology_frontier_probe_for_slo_pressure",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_profile_driven_planner_scores_unmeasured_tp_frontier(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
tmp_path = Path(tmp)
|
||||||
|
study_path = _write_study_assets(
|
||||||
|
tmp_path,
|
||||||
|
engine_overrides={
|
||||||
|
"tunable_flags": [
|
||||||
|
"tensor-parallel-size",
|
||||||
|
"max-num-batched-tokens",
|
||||||
|
"enable-chunked-prefill",
|
||||||
|
],
|
||||||
|
"topology_constraints": {
|
||||||
|
"allowed_tensor_parallel_sizes": [1, 2, 4],
|
||||||
|
"allowed_tp_dp_products": [1, 2, 4],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result_path = tmp_path / "trial-0002.json"
|
||||||
|
result_path.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"status": "completed",
|
||||||
|
"best_sampling_u": 0.5,
|
||||||
|
"best_request_rate": 2.0,
|
||||||
|
"best_pass_rate": 0.96,
|
||||||
|
"probes": [
|
||||||
|
{
|
||||||
|
"threshold": 0.75,
|
||||||
|
"feasible": False,
|
||||||
|
"payload": {
|
||||||
|
"request_count": 100,
|
||||||
|
"pass_rate": 0.6,
|
||||||
|
"request_rate": 3.0,
|
||||||
|
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||||
|
"latency_summary": {
|
||||||
|
"failed_reason_counts": {"ttft_ms>4000.0": 35}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
study = load_study_spec(study_path)
|
||||||
|
context = build_harness_context(
|
||||||
|
study=study,
|
||||||
|
window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
|
||||||
|
state=StudyState(
|
||||||
|
study_id=study.study_id,
|
||||||
|
best_trial_id="trial-0002",
|
||||||
|
best_request_rate=2.0,
|
||||||
|
best_request_rate_per_gpu=1.0,
|
||||||
|
trials=[
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0001",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=0.5,
|
||||||
|
best_request_rate_per_gpu=0.5,
|
||||||
|
config_patch={"env_patch": {}, "flag_patch": {}},
|
||||||
|
),
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0002",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=2.0,
|
||||||
|
best_request_rate_per_gpu=1.0,
|
||||||
|
result_path=str(result_path),
|
||||||
|
config_patch={
|
||||||
|
"env_patch": {},
|
||||||
|
"flag_patch": {"tensor-parallel-size": 2},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
plan = context["experiment_plan"]
|
||||||
|
self.assertEqual(plan["planner_version"], "profile-driven-v1")
|
||||||
|
self.assertEqual(plan["next_action"]["knob_family"], "topology")
|
||||||
|
self.assertEqual(
|
||||||
|
plan["next_action"]["config_patch"]["flag_patch"],
|
||||||
|
{"tensor-parallel-size": 4},
|
||||||
|
)
|
||||||
|
self.assertIn("ttft_prefill", context["bottleneck_hypotheses"][0]["name"])
|
||||||
|
self.assertFalse(context["harness_stop"]["should_stop"])
|
||||||
|
|
||||||
|
def test_profile_driven_planner_prefers_decode_concurrency_relief(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
tmp_path = Path(tmp)
|
||||||
|
study_path = _write_study_assets(
|
||||||
|
tmp_path,
|
||||||
|
trace_overrides={"request_mode": "decode_only"},
|
||||||
|
slo_overrides={
|
||||||
|
"ttft_rule": None,
|
||||||
|
"tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
|
||||||
|
},
|
||||||
|
engine_overrides={
|
||||||
|
"base_flags": {
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 8000,
|
||||||
|
"tensor-parallel-size": 4,
|
||||||
|
"max-num-seqs": 64,
|
||||||
|
},
|
||||||
|
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
|
||||||
|
"topology_constraints": {
|
||||||
|
"allowed_tensor_parallel_sizes": [1, 2, 4],
|
||||||
|
"allowed_tp_dp_products": [1, 2, 4],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result_path = tmp_path / "trial-0001.json"
|
||||||
|
result_path.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"status": "completed",
|
||||||
|
"best_sampling_u": 0.25,
|
||||||
|
"best_request_rate": 1.0,
|
||||||
|
"best_pass_rate": 0.97,
|
||||||
|
"probes": [
|
||||||
|
{
|
||||||
|
"threshold": 0.5,
|
||||||
|
"feasible": False,
|
||||||
|
"payload": {
|
||||||
|
"request_count": 100,
|
||||||
|
"pass_rate": 0.5,
|
||||||
|
"request_rate": 2.0,
|
||||||
|
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||||
|
"latency_summary": {
|
||||||
|
"failed_reason_counts": {"tpot_ms>20.0": 50}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
study = load_study_spec(study_path)
|
||||||
|
context = build_harness_context(
|
||||||
|
study=study,
|
||||||
|
window_summary={},
|
||||||
|
state=StudyState(
|
||||||
|
study_id=study.study_id,
|
||||||
|
best_trial_id="trial-0001",
|
||||||
|
best_request_rate=1.0,
|
||||||
|
best_request_rate_per_gpu=0.25,
|
||||||
|
trials=[
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0001",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=1.0,
|
||||||
|
best_request_rate_per_gpu=0.25,
|
||||||
|
result_path=str(result_path),
|
||||||
|
config_patch={"env_patch": {}, "flag_patch": {}},
|
||||||
|
)
|
||||||
|
],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
plan = context["experiment_plan"]
|
||||||
|
self.assertEqual(plan["next_action"]["knob_family"], "max-num-seqs")
|
||||||
|
self.assertEqual(
|
||||||
|
plan["next_action"]["config_patch"]["flag_patch"],
|
||||||
|
{"max-num-seqs": 32},
|
||||||
|
)
|
||||||
|
|
||||||
def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
|
def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
|
|||||||
Reference in New Issue
Block a user