From 6f8e3c95c1810745ef61ba45b4c53b974880b9cd Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Mon, 15 Jun 2026 14:12:17 +0800 Subject: [PATCH] Unify harness L-C-A on the canonical lca.WorkloadProfile Phase 0 of the two-stop work. The prompt block labeled `workload_lca_profile` previously re-derived L-C-A from summarize_window's ad-hoc percentiles, diverging from the paper's 10-dim RobustScaler vector implemented in lca.py. Make that block authoritative: build_harness_context now accepts an optional workload_profile and renders the canonical 10-dim vector + per-family stats when present, falling back to the legacy rendering only when no profile is supplied (direct unit-test calls). Real call sites (study prompt/llm-propose/tune, run_baseline_then_llm) build the profile via lca.build_study_workload_profile and pass it through build_prompt. The heuristic regime classifiers keep reading window_summary; that is the heuristic layer, distinct from the similarity metric. Co-Authored-By: Claude Opus 4.8 --- scripts/run_baseline_then_llm.py | 2 ++ src/aituner/cli.py | 6 ++++ src/aituner/harness.py | 59 ++++++++++++++++++++++++++++++-- src/aituner/lca.py | 27 ++++++++++++++- src/aituner/llm.py | 7 +++- tests/test_core_flow.py | 29 ++++++++++++++++ 6 files changed, 126 insertions(+), 4 deletions(-) diff --git a/scripts/run_baseline_then_llm.py b/scripts/run_baseline_then_llm.py index 68d753e..5dd59fa 100644 --- a/scripts/run_baseline_then_llm.py +++ b/scripts/run_baseline_then_llm.py @@ -10,6 +10,7 @@ from aituner.llm import ( load_capability_profile, parse_proposal_text, ) +from aituner.lca import build_study_workload_profile from aituner.spec import load_study_spec from aituner.store import StudyStore from aituner.trace import load_trace_requests, summarize_window @@ -89,6 +90,7 @@ def main() -> int: window_summary=summarize_window(requests, window), state=state, capability_profile=capability_profile, + workload_profile=build_study_workload_profile(study, requests, window), ) prompt_name = f"prompt-{state.next_trial_index:04d}" store.write_prompt(study.study_id, prompt_name, prompt) diff --git a/src/aituner/cli.py b/src/aituner/cli.py index e24b99f..7a65c61 100644 --- a/src/aituner/cli.py +++ b/src/aituner/cli.py @@ -14,6 +14,7 @@ from .harness import ( ) from .job import append_job, build_trial_job from .lca import ( + build_study_workload_profile, build_workload_profile, resolve_length_mode, similarity_report, @@ -140,6 +141,7 @@ def cmd_study_prompt(args: argparse.Namespace) -> int: window_summary=summarize_window(requests, window), state=state, capability_profile=capability_profile, + workload_profile=build_study_workload_profile(study, requests, window), ) prompt_name = args.prompt_name or f"prompt-{state.next_trial_index:04d}" path = store.write_prompt(study.study_id, prompt_name, prompt) @@ -160,6 +162,7 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int: window_summary=summarize_window(requests, window), state=state, capability_profile=capability_profile, + workload_profile=build_study_workload_profile(study, requests, window), ) proposal_text = call_llm_for_proposal( policy=study.llm, @@ -242,11 +245,13 @@ def cmd_study_tune(args: argparse.Namespace) -> int: break window, requests = load_trace_requests(study, study_spec_path=spec_path) window_summary = summarize_window(requests, window) + workload_profile = build_study_workload_profile(study, requests, window) harness_context = ( build_harness_context( study=study, window_summary=window_summary, state=state, + workload_profile=workload_profile, ) if study.llm.use_harness else None @@ -256,6 +261,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int: window_summary=window_summary, state=state, capability_profile=capability_profile, + workload_profile=workload_profile, ) prompt_name = f"prompt-{state.next_trial_index:04d}" store.write_prompt(study.study_id, prompt_name, prompt) diff --git a/src/aituner/harness.py b/src/aituner/harness.py index 693655d..0036c66 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -4,6 +4,7 @@ import json from pathlib import Path from typing import Any +from .lca import EPSILON, WorkloadProfile from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary @@ -30,6 +31,7 @@ def build_harness_context( study: StudySpec, window_summary: dict[str, Any], state: StudyState, + workload_profile: WorkloadProfile | None = None, ) -> dict[str, Any]: recent_diagnostics = _recent_trial_diagnostics(state) trial_profiles = _trial_profiles(study, recent_diagnostics) @@ -52,7 +54,7 @@ def build_harness_context( "feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.", "trial_policy": "Profile measured trials, rank bottleneck hypotheses, score generic candidate actions, and stop only when no useful measured hypothesis remains.", }, - "workload_lca_profile": _workload_lca_profile(window_summary), + "workload_lca_profile": _workload_lca_profile(window_summary, workload_profile), "recent_trial_diagnostics": recent_diagnostics, "trial_profiles": trial_profiles, "bottleneck_hypotheses": bottleneck_hypotheses, @@ -141,7 +143,12 @@ def render_harness_context(context: dict[str, Any]) -> str: return json.dumps(context, ensure_ascii=False, indent=2) -def _workload_lca_profile(window_summary: dict[str, Any]) -> dict[str, Any]: +def _workload_lca_profile( + window_summary: dict[str, Any], + workload_profile: WorkloadProfile | None = None, +) -> dict[str, Any]: + if workload_profile is not None: + return _canonical_lca_profile(workload_profile) prefix_cache = window_summary.get("prefix_cache") if not isinstance(prefix_cache, dict): prefix_cache = {} @@ -178,6 +185,54 @@ def _workload_lca_profile(window_summary: dict[str, Any]) -> dict[str, Any]: } +def _canonical_lca_profile(profile: WorkloadProfile) -> dict[str, Any]: + """Authoritative L-C-A block: the paper's 10-dim RobustScaler vector. + + Sourced from lca.WorkloadProfile so the prompt's L-C-A is the same metric + used for the workload-similarity computations, not an ad-hoc re-derivation. + The regime labels reuse the heuristic classifiers but are fed from the + canonical stats. + """ + stats = profile.stats if isinstance(profile.stats, dict) else {} + length = stats.get("length") if isinstance(stats.get("length"), dict) else {} + cache = stats.get("cache") if isinstance(stats.get("cache"), dict) else {} + arrival = stats.get("arrival") if isinstance(stats.get("arrival"), dict) else {} + length_p95 = _as_float(length.get("p95")) + length_p50 = _as_float(length.get("p50")) + tail_ratio = float(length_p95 / max(length_p50, EPSILON)) if length_p95 else 0.0 + repeated_token_ratio = _as_float(cache.get("input_hit_rate")) + fano_1s = _as_float(arrival.get("fano_1s")) + interarrival_cv = _as_float(arrival.get("interarrival_cv")) + return { + "metric": "paper L-C-A (10-dim, RobustScaler-normalized) from lca.WorkloadProfile", + "length_mode": profile.length_mode, + "feature_names": list(profile.feature_names), + "vector": list(profile.vector), + "L_request_lengths": { + "mean": _as_float(length.get("mean")), + "p50": length_p50, + "p95": length_p95, + "cv": _as_float(length.get("cv")), + "tail_ratio_p95_p50": tail_ratio, + "regime": _length_regime(length_p95, tail_ratio), + }, + "C_prefix_cache": { + "hit_rate": _as_float(cache.get("hit_rate")), + "input_hit_rate": repeated_token_ratio, + "repeated_block_ratio": _as_float(cache.get("repeated_block_ratio")), + "rows_with_hash_ids": int(cache.get("rows_with_hash_ids") or 0), + "regime": _cache_regime(repeated_token_ratio), + }, + "A_arrivals": { + "request_rate": _as_float(arrival.get("request_rate")), + "request_rate_per_gpu": _as_float(arrival.get("request_rate_per_gpu")), + "interarrival_cv": interarrival_cv, + "fano_1s": fano_1s, + "regime": _arrival_regime(fano_1s, interarrival_cv), + }, + } + + def _knob_harnesses( study: StudySpec, window_summary: dict[str, Any], diff --git a/src/aituner/lca.py b/src/aituner/lca.py index 21846ac..eb8549d 100644 --- a/src/aituner/lca.py +++ b/src/aituner/lca.py @@ -4,10 +4,13 @@ import json import math import statistics from dataclasses import dataclass -from typing import Any, Sequence +from typing import TYPE_CHECKING, Any, Sequence from .trace import TraceRequest, WindowRecord +if TYPE_CHECKING: + from .spec import StudySpec + EPSILON = 1e-9 @@ -178,6 +181,28 @@ def build_workload_profile( ) +def build_study_workload_profile( + study: "StudySpec", + requests: list[TraceRequest], + window: WindowRecord, +) -> WorkloadProfile: + """Canonical L-C-A profile for a study's loaded window. + + This is the single source of truth for the paper's 10-dimensional L-C-A + feature vector used by the harness prompt and (later) by Stop-A. + """ + mode = resolve_length_mode( + request_mode=study.trace.request_mode, + length_mode="auto", + ) + return build_workload_profile( + requests, + window, + gpu_count=study.hardware.gpu_count, + length_mode=mode, + ) + + def fit_robust_scale(profiles: Sequence[WorkloadProfile]) -> RobustScale: if not profiles: raise ValueError("At least one profile is required to fit a robust scale.") diff --git a/src/aituner/llm.py b/src/aituner/llm.py index 855fdb5..1a4dc9c 100644 --- a/src/aituner/llm.py +++ b/src/aituner/llm.py @@ -3,12 +3,15 @@ from __future__ import annotations import json import time from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any from .harness import build_harness_context, render_harness_context from .http_client import chat_completion, stream_text_completion from .spec import LLMPolicySpec, Proposal, SpecError, StudySpec, StudyState +if TYPE_CHECKING: + from .lca import WorkloadProfile + def _parse_bool_like(value: Any, *, context: str) -> bool: if isinstance(value, bool): @@ -178,6 +181,7 @@ def build_prompt( window_summary: dict[str, Any], state: StudyState, capability_profile: dict[str, Any] | None, + workload_profile: "WorkloadProfile | None" = None, ) -> str: objective_notes: list[str] = [] if study.trace.request_mode == "decode_only": @@ -409,6 +413,7 @@ def build_prompt( study=study, window_summary=window_summary, state=state, + workload_profile=workload_profile, ) ), "", diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index 1fd5494..f0aca0d 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -28,6 +28,7 @@ from aituner.harness import ( build_harness_stop_proposal, ) from aituner.lca import ( + build_study_workload_profile, build_workload_profile, profile_similarity, resolve_length_mode, @@ -298,6 +299,34 @@ class CoreFlowTests(unittest.TestCase): self.assertAlmostEqual(profile.stats["arrival"]["fano_1s"], 0.5) self.assertEqual(resolve_length_mode(request_mode="decode_only"), "output") + def test_harness_context_uses_canonical_lca_vector(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + study = load_study_spec(study_path) + window, requests = load_trace_requests(study, study_spec_path=study_path) + profile = build_study_workload_profile(study, requests, window) + state = StudyState(study_id=study.study_id, trials=[]) + summary = summarize_window(requests, window) + context = build_harness_context( + study=study, + window_summary=summary, + state=state, + workload_profile=profile, + ) + block = context["workload_lca_profile"] + # The labeled L-C-A block is the canonical 10-dim metric, not ad-hoc. + self.assertEqual(block["vector"], profile.vector) + self.assertEqual(len(block["vector"]), 10) + self.assertIn("RobustScaler", block["metric"]) + # Without a profile it falls back to the legacy ad-hoc rendering. + legacy = build_harness_context( + study=study, + window_summary=summary, + state=state, + )["workload_lca_profile"] + self.assertNotIn("vector", legacy) + def test_lca_similarity_matrix_separates_different_profiles(self) -> None: window = WindowRecord( window_id="base",