Unify harness L-C-A on the canonical lca.WorkloadProfile

Phase 0 of the two-stop work. The prompt block labeled `workload_lca_profile` previously re-derived L-C-A from summarize_window's ad-hoc percentiles, diverging from the paper's 10-dim RobustScaler vector implemented in lca.py. Make that block authoritative: build_harness_context now accepts an optional workload_profile and renders the canonical 10-dim vector + per-family stats when present, falling back to the legacy rendering only when no profile is supplied (direct unit-test calls). Real call sites (study prompt/llm-propose/tune, run_baseline_then_llm) build the profile via lca.build_study_workload_profile and pass it through build_prompt. The heuristic regime classifiers keep reading window_summary; that is the heuristic layer, distinct from the similarity metric. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 14:12:17 +08:00
parent 8b4116fad0
commit 6f8e3c95c1
6 changed files with 126 additions and 4 deletions
--- a/scripts/run_baseline_then_llm.py
+++ b/scripts/run_baseline_then_llm.py
@@ -10,6 +10,7 @@ from aituner.llm import (
    load_capability_profile,
    parse_proposal_text,
 )
 from aituner.lca import build_study_workload_profile
 from aituner.spec import load_study_spec
 from aituner.store import StudyStore
 from aituner.trace import load_trace_requests, summarize_window
@@ -89,6 +90,7 @@ def main() -> int:
            window_summary=summarize_window(requests, window),
            state=state,
            capability_profile=capability_profile,
            workload_profile=build_study_workload_profile(study, requests, window),
        )
        prompt_name = f"prompt-{state.next_trial_index:04d}"
        store.write_prompt(study.study_id, prompt_name, prompt)
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -14,6 +14,7 @@ from .harness import (
 )
 from .job import append_job, build_trial_job
 from .lca import (
    build_study_workload_profile,
    build_workload_profile,
    resolve_length_mode,
    similarity_report,
@@ -140,6 +141,7 @@ def cmd_study_prompt(args: argparse.Namespace) -> int:
        window_summary=summarize_window(requests, window),
        state=state,
        capability_profile=capability_profile,
        workload_profile=build_study_workload_profile(study, requests, window),
    )
    prompt_name = args.prompt_name or f"prompt-{state.next_trial_index:04d}"
    path = store.write_prompt(study.study_id, prompt_name, prompt)
@@ -160,6 +162,7 @@ def cmd_study_llm_propose(args: argparse.Namespace) -> int:
        window_summary=summarize_window(requests, window),
        state=state,
        capability_profile=capability_profile,
        workload_profile=build_study_workload_profile(study, requests, window),
    )
    proposal_text = call_llm_for_proposal(
        policy=study.llm,
@@ -242,11 +245,13 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
            break
        window, requests = load_trace_requests(study, study_spec_path=spec_path)
        window_summary = summarize_window(requests, window)
        workload_profile = build_study_workload_profile(study, requests, window)
        harness_context = (
            build_harness_context(
                study=study,
                window_summary=window_summary,
                state=state,
                workload_profile=workload_profile,
            )
            if study.llm.use_harness
            else None
@@ -256,6 +261,7 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
            window_summary=window_summary,
            state=state,
            capability_profile=capability_profile,
            workload_profile=workload_profile,
        )
        prompt_name = f"prompt-{state.next_trial_index:04d}"
        store.write_prompt(study.study_id, prompt_name, prompt)
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -4,6 +4,7 @@ import json
 from pathlib import Path
 from typing import Any
 from .lca import EPSILON, WorkloadProfile
 from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary
@@ -30,6 +31,7 @@ def build_harness_context(
    study: StudySpec,
    window_summary: dict[str, Any],
    state: StudyState,
    workload_profile: WorkloadProfile | None = None,
 ) -> dict[str, Any]:
    recent_diagnostics = _recent_trial_diagnostics(state)
    trial_profiles = _trial_profiles(study, recent_diagnostics)
@@ -52,7 +54,7 @@ def build_harness_context(
            "feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.",
            "trial_policy": "Profile measured trials, rank bottleneck hypotheses, score generic candidate actions, and stop only when no useful measured hypothesis remains.",
        },
-        "workload_lca_profile": _workload_lca_profile(window_summary),
+        "workload_lca_profile": _workload_lca_profile(window_summary, workload_profile),
        "recent_trial_diagnostics": recent_diagnostics,
        "trial_profiles": trial_profiles,
        "bottleneck_hypotheses": bottleneck_hypotheses,
@@ -141,7 +143,12 @@ def render_harness_context(context: dict[str, Any]) -> str:
    return json.dumps(context, ensure_ascii=False, indent=2)
-def _workload_lca_profile(window_summary: dict[str, Any]) -> dict[str, Any]:
+def _workload_lca_profile(
    window_summary: dict[str, Any],
    workload_profile: WorkloadProfile | None = None,
 ) -> dict[str, Any]:
    if workload_profile is not None:
        return _canonical_lca_profile(workload_profile)
    prefix_cache = window_summary.get("prefix_cache")
    if not isinstance(prefix_cache, dict):
        prefix_cache = {}
@@ -178,6 +185,54 @@ def _workload_lca_profile(window_summary: dict[str, Any]) -> dict[str, Any]:
    }
 def _canonical_lca_profile(profile: WorkloadProfile) -> dict[str, Any]:
    """Render the prompt's ``workload_lca_profile`` block from a WorkloadProfile.

    The feature vector, feature names and length mode are copied straight from
    ``profile``; the per-family L / C / A summaries are read from
    ``profile.stats`` (keys ``length``, ``cache`` and ``arrival``). Any missing
    or non-dict family is treated as empty, so every numeric field is whatever
    ``_as_float`` yields for ``None`` in that case.

    The regime labels come from the shared heuristic classifiers
    (``_length_regime``, ``_cache_regime``, ``_arrival_regime``), but their
    inputs are taken from the profile's stats instead of the window summary.

    NOTE(review): the "metric" label says the vector is RobustScaler-normalized;
    this function copies ``profile.vector`` unchanged, so confirm the
    normalization happens upstream (or that the label is intended).
    """
    raw_stats = profile.stats
    if not isinstance(raw_stats, dict):
        raw_stats = {}

    def _family(name: str) -> dict[str, Any]:
        # Tolerate a missing or malformed family by substituting an empty dict.
        section = raw_stats.get(name)
        return section if isinstance(section, dict) else {}

    length_stats = _family("length")
    cache_stats = _family("cache")
    arrival_stats = _family("arrival")

    p95 = _as_float(length_stats.get("p95"))
    p50 = _as_float(length_stats.get("p50"))
    if p95:
        # EPSILON keeps the division defined when the median is zero.
        p95_over_p50 = float(p95 / max(p50, EPSILON))
    else:
        p95_over_p50 = 0.0
    input_hit_rate = _as_float(cache_stats.get("input_hit_rate"))
    fano = _as_float(arrival_stats.get("fano_1s"))
    gap_cv = _as_float(arrival_stats.get("interarrival_cv"))

    rendered: dict[str, Any] = {
        "metric": "paper L-C-A (10-dim, RobustScaler-normalized) from lca.WorkloadProfile",
        "length_mode": profile.length_mode,
        "feature_names": list(profile.feature_names),
        "vector": list(profile.vector),
    }
    rendered["L_request_lengths"] = {
        "mean": _as_float(length_stats.get("mean")),
        "p50": p50,
        "p95": p95,
        "cv": _as_float(length_stats.get("cv")),
        "tail_ratio_p95_p50": p95_over_p50,
        "regime": _length_regime(p95, p95_over_p50),
    }
    rendered["C_prefix_cache"] = {
        "hit_rate": _as_float(cache_stats.get("hit_rate")),
        "input_hit_rate": input_hit_rate,
        "repeated_block_ratio": _as_float(cache_stats.get("repeated_block_ratio")),
        # A missing or falsy count is reported as 0.
        "rows_with_hash_ids": int(cache_stats.get("rows_with_hash_ids") or 0),
        "regime": _cache_regime(input_hit_rate),
    }
    rendered["A_arrivals"] = {
        "request_rate": _as_float(arrival_stats.get("request_rate")),
        "request_rate_per_gpu": _as_float(arrival_stats.get("request_rate_per_gpu")),
        "interarrival_cv": gap_cv,
        "fano_1s": fano,
        "regime": _arrival_regime(fano, gap_cv),
    }
    return rendered
 def _knob_harnesses(
    study: StudySpec,
    window_summary: dict[str, Any],
--- a/src/aituner/lca.py
+++ b/src/aituner/lca.py
@@ -4,10 +4,13 @@ import json
 import math
 import statistics
 from dataclasses import dataclass
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 from .trace import TraceRequest, WindowRecord
 if TYPE_CHECKING:
    from .spec import StudySpec
 EPSILON = 1e-9
@@ -178,6 +181,28 @@ def build_workload_profile(
    )
 def build_study_workload_profile(
    study: "StudySpec",
    requests: list[TraceRequest],
    window: WindowRecord,
 ) -> WorkloadProfile:
    """Build the L-C-A WorkloadProfile for the window a study has loaded.

    The length mode is resolved automatically from the study's trace request
    mode, and the GPU count is taken from the study's hardware section; both
    are then forwarded to ``build_workload_profile`` together with the
    requests and window. Intended as the one shared entry point for callers
    that need the study-level profile (e.g. the harness prompt).
    """
    resolved_mode = resolve_length_mode(
        length_mode="auto",
        request_mode=study.trace.request_mode,
    )
    profile = build_workload_profile(
        requests,
        window,
        length_mode=resolved_mode,
        gpu_count=study.hardware.gpu_count,
    )
    return profile
 def fit_robust_scale(profiles: Sequence[WorkloadProfile]) -> RobustScale:
    if not profiles:
        raise ValueError("At least one profile is required to fit a robust scale.")
--- a/src/aituner/llm.py
+++ b/src/aituner/llm.py
@@ -3,12 +3,15 @@ from __future__ import annotations
 import json
 import time
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from .harness import build_harness_context, render_harness_context
 from .http_client import chat_completion, stream_text_completion
 from .spec import LLMPolicySpec, Proposal, SpecError, StudySpec, StudyState
 if TYPE_CHECKING:
    from .lca import WorkloadProfile
 def _parse_bool_like(value: Any, *, context: str) -> bool:
    if isinstance(value, bool):
@@ -178,6 +181,7 @@ def build_prompt(
    window_summary: dict[str, Any],
    state: StudyState,
    capability_profile: dict[str, Any] | None,
    workload_profile: "WorkloadProfile | None" = None,
 ) -> str:
    objective_notes: list[str] = []
    if study.trace.request_mode == "decode_only":
@@ -409,6 +413,7 @@ def build_prompt(
                    study=study,
                    window_summary=window_summary,
                    state=state,
                    workload_profile=workload_profile,
                )
            ),
            "",
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -28,6 +28,7 @@ from aituner.harness import (
    build_harness_stop_proposal,
 )
 from aituner.lca import (
    build_study_workload_profile,
    build_workload_profile,
    profile_similarity,
    resolve_length_mode,
@@ -298,6 +299,34 @@ class CoreFlowTests(unittest.TestCase):
        self.assertAlmostEqual(profile.stats["arrival"]["fano_1s"], 0.5)
        self.assertEqual(resolve_length_mode(request_mode="decode_only"), "output")
    def test_harness_context_uses_canonical_lca_vector(self) -> None:
        """The harness L-C-A block mirrors the supplied WorkloadProfile.

        With a profile, ``workload_lca_profile`` must expose the profile's
        10-element vector and a metric label mentioning RobustScaler; without
        one, the block must fall back to a rendering that has no ``vector``.
        """
        with tempfile.TemporaryDirectory() as scratch:
            spec_path = _write_study_assets(Path(scratch))
            study = load_study_spec(spec_path)
            window, requests = load_trace_requests(study, study_spec_path=spec_path)
            canonical = build_study_workload_profile(study, requests, window)
            empty_state = StudyState(study_id=study.study_id, trials=[])
            # Arguments shared by both the canonical and the fallback call.
            shared_kwargs = {
                "study": study,
                "window_summary": summarize_window(requests, window),
                "state": empty_state,
            }

            with_profile = build_harness_context(
                **shared_kwargs,
                workload_profile=canonical,
            )
            lca_block = with_profile["workload_lca_profile"]
            # The labeled L-C-A block is the canonical 10-dim metric, not ad-hoc.
            self.assertEqual(lca_block["vector"], canonical.vector)
            self.assertEqual(len(lca_block["vector"]), 10)
            self.assertIn("RobustScaler", lca_block["metric"])

            # Without a profile it falls back to the legacy ad-hoc rendering.
            without_profile = build_harness_context(**shared_kwargs)
            fallback_block = without_profile["workload_lca_profile"]
            self.assertNotIn("vector", fallback_block)
    def test_lca_similarity_matrix_separates_different_profiles(self) -> None:
        window = WindowRecord(
            window_id="base",