test(policy): unit tests for Algorithm 1 lex scoring

Adds the project's first test suite. Covers the score_candidate() pure
function from the previous refactor commit, validating the qualitative
properties that KVC_ROUTER_ALGORITHM.md §3.1 and §4.2 rely on.

Tests / properties:
- determinism: same args -> same tuple
- shape: 4-int tuple
- primary term: overlap dominates pure sticky
- primary term: sticky_bonus credited
- tie-2 inflight: lower wins
- tie-3 assigned: lower wins
- strict lex order: sticky wins position-1 over fresh-idle
- load_floor disabled by default
- load_floor gated off when sticky=True
- load_floor zero during warmup (mean=0)
- load_floor proportional to deficit (200/100/0 at 0/50/100% load)
- load_floor does not underflow when overloaded
- real per-session overlap beats load_floor on warm D
- boilerplate overlap loses to load_floor on cold D (the cold-D fix from E1_E2_FIX_DESIGN §Q2)

Test infrastructure:
- tests/ package with README explaining the GPU-free scope and the run instruction
- pyproject.toml [dependency-groups] test = [pytest>=8] (install via `uv sync --group test`)
- pyproject.toml [tool.pytest.ini_options] sets testpaths

Verified locally: 14/14 passing under pytest 9.0.3 in an isolated 3.13 venv. No SGLang / GPU touched.
2026-05-12 23:54:48 +08:00
parent 76a79dfdda
commit a785b83023
4 changed files with 241 additions and 0 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,8 +20,21 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["src"]

+[dependency-groups]
+# Pure-Python unit tests. Install via:
+#   uv sync --group test
+# These tests deliberately import only the algorithm-layer modules
+# (policies, trace, topology) so they run without SGLang / GPU / CUDA.
+test = [
+    "pytest>=8.0",
+]
+
 [tool.uv]
 prerelease = "allow"

 [tool.uv.sources]
 sglang = { path = "third_party/sglang/python", editable = true }
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"
--- a/tests/README.md
+++ b/tests/README.md
@@ -0,0 +1,39 @@
+# Tests
+
+Pure-Python unit + property tests for the algorithm layer. These tests do
+**not** import SGLang and do **not** need a GPU — they validate the routing
+algorithm (Algorithm 1/2/3 in `docs/KVC_ROUTER_ALGORITHM.md`) and its
+theorems against the pure functions extracted from `policies.py`.
+
+## Run
+
+```bash
+uv sync --group test
+uv run pytest
+```
+
+Or, without uv:
+
+```bash
+pip install pytest
+PYTHONPATH=src pytest tests
+```
+
+## Scope
+
+- `test_policy_scoring.py` — Algorithm 1 lex-score properties (overlap
+  dominates sticky, load-floor gating, tie-breakers).
+- `test_no_starvation.py` (planned; not added yet) — Theorem 1: bounded retries
+  before some D either accepts or the least-rejected D is forced through the degenerate path.
+
+Future:
+- block-level eviction `MockRadixCache` tests (see
+  `docs/BLOCK_LEVEL_EVICTION_DESIGN_ZH.md` §5).
+- D→P sync `staleness_budget` property tests (see
+  `docs/D_TO_P_SYNC_CONTRACT_ZH.md` §1).
+
+## Why no integration tests here
+
+Anything that needs SGLang, mooncake, or a real model is an integration
+test and must run on hardware. Those tests live as `scripts/sweep_*.sh`
+under the evaluation protocol in `docs/EVALUATION_PROTOCOL_ZH.md`.
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/test_policy_scoring.py
+++ b/tests/test_policy_scoring.py
@@ -0,0 +1,189 @@
+"""Unit tests for Algorithm 1 (KvAwarePolicy score_candidate).
+
+Reference: docs/KVC_ROUTER_ALGORITHM.md §3.1. The lex-score is
+
+    (overlap + sticky_bonus*sticky + floor_bonus,
+     sticky,
+     -inflight,
+     -assigned)
+
+These tests pin down the qualitative properties that the algorithm's
+correctness arguments rely on. They run without SGLang/GPU.
+"""
+
+from __future__ import annotations
+
+from agentic_pd_hybrid.policies import score_candidate
+
+
+def _score(**overrides):
+    """Call score_candidate with neutral defaults, overridden per test."""
+    defaults = {
+        "overlap": 0,
+        "sticky": False,
+        "inflight": 0,
+        "assigned": 0,
+        "mean_assigned": 0.0,
+        "sticky_bonus": 1,
+        "load_floor_bonus": 0,
+    }
+    # Later keys win, so any override replaces the matching default.
+    return score_candidate(**{**defaults, **overrides})
+
+
+# -- Determinism ----------------------------------------------------------------
+
+
+def test_score_is_pure():
+    """Repeated calls with identical kwargs yield identical tuples."""
+    kwargs = dict(overlap=3, sticky=True, inflight=1, assigned=7)
+    first, second = _score(**kwargs), _score(**kwargs)
+    assert first == second
+
+
+def test_score_returns_4_tuple():
+    """The default score is a plain tuple holding exactly four ints."""
+    result = _score()
+    assert isinstance(result, tuple) and len(result) == 4
+    assert all(isinstance(component, int) for component in result)
+
+
+# -- Primary term: overlap dominates sticky --------------------------------------
+
+
+def test_overlap_strictly_dominates_pure_sticky():
+    """Theorem-2 building block: a non-sticky D whose overlap exceeds
+    sticky_bonus (2 > 1) beats a sticky-only D with zero overlap."""
+    sticky_only = _score(overlap=0, sticky=True)
+    assert _score(overlap=2, sticky=False) > sticky_only
+    assert _score(overlap=1, sticky=False) < sticky_only  # equal primary: sticky wins
+
+
+def test_overlap_plus_sticky_beats_overlap_alone():
+    """At equal overlap the sticky D must rank first (sticky_bonus raises
+    its primary term, and the sticky flag is also tie-breaker 1)."""
+    # Score the same candidate twice, differing only in stickiness.
+    scores = {flag: _score(overlap=5, sticky=flag) for flag in (True, False)}
+    assert scores[True] > scores[False]
+
+
+# -- Tie breakers ----------------------------------------------------------------
+
+
+def test_tiebreaker_inflight_lower_wins():
+    """With primary and sticky tied, fewer in-flight requests ranks higher."""
+    common = dict(overlap=3, sticky=False, assigned=10)
+    idle, busy = _score(inflight=0, **common), _score(inflight=5, **common)
+    assert idle > busy
+
+
+def test_tiebreaker_assigned_lower_wins():
+    """Primary, sticky and inflight all tied: the rarely-picked D ranks first."""
+    tied = dict(overlap=3, sticky=False, inflight=2)
+    by_assigned = {n: _score(assigned=n, **tied) for n in (1, 99)}
+    assert by_assigned[1] > by_assigned[99]
+
+
+def test_tiebreaker_strict_lex_order():
+    """Sticky beats non-sticky on tie-1 even when the non-sticky D has lower
+    inflight and assigned (lex order: position 1 outranks positions 2/3)."""
+    # sticky_bonus=1 is added to position 0, so giving both D's the same
+    # overlap would let the sticky D win on the primary term instead of on
+    # tie-1. Give the sticky D one less overlap so the primaries are equal.
+    sticky_busy = _score(overlap=3, sticky=True, inflight=10, assigned=10)
+    fresh_idle = _score(overlap=4, sticky=False, inflight=0, assigned=0)
+    # Guard the premise: equal primary (3 + 1 == 4), otherwise the final
+    # assertion would not be exercising position 1 at all.
+    assert sticky_busy[0] == fresh_idle[0] == 4
+    assert sticky_busy > fresh_idle
+
+
+# -- Load-floor bonus ------------------------------------------------------------
+
+
+def test_load_floor_disabled_by_default():
+    """With load_floor_bonus left at 0 an under-loaded D gets no primary boost."""
+    primary, *_ = _score(overlap=0, sticky=False, mean_assigned=10, assigned=0)
+    assert primary == 0
+
+
+def test_load_floor_gated_off_when_sticky():
+    """A sticky D gets no load-floor boost even when load_floor_bonus>0.
+    Otherwise a session would migrate away from its warm D under load."""
+    kwargs = dict(overlap=0, sticky=True, mean_assigned=10, assigned=0)
+    primary = _score(load_floor_bonus=200, **kwargs)[0]
+    # Expected primary: overlap (0) + sticky_bonus (1) + floor (0).
+    expected_primary = 0 + 1 + 0
+    assert primary == expected_primary
+
+
+def test_load_floor_zero_when_mean_zero():
+    """Warmup case: with mean_assigned=0 no D gets a boost, so ranking
+    degenerates to the lex tiebreak."""
+    warmup = dict(mean_assigned=0, assigned=0, load_floor_bonus=200)
+    primary = _score(overlap=0, sticky=False, **warmup)[0]
+    # Expect no boost at all: the primary stays at the bare overlap (0).
+    assert primary == 0
+
+
+def test_load_floor_proportional_to_deficit():
+    """floor_bonus = K * deficit / mean. With K=200 and mean=10 the bonus
+    falls 200 -> 100 -> 0 as assigned rises 0 -> 5 -> 10."""
+    # assigned -> expected primary (== floor bonus, since overlap=0):
+    #   deficit = max(0, 10-0)  = 10 -> int(200*10/10) = 200
+    #   deficit = max(0, 10-5)  = 5  -> int(200*5/10)  = 100
+    #   deficit = max(0, 10-10) = 0  -> 0
+    expected_by_assigned = {0: 200, 5: 100, 10: 0}
+    for assigned, expected in expected_by_assigned.items():
+        primary = _score(
+            overlap=0,
+            sticky=False,
+            mean_assigned=10,
+            assigned=assigned,
+            load_floor_bonus=200,
+        )[0]
+        assert primary == expected
+
+
+def test_load_floor_does_not_underflow_when_overloaded():
+    """An over-loaded D (assigned > mean) has its deficit clamped to 0, so
+    the floor bonus never turns negative."""
+    overloaded = dict(mean_assigned=10, assigned=50, load_floor_bonus=200)
+    primary, *_ = _score(overlap=0, sticky=False, **overloaded)
+    assert primary == 0
+
+
+# -- Routing intent: real overlap beats load-floor bonus -------------------------
+
+
+def test_real_prefix_overlap_beats_load_floor_on_warm_d():
+    """E1_E2_FIX_DESIGN_ZH §Q2: load_floor should be set such that real
+    per-session prefix overlap outweighs the cold-D bonus.
+    Here overlap=800 stands for a per-session prefix and load_floor_bonus
+    is 200; the warm D (high overlap, possibly high load) must still rank
+    above a cold D that collects the floor bonus."""
+    # Settings shared by both candidates.
+    load = dict(mean_assigned=10, load_floor_bonus=200)
+    warm_score = _score(overlap=800, sticky=True, assigned=10, **load)
+    cold_score = _score(overlap=0, sticky=False, assigned=0, **load)
+    # warm primary = overlap 800 + sticky_bonus 1 + floor 0 = 801
+    # cold primary = overlap 0 + sticky_bonus 0 + floor 200 = 200
+    warm_primary, cold_primary = warm_score[0], cold_score[0]
+    assert warm_primary == 801
+    assert cold_primary == 200
+    assert warm_score > cold_score
+
+
+def test_boilerplate_overlap_loses_to_load_floor_for_cold_d():
+    """Same §Q2: load_floor should beat cross-session boilerplate overlap.
+    If load_floor_bonus=200 and the worst-case boilerplate overlap is ~50,
+    a fresh cold D should still win against a slightly-warm-from-boilerplate D."""
+    load = dict(sticky=False, mean_assigned=10, load_floor_bonus=200)
+    warm_boilerplate = _score(overlap=50, assigned=10, **load)
+    cold_under_loaded = _score(overlap=0, assigned=0, **load)
+    # Pin both primaries so the ranking is decided on position 0 for the
+    # intended reason rather than by a later tie-breaker.
+    # warm_boilerplate: 50 + 0 + 0 (assigned == mean, so no deficit).
+    assert warm_boilerplate[0] == 50
+    assert cold_under_loaded[0] == 200  # 0 + 0 + 200 (full deficit)
+    assert cold_under_loaded > warm_boilerplate