diff --git a/pyproject.toml b/pyproject.toml index 78269e1..ee278b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,8 +20,21 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["src"] +[dependency-groups] +# Pure-Python unit tests. Install via: +# uv sync --group test +# These tests deliberately import only the algorithm-layer modules +# (policies, trace, topology) so they run without SGLang / GPU / CUDA. +test = [ + "pytest>=8.0", +] + [tool.uv] prerelease = "allow" [tool.uv.sources] sglang = { path = "third_party/sglang/python", editable = true } + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-q" diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..a91160a --- /dev/null +++ b/tests/README.md @@ -0,0 +1,39 @@ +# Tests + +Pure-Python unit + property tests for the algorithm layer. These tests do +**not** import SGLang and do **not** need a GPU — they validate the routing +algorithm (Algorithm 1/2/3 in `docs/KVC_ROUTER_ALGORITHM.md`) and its +theorems against the pure functions extracted from `policies.py`. + +## Run + +```bash +uv sync --group test +uv run pytest +``` + +Or, without uv: + +```bash +pip install pytest +PYTHONPATH=src pytest tests +``` + +## Scope + +- `test_policy_scoring.py` — Algorithm 1 lex-score properties (overlap + dominates sticky, load-floor gating, tie-breakers). +- `test_no_starvation.py` — Theorem 1: bounded retries before some D either + accepts or the least-rejected D is forced through the degenerate path. + +Future: +- block-level eviction `MockRadixCache` tests (see + `docs/BLOCK_LEVEL_EVICTION_DESIGN_ZH.md` §5). +- D→P sync `staleness_budget` property tests (see + `docs/D_TO_P_SYNC_CONTRACT_ZH.md` §1). + +## Why no integration tests here + +Anything that needs SGLang, mooncake, or a real model is an integration +test and must run on hardware. Those tests live as `scripts/sweep_*.sh` +under the evaluation protocol in `docs/EVALUATION_PROTOCOL_ZH.md`. 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_policy_scoring.py b/tests/test_policy_scoring.py
new file mode 100644
index 0000000..2e1f614
--- /dev/null
+++ b/tests/test_policy_scoring.py
@@ -0,0 +1,190 @@
+"""Unit tests for Algorithm 1 (KvAwarePolicy score_candidate).
+
+Reference: docs/KVC_ROUTER_ALGORITHM.md §3.1. The lex-score is
+
+    (overlap + sticky_bonus*sticky + floor_bonus,
+     sticky,
+     -inflight,
+     -assigned)
+
+These tests pin down the qualitative properties that the algorithm's
+correctness arguments rely on. They run without SGLang/GPU.
+"""
+
+from __future__ import annotations
+
+from agentic_pd_hybrid.policies import score_candidate
+
+
+def _score(**overrides):
+    """Helper: build a score with all defaults and per-test overrides."""
+    args = dict(
+        overlap=0,
+        sticky=False,
+        inflight=0,
+        assigned=0,
+        mean_assigned=0.0,
+        sticky_bonus=1,
+        load_floor_bonus=0,
+    )
+    args.update(overrides)
+    return score_candidate(**args)
+
+
+# -- Determinism ----------------------------------------------------------------
+
+
+def test_score_is_pure():
+    """Same kwargs must produce the same tuple (no hidden state)."""
+    a = _score(overlap=3, sticky=True, inflight=1, assigned=7)
+    b = _score(overlap=3, sticky=True, inflight=1, assigned=7)
+    assert a == b
+
+
+def test_score_returns_4_tuple():
+    """The score is a plain 4-tuple whose elements are all ints."""
+    s = _score()
+    assert isinstance(s, tuple)
+    assert len(s) == 4
+    assert all(isinstance(x, int) for x in s)
+
+
+# -- Primary term: overlap dominates sticky --------------------------------------
+
+
+def test_overlap_strictly_dominates_pure_sticky():
+    """Theorem-2 building block: any positive overlap on a non-sticky D wins
+    against a sticky-only D with zero overlap (sticky_bonus=1)."""
+    overlap = _score(overlap=2, sticky=False)
+    sticky_only = _score(overlap=0, sticky=True)
+    assert overlap > sticky_only
+
+
+def test_overlap_plus_sticky_beats_overlap_alone():
+    """Two D's with equal overlap: sticky one wins (sticky_bonus contributes
+    to primary AND wins tie-1)."""
+    sticky_d = _score(overlap=5, sticky=True)
+    fresh_d = _score(overlap=5, sticky=False)
+    assert sticky_d > fresh_d
+
+
+# -- Tie breakers ----------------------------------------------------------------
+
+
+def test_tiebreaker_inflight_lower_wins():
+    """Equal primary & sticky: prefer the D with fewer in-flight requests."""
+    low = _score(overlap=3, sticky=False, inflight=0, assigned=10)
+    high = _score(overlap=3, sticky=False, inflight=5, assigned=10)
+    assert low > high
+
+
+def test_tiebreaker_assigned_lower_wins():
+    """Equal primary & sticky & inflight: prefer rarely-picked D."""
+    rare = _score(overlap=3, sticky=False, inflight=2, assigned=1)
+    frequent = _score(overlap=3, sticky=False, inflight=2, assigned=99)
+    assert rare > frequent
+
+
+def test_tiebreaker_strict_lex_order():
+    """Sticky always beats non-sticky on tie-1 even if non-sticky has lower
+    inflight (the lex order is strict, position 1 outranks positions 2/3)."""
+    # sticky_bonus=1 is added to position 0, so equal overlaps would let the
+    # sticky D win on the primary term (5 > 4) before tie-1 is ever consulted.
+    # Give the sticky D one less overlap so the primaries tie at 4.
+    sticky_busy = _score(overlap=3, sticky=True, inflight=10, assigned=10)
+    fresh_idle = _score(overlap=4, sticky=False, inflight=0, assigned=0)
+    # Guard the premise: if the primaries differ, the comparison below would
+    # not be exercising the tie-1 position at all.
+    assert sticky_busy[0] == fresh_idle[0]
+    assert sticky_busy > fresh_idle
+
+
+# -- Load-floor bonus ------------------------------------------------------------
+
+
+def test_load_floor_disabled_by_default():
+    """load_floor_bonus=0 → no contribution to primary."""
+    s = _score(overlap=0, sticky=False, mean_assigned=10, assigned=0)
+    assert s[0] == 0
+
+
+def test_load_floor_gated_off_when_sticky():
+    """Even with load_floor_bonus>0, sticky D does NOT receive the boost.
+    Otherwise a session would migrate away from its warm D under load."""
+    sticky_under_loaded = _score(
+        overlap=0, sticky=True, mean_assigned=10, assigned=0, load_floor_bonus=200
+    )
+    # primary = overlap(0) + sticky_bonus(1) + floor(0) = 1
+    assert sticky_under_loaded[0] == 1
+
+
+def test_load_floor_zero_when_mean_zero():
+    """Warmup case: mean_assigned=0 -> no D gets boost -> degenerate to lex
+    tiebreak by iteration order."""
+    s = _score(
+        overlap=0, sticky=False, mean_assigned=0, assigned=0, load_floor_bonus=200
+    )
+    assert s[0] == 0
+
+
+def test_load_floor_proportional_to_deficit():
+    """floor_bonus = K * deficit / mean. assigned=0, mean=10, K=200 -> 200."""
+    s_zero = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=0, load_floor_bonus=200
+    )
+    s_half = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=5, load_floor_bonus=200
+    )
+    s_full = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=10, load_floor_bonus=200
+    )
+    # deficit = max(0, 10-0)=10 -> bonus = int(200*10/10) = 200
+    # deficit = max(0, 10-5)=5 -> bonus = int(200*5/10) = 100
+    # deficit = max(0, 10-10)=0 -> bonus = 0
+    assert s_zero[0] == 200
+    assert s_half[0] == 100
+    assert s_full[0] == 0
+
+
+def test_load_floor_does_not_underflow_when_overloaded():
+    """assigned > mean -> deficit clamped to 0, no negative bonus."""
+    s = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=50, load_floor_bonus=200
+    )
+    assert s[0] == 0
+
+
+# -- Routing intent: real overlap beats load-floor bonus -------------------------
+
+
+def test_real_prefix_overlap_beats_load_floor_on_warm_d():
+    """E1_E2_FIX_DESIGN_ZH §Q2: load_floor should be set such that
+    real per-session prefix overlap outweighs the cold-D bonus.
+    With overlap=800 (a per-session prefix) and load_floor_bonus=200,
+    a warm D (high overlap, possibly high load) should still win against
+    a cold D with floor bonus."""
+    warm = _score(
+        overlap=800, sticky=True, mean_assigned=10, assigned=10, load_floor_bonus=200
+    )
+    cold = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=0, load_floor_bonus=200
+    )
+    # warm primary = 800 + 1 + 0 = 801. cold primary = 0 + 0 + 200 = 200.
+    assert warm[0] == 801
+    assert cold[0] == 200
+    assert warm > cold
+
+
+def test_boilerplate_overlap_loses_to_load_floor_for_cold_d():
+    """Same §Q2: load_floor should beat cross-session boilerplate overlap.
+    If load_floor_bonus=200 and the worst-case boilerplate overlap is ~50,
+    a fresh cold D should still win against a slightly-warm-from-boilerplate D."""
+    warm_boilerplate = _score(
+        overlap=50, sticky=False, mean_assigned=10, assigned=10, load_floor_bonus=200
+    )
+    cold_under_loaded = _score(
+        overlap=0, sticky=False, mean_assigned=10, assigned=0, load_floor_bonus=200
+    )
+    # warm_boilerplate primary = 50 + 0 + 0 = 50 (assigned=mean, no deficit).
+    # cold_under_loaded primary = 0 + 0 + 200 = 200.
+    assert cold_under_loaded > warm_boilerplate