From 0701f84c00e9fa7120882d85c73325f5bc0e5ccb Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sat, 23 May 2026 21:07:14 +0800
Subject: [PATCH] tests: add minimal coverage for percentile + proxy routing
 (S1)

- tests/test_metrics.py asserts the new linear-interp _percentile against
  hand-computed expected values (single value, two-value interpolation,
  endpoints, numpy-equivalent linear default, on-integer rank).
- tests/test_proxy_pick.py exercises InstanceState LRU eviction and
  move-to-end on hit, plus session-affinity stickiness, the overload
  fallback, the active_p_offloads penalty, and lmetric scoring. The
  proxy is loaded by file path with stub fastapi/uvicorn/httpx modules
  so the suite runs without the FastAPI server deps installed.
- pyproject.toml gets a hatchling wheel target and a
  [tool.pytest.ini_options] section so `uv run --extra dev pytest`
  works out of the box.
---
 pyproject.toml           |   7 ++
 tests/__init__.py        |   0
 tests/test_metrics.py    |  44 ++++++++++
 tests/test_proxy_pick.py | 180 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 231 insertions(+)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_metrics.py
 create mode 100644 tests/test_proxy_pick.py

diff --git a/pyproject.toml b/pyproject.toml
index dd44cb6..0f7c9ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,3 +14,10 @@ dev = ["pytest"]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["replayer"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-q"
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..70bb607
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,44 @@
+"""Tests for the replayer.metrics _percentile helper (B5)."""
+
+from __future__ import annotations
+
+import math
+
+from replayer.metrics import _percentile
+
+
+def test_percentile_single_value():
+    for q in (0.50, 0.99):  # a lone sample is its own percentile at any q
+        assert _percentile([42.0], q) == 42.0
+
+
+def test_percentile_two_values_interpolates():
+    """Linear interpolation over [0, 10] yields p50 = 5.0 and p90 = 9.0."""
+    for q, want in ((0.50, 5.0), (0.90, 9.0)):
+        assert math.isclose(_percentile([0.0, 10.0], q), want)
+
+
+def test_percentile_endpoints():
+    """The 0th and 100th percentiles are the minimum and the maximum."""
+    sample = [1.0, 2.0, 3.0, 4.0, 5.0]
+    assert (_percentile(sample, 0.0), _percentile(sample, 1.0)) == (1.0, 5.0)
+
+
+def test_percentile_matches_numpy_linear_default():
+    """Expectations match numpy's default linear rule; hardcoded to avoid numpy."""
+    powers = [1.0, 2.0, 4.0, 8.0, 16.0, 32.0]
+    expected = {  # rank = q * (n - 1), n = 6
+        0.50: 6.0,   # rank 2.5  -> 0.5 * 4 + 0.5 * 8
+        0.90: 24.0,  # rank 4.5  -> 0.5 * 16 + 0.5 * 32
+        0.99: 31.2,  # rank 4.95 -> 0.05 * 16 + 0.95 * 32
+    }
+    for q, want in expected.items():
+        assert math.isclose(_percentile(powers, q), want)
+
+
+def test_percentile_no_off_by_one_at_boundary():
+    """Regression guard: a rank landing exactly on an integer index.
+
+    The earlier round-based code picked the wrong element; here the p50
+    rank is 0.5 * 2 = 1.0, so the answer is exactly element 1 (20.0)."""
+    assert _percentile([10.0, 20.0, 30.0], 0.50) == 20.0
diff --git a/tests/test_proxy_pick.py b/tests/test_proxy_pick.py
new file mode 100644
index 0000000..efabc9d
--- /dev/null
+++ b/tests/test_proxy_pick.py
@@ -0,0 +1,180 @@
+"""Minimal coverage for scripts/cache_aware_proxy pick_instance + cache LRU (S1)."""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+from pathlib import Path
+
+import pytest
+
+PROXY_PATH = Path(__file__).resolve().parent.parent / "scripts" / "cache_aware_proxy.py"
+
+
+def _install_stub_modules() -> None:
+    """Register stand-ins for fastapi/uvicorn/httpx when they are unavailable.
+
+    Only names that are neither imported nor importable get a stub, so an
+    installed package is never shadowed for the rest of the session."""
+    def _missing(name: str) -> bool:  # sys.modules first: find_spec() raises on stubs
+        return name not in sys.modules and importlib.util.find_spec(name) is None
+
+    def _passthrough(self, *a, **kw):  # route decorator: handler returned as-is
+        return lambda fn: fn
+
+    if _missing("uvicorn"):
+        sys.modules["uvicorn"] = types.ModuleType("uvicorn")
+
+    if _missing("fastapi"):
+        fastapi_mod = types.ModuleType("fastapi")
+
+        class _FastAPI:
+            post = get = _passthrough
+
+            def __init__(self, *a, **kw):
+                self.state = types.SimpleNamespace()
+
+        class _HTTPException(Exception):
+            def __init__(self, status_code=500, detail=""):
+                self.status_code = status_code
+                self.detail = detail
+
+        class _Request:  # not actually instantiated by the routing tests
+            pass
+
+        class _StreamingResponse:
+            def __init__(self, *a, **kw): pass
+
+        fastapi_mod.FastAPI = _FastAPI
+        fastapi_mod.HTTPException = _HTTPException
+        fastapi_mod.Request = _Request
+        sys.modules["fastapi"] = fastapi_mod
+        responses_mod = types.ModuleType("fastapi.responses")
+        responses_mod.StreamingResponse = _StreamingResponse
+        sys.modules["fastapi.responses"] = responses_mod
+
+    if _missing("httpx"):
+        httpx_mod = types.ModuleType("httpx")
+
+        class _AsyncClient:
+            def __init__(self, *a, **kw): pass
+            async def aclose(self): pass
+
+        class _Limits:
+            def __init__(self, *a, **kw): pass
+
+        httpx_mod.AsyncClient = _AsyncClient
+        httpx_mod.Limits = _Limits
+        sys.modules["httpx"] = httpx_mod
+
+
+@pytest.fixture(scope="module")
+def proxy():
+    """Import scripts/cache_aware_proxy.py by file path, once per test module."""
+    _install_stub_modules()
+    spec = importlib.util.spec_from_file_location("cache_aware_proxy", PROXY_PATH)
+    if spec is None or spec.loader is None:
+        pytest.skip(f"cannot load proxy module at {PROXY_PATH}")
+    module = sys.modules["cache_aware_proxy"] = importlib.util.module_from_spec(spec)
+    try:
+        spec.loader.exec_module(module)
+    except ModuleNotFoundError as exc:
+        pytest.skip(f"proxy dependency missing: {exc}")
+    return module
+
+
+def _make_inst(proxy, url: str, ongoing_tokens: int = 0,
+               active_p_offloads: int = 0):
+    """Build a ``proxy.InstanceState`` for *url* and set its two load counters."""
+    state = proxy.InstanceState(url)
+    state.ongoing_tokens, state.active_p_offloads = ongoing_tokens, active_p_offloads
+    return state
+
+
+def test_record_prefix_evicts_oldest_block(proxy):
+    """Recording beyond capacity drops the oldest cached block.
+
+    CACHE_CAPACITY_BLOCKS is lowered to 2 for the test and restored afterwards.
+    """
+    state = proxy.InstanceState("http://x")
+    original_capacity = proxy.CACHE_CAPACITY_BLOCKS
+    proxy.CACHE_CAPACITY_BLOCKS = 2
+    try:
+        width = proxy.BLOCK_SIZE
+        # Three distinct single-block prefixes, recorded oldest to newest.
+        first, second, third = ([token] * width for token in (1, 2, 3))
+        for prefix in (first, second, third):
+            state.record_prefix(prefix)
+        assert len(state.cached_blocks) == 2
+        # The oldest prefix no longer hits; the two newer ones hit in full.
+        assert state.estimate_cache_hit(first) == 0
+        assert state.estimate_cache_hit(second) == width
+        assert state.estimate_cache_hit(third) == width
+    finally:
+        proxy.CACHE_CAPACITY_BLOCKS = original_capacity
+
+
+def test_estimate_cache_hit_touches_lru(proxy):
+    """Looking a block up refreshes it, so a later eviction spares it.
+
+    Runs with CACHE_CAPACITY_BLOCKS temporarily set to 2.
+    """
+    state = proxy.InstanceState("http://x")
+    original_capacity = proxy.CACHE_CAPACITY_BLOCKS
+    proxy.CACHE_CAPACITY_BLOCKS = 2
+    try:
+        width = proxy.BLOCK_SIZE
+        older, newer, incoming = ([token] * width for token in (1, 2, 3))
+        state.record_prefix(older)
+        state.record_prefix(newer)
+        # The hit on `older` promotes it, leaving `newer` as the eviction candidate.
+        assert state.estimate_cache_hit(older) == width
+        state.record_prefix(incoming)  # cache is full, so one block must go
+        assert state.estimate_cache_hit(older) == width
+        assert state.estimate_cache_hit(newer) == 0
+    finally:
+        proxy.CACHE_CAPACITY_BLOCKS = original_capacity
+
+
+def test_pick_instance_session_affinity_sticks(proxy):
+    """A session already pinned to index 1 is routed there again."""
+    pool = [_make_inst(proxy, url) for url in ("http://a", "http://b")]
+    picked, slot = proxy.pick_instance(pool, None, "sess1", 1000, {"sess1": 1})
+    assert slot == 1 and picked is pool[1]
+
+
+def test_pick_instance_session_affinity_breaks_on_overload(proxy):
+    """An overloaded pinned instance is abandoned for a load-aware pick.
+
+    Mean load is ~333k tokens; the pinned instance at 1M is ~3x that, which
+    exceeds the overload factor (assumed 2.0 -- confirm against the proxy).
+    """
+    loads = {"http://a": 100, "http://b": 1_000_000, "http://c": 100}
+    pool = [_make_inst(proxy, url, ongoing_tokens=n) for url, n in loads.items()]
+    pinned = 1
+    picked, slot = proxy.pick_instance(pool, None, "sess1", 1000, {"sess1": pinned})
+    assert slot != pinned
+    assert picked is not pool[pinned]
+
+
+def test_pick_instance_p_offload_penalty_steers_away(proxy):
+    """Active prefill offloads are penalized when picking an instance.
+
+    The idle instance with two offloads must lose to the one carrying 1000
+    tokens (the penalty is presumably 2 * HEAVY_THRESHOLD, far above 1000)."""
+    offloading = _make_inst(proxy, "http://a", ongoing_tokens=0, active_p_offloads=2)
+    loaded = _make_inst(proxy, "http://b", ongoing_tokens=1000)
+    picked, slot = proxy.pick_instance([offloading, loaded], None, None, 5000, {})
+    assert slot == 1 and picked is loaded
+
+
+def test_pick_instance_lmetric_picks_lowest_score(proxy):
+    """The idle instance beats the busy one under lmetric scoring."""
+    idle, busy = _make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")
+    idle.pending_prefill_tokens, idle.num_requests = 0, 0
+    busy.pending_prefill_tokens, busy.num_requests = 5000, 4
+    picked, slot = proxy.pick_instance_lmetric([idle, busy], None, None, 1000, {})
+    # Expected scores per the original note: idle 1000 * 0 = 0, busy
+    # (5000 + 1000) * 4 = 24000.
+    assert slot == 0 and picked is idle