B3: load_only + sticky policies, capped-trace builder, sweep driver
Three additions land together because B3's whole point is comparing
LMetric against meaningful controls.
- scripts/cache_aware_proxy.py: two new --policy values.
- load_only: pure min(num_requests) routing, no cache or affinity.
The B3 control that strips locality so the LMetric-vs-load gap is
legible.
- sticky: first turn goes to min-load, subsequent turns ALWAYS
return to the same instance, even under saturation. The B3
control that maxes out locality so the hot-spot cost is legible.
- scripts/build_capped_trace.py: per-session turn cap (default 8).
Generates the session-mass-equalized variant the TODO calls for so
that hot-spot index can be re-measured with the heavy-tail removed.
- scripts/b3_sweep.sh: orchestrates the 5-cell sweep.
- GPU_INDICES makes it easy to skip a dead GPU.
- EXTRA_VLLM_ARGS defaults to --enable-prompt-tokens-details so
usage.prompt_tokens_details.cached_tokens is populated. vLLM
0.18.1 omits the field by default and breaks the reuse-decomp
pipeline; the smoke run surfaced this.
- Trap kills EngineCore by name in addition to "vllm serve" — the
parent dies first but the child holds GPU memory. Was the root
cause of the 89 GB ghost on GPU 0 earlier today.
- Proxy readiness is a polling loop, not a fixed sleep.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -178,6 +178,48 @@ def pick_instance(instances: list[InstanceState], token_ids: list[int] | None,
|
||||
return instances[best_idx], best_idx
|
||||
|
||||
|
||||
def pick_instance_load_only(
|
||||
instances: list[InstanceState],
|
||||
token_ids: list[int] | None,
|
||||
session_id: str | None,
|
||||
input_length: int,
|
||||
affinity: dict[str, int],
|
||||
) -> tuple[InstanceState, int]:
|
||||
"""Pure load balancing: pick instance with fewest in-flight requests.
|
||||
|
||||
Ignores cache hits and session affinity. Used as a B3 control to
|
||||
isolate the locality contribution of cache-aware policies.
|
||||
"""
|
||||
best_idx = min(range(len(instances)),
|
||||
key=lambda i: instances[i].num_requests)
|
||||
return instances[best_idx], best_idx
|
||||
|
||||
|
||||
def pick_instance_sticky(
|
||||
instances: list[InstanceState],
|
||||
token_ids: list[int] | None,
|
||||
session_id: str | None,
|
||||
input_length: int,
|
||||
affinity: dict[str, int],
|
||||
) -> tuple[InstanceState, int]:
|
||||
"""Hard session affinity: once assigned, never break.
|
||||
|
||||
First turn of a session picks the instance with the lowest
|
||||
num_requests; subsequent turns always return to the same instance
|
||||
regardless of load. Used as a B3 control to isolate the hot-spot
|
||||
cost of perfect locality.
|
||||
"""
|
||||
if session_id and session_id in affinity:
|
||||
idx = affinity[session_id]
|
||||
if idx < len(instances):
|
||||
return instances[idx], idx
|
||||
best_idx = min(range(len(instances)),
|
||||
key=lambda i: instances[i].num_requests)
|
||||
if session_id:
|
||||
affinity[session_id] = best_idx
|
||||
return instances[best_idx], best_idx
|
||||
|
||||
|
||||
def pick_instance_lmetric(instances: list[InstanceState], token_ids: list[int] | None,
|
||||
session_id: str | None, input_length: int,
|
||||
affinity: dict[str, int]) -> tuple[InstanceState, int]:
|
||||
@@ -585,6 +627,14 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
||||
chosen, best_idx = pick_instance_lmetric(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
session_affinity_combined)
|
||||
elif policy == "load_only":
|
||||
chosen, best_idx = pick_instance_load_only(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
session_affinity_combined)
|
||||
elif policy == "sticky":
|
||||
chosen, best_idx = pick_instance_sticky(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
session_affinity_combined)
|
||||
elif policy == "unified":
|
||||
chosen, best_idx, decision = pick_instance_unified_hybrid(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
@@ -799,8 +849,10 @@ def parse_args():
|
||||
p.add_argument("--bootstrap-ports", type=str, default="",
|
||||
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
||||
p.add_argument("--policy", type=str, default="linear",
|
||||
choices=["linear", "lmetric", "unified"],
|
||||
choices=["linear", "lmetric", "load_only", "sticky", "unified"],
|
||||
help="Routing policy: linear (cache-aware), lmetric (P_tokens × BS), "
|
||||
"load_only (B3 control: pure min-num_requests), "
|
||||
"sticky (B3 control: hard session affinity), "
|
||||
"or unified (hybrid affinity + LMetric fallback)")
|
||||
p.add_argument("--overload-factor", type=float, default=2.0,
|
||||
help="Break session affinity when instance load > factor * avg")
|
||||
|
||||
Reference in New Issue
Block a user