Add transfer queue admission knobs
This commit is contained in:
@@ -59,6 +59,8 @@ class ReplayConfig:
|
||||
"release-after-transfer"
|
||||
)
|
||||
kvcache_seed_max_inflight_decode: int | None = 3
|
||||
kvcache_seed_max_decode_transfer_queue_reqs: int | None = None
|
||||
kvcache_direct_max_decode_transfer_queue_reqs: int | None = None
|
||||
kvcache_prefill_priority_eviction: bool = False
|
||||
kvcache_prefill_direct_priority: int = -100
|
||||
kvcache_prefill_normal_priority: int = 100
|
||||
@@ -922,16 +924,29 @@ async def _fetch_decode_load_snapshot(
|
||||
def _decode_load_backpressure_reason(
|
||||
snapshot: DecodeLoadSnapshot | None,
|
||||
*,
|
||||
config: ReplayConfig,
|
||||
routing_mode: Literal["direct", "seed"],
|
||||
) -> str | None:
|
||||
if snapshot is None:
|
||||
return None
|
||||
if routing_mode == "direct":
|
||||
if (
|
||||
config.kvcache_direct_max_decode_transfer_queue_reqs is not None
|
||||
and snapshot.decode_transfer_queue_reqs
|
||||
> config.kvcache_direct_max_decode_transfer_queue_reqs
|
||||
):
|
||||
return "d-transfer-backpressure"
|
||||
if snapshot.decode_retracted_queue_reqs > 0 and snapshot.token_usage >= 0.99:
|
||||
return "d-retracted"
|
||||
if snapshot.token_usage >= 0.992:
|
||||
return "d-token-usage-critical"
|
||||
else:
|
||||
if (
|
||||
config.kvcache_seed_max_decode_transfer_queue_reqs is not None
|
||||
and snapshot.decode_transfer_queue_reqs
|
||||
> config.kvcache_seed_max_decode_transfer_queue_reqs
|
||||
):
|
||||
return "d-transfer-backpressure"
|
||||
if snapshot.decode_retracted_queue_reqs > 0:
|
||||
return "d-retracted"
|
||||
if snapshot.token_usage >= 0.985:
|
||||
@@ -949,6 +964,7 @@ def _is_decode_backpressure_reason(reason: str | None) -> bool:
|
||||
"d-retracted",
|
||||
"d-token-usage-critical",
|
||||
"d-prealloc-backpressure",
|
||||
"d-transfer-backpressure",
|
||||
}
|
||||
|
||||
|
||||
@@ -1221,6 +1237,7 @@ async def _reserve_prefill_backup_capacity(
|
||||
async def _reserve_decode_session_capacity(
|
||||
*,
|
||||
client: httpx.AsyncClient,
|
||||
config: ReplayConfig,
|
||||
request: TraceRequest,
|
||||
server_url: str,
|
||||
session: DirectSessionState,
|
||||
@@ -1233,6 +1250,7 @@ async def _reserve_decode_session_capacity(
|
||||
if admission_mode == "router":
|
||||
return await _reserve_decode_session_capacity_from_router_state(
|
||||
client=client,
|
||||
config=config,
|
||||
request=request,
|
||||
server_url=server_url,
|
||||
session=session,
|
||||
@@ -1315,6 +1333,7 @@ async def _reserve_decode_session_capacity(
|
||||
residency.capacity_tokens[server_url] = load_snapshot.max_total_num_tokens
|
||||
backpressure_reason = _decode_load_backpressure_reason(
|
||||
load_snapshot,
|
||||
config=config,
|
||||
routing_mode=routing_mode,
|
||||
)
|
||||
if backpressure_reason is not None:
|
||||
@@ -1429,6 +1448,7 @@ async def _reserve_decode_session_capacity(
|
||||
async def _reserve_decode_session_capacity_from_router_state(
|
||||
*,
|
||||
client: httpx.AsyncClient,
|
||||
config: ReplayConfig,
|
||||
request: TraceRequest,
|
||||
server_url: str,
|
||||
session: DirectSessionState,
|
||||
@@ -1836,6 +1856,7 @@ async def _execute_request(
|
||||
can_seed, reserved_tokens, _evicted, _p_backed, seed_reason = (
|
||||
await _reserve_decode_session_capacity(
|
||||
client=client,
|
||||
config=config,
|
||||
request=request,
|
||||
server_url=decode_url,
|
||||
session=decode_session,
|
||||
@@ -1911,6 +1932,7 @@ async def _execute_request(
|
||||
) = (
|
||||
await _reserve_decode_session_capacity(
|
||||
client=client,
|
||||
config=config,
|
||||
request=request,
|
||||
server_url=decode_url,
|
||||
session=decode_session,
|
||||
@@ -1996,6 +2018,7 @@ async def _execute_request(
|
||||
) = (
|
||||
await _reserve_decode_session_capacity(
|
||||
client=client,
|
||||
config=config,
|
||||
request=request,
|
||||
server_url=decode_url,
|
||||
session=decode_session,
|
||||
@@ -2084,6 +2107,7 @@ async def _execute_request(
|
||||
) = (
|
||||
await _reserve_decode_session_capacity(
|
||||
client=client,
|
||||
config=config,
|
||||
request=request,
|
||||
server_url=decode_url,
|
||||
session=decode_session,
|
||||
|
||||
Reference in New Issue
Block a user