From 1cf03c6e790a7490a643759a1f013fcd2d0c4ea1 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Sat, 23 May 2026 23:59:06 +0800 Subject: [PATCH] Cost model: add interference penalty for co-located heavy prefill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Old cost model: apart from queue differences, offload_cost = colocated_cost + RDMA_overhead, so offload was 0.1s more expensive whenever the two queues were equal. Result: only 19/117 HEAVY offloaded. New: colocated_cost includes interference penalty when C_s has decode requests: penalty = prefill_time × min(num_requests, 3) × 0.3. Offload now wins once that penalty exceeds RDMA_overhead plus any extra queueing on D, which can happen with as few as 1 concurrent request on C_s. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/cache_aware_proxy.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/cache_aware_proxy.py b/scripts/cache_aware_proxy.py index bcb285c..a60f66a 100644 --- a/scripts/cache_aware_proxy.py +++ b/scripts/cache_aware_proxy.py @@ -384,14 +384,18 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h d_candidate = min(remaining, key=lambda c: c.ongoing_tokens) if remaining else p_candidate # Cost model: compare co-located vs direct-RDMA-read offload - # Co-located: queue on C_s + prefill new tokens on C_s + # Co-located cost includes interference: heavy prefill on C_s blocks + # its ongoing decode requests, degrading their TPOT. 
cs_queue = best_inst.pending_prefill_tokens / SETTINGS.prefill_throughput - colocated_cost = cs_queue + estimated_new / SETTINGS.prefill_throughput + prefill_time = estimated_new / SETTINGS.prefill_throughput + # Interference penalty: if C_s has decode requests, heavy prefill disrupts them + interference = prefill_time * min(best_inst.num_requests, 3) * 0.3 + colocated_cost = cs_queue + prefill_time + interference - # Direct RDMA read: D reads C_s's cached blocks via RDMA + D prefills new tokens locally - # D's queue + RDMA read time + D local prefill of new tokens only + # Direct RDMA read: D reads cached blocks + prefills new tokens locally + # C_s is not involved → zero interference on C_s's decode d_queue = d_candidate.pending_prefill_tokens / SETTINGS.prefill_throughput - offload_cost = d_queue + SETTINGS.rdma_overhead_s + estimated_new / SETTINGS.prefill_throughput + offload_cost = d_queue + SETTINGS.rdma_overhead_s + prefill_time breakdown["cache_ratio"] = cache_ratio breakdown["colocated_cost"] = round(colocated_cost, 2)