From 1cf03c6e790a7490a643759a1f013fcd2d0c4ea1 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sat, 23 May 2026 23:59:06 +0800
Subject: [PATCH] Cost model: add interference penalty for co-located heavy
 prefill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

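Old cost model: colocated_cost = cs_queue + prefill_time, while
offload_cost = d_queue + RDMA_overhead + prefill_time. With comparable
queues this reduces to offload_cost = colocated_cost + RDMA_overhead, so
offload was always 0.1s more expensive. Result: only 19/117 HEAVY
requests were offloaded.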

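New: colocated_cost includes an interference penalty when C_s has decode
requests (counted via best_inst.num_requests, capped at 3):

    penalty = prefill_time × min(num_requests, 3) × 0.3

The penalty is zero when C_s has no requests. Offload now wins when C_s
has 1+ concurrent request and the penalty exceeds the RDMA overhead plus
any queue difference (d_queue - cs_queue).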

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/cache_aware_proxy.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/cache_aware_proxy.py b/scripts/cache_aware_proxy.py
index bcb285c..a60f66a 100644
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -384,14 +384,18 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
         d_candidate = min(remaining, key=lambda c: c.ongoing_tokens) if remaining else p_candidate
 
         # Cost model: compare co-located vs direct-RDMA-read offload
-        # Co-located: queue on C_s + prefill new tokens on C_s
+        # Co-located cost includes interference: heavy prefill on C_s blocks
+        # its ongoing decode requests, degrading their TPOT.
         cs_queue = best_inst.pending_prefill_tokens / SETTINGS.prefill_throughput
-        colocated_cost = cs_queue + estimated_new / SETTINGS.prefill_throughput
+        prefill_time = estimated_new / SETTINGS.prefill_throughput
+        # Interference penalty: if C_s has decode requests, heavy prefill disrupts them
+        interference = prefill_time * min(best_inst.num_requests, 3) * 0.3
+        colocated_cost = cs_queue + prefill_time + interference
 
-        # Direct RDMA read: D reads C_s's cached blocks via RDMA + D prefills new tokens locally
-        # D's queue + RDMA read time + D local prefill of new tokens only
+        # Direct RDMA read: D reads cached blocks + prefills new tokens locally
+        # C_s is not involved → zero interference on C_s's decode
         d_queue = d_candidate.pending_prefill_tokens / SETTINGS.prefill_throughput
-        offload_cost = d_queue + SETTINGS.rdma_overhead_s + estimated_new / SETTINGS.prefill_throughput
+        offload_cost = d_queue + SETTINGS.rdma_overhead_s + prefill_time
 
         breakdown["cache_ratio"] = cache_ratio
         breakdown["colocated_cost"] = round(colocated_cost, 2)