Add --offload-mode switch for ablation (direct_read vs cached_prefill)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-24 11:24:15 +08:00
parent cdf83493ab
commit bf76273778
2 changed files with 29 additions and 10 deletions
--- a/scripts/bench.sh
+++ b/scripts/bench.sh
@@ -37,6 +37,7 @@ OVERLOAD_FACTOR_ARG=""
 MAX_BATCHED_TOKENS=""
 MAX_OFFLOAD_INFLIGHT=""
 CACHE_GATE_RATIO=""
+OFFLOAD_MODE=""

 # Parse args
 while [[ $# -gt 0 ]]; do
@@ -53,6 +54,7 @@ while [[ $# -gt 0 ]]; do
        --max-batched-tokens) MAX_BATCHED_TOKENS="$2"; shift 2 ;;
        --max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
        --cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
+        --offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
        *) echo "Unknown: $1"; exit 1 ;;
    esac
 done
@@ -218,6 +220,9 @@ launch_proxy() {
    if [ -n "$CACHE_GATE_RATIO" ]; then
        extra_args="$extra_args --cache-gate-ratio $CACHE_GATE_RATIO"
    fi
+    if [ -n "$OFFLOAD_MODE" ]; then
+        extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
+    fi
    if [ "$MODE" = "elastic" ]; then
        local bp_list=""
        for i in $(seq 0 $((N_INSTANCES - 1))); do
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -448,23 +448,33 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
        else:
            push_cache_hit = best_cache_hit  # fallback to shadow estimate

-        # If real hit > 0, proceed with cached prefill on C → decode on D
+        # If real hit > 0, proceed with offload
        if push_cache_hit > 0:
            push_new = max(0, input_length - push_cache_hit)

-            c_inst.ongoing_tokens += input_length
-            c_inst.pending_prefill_tokens += push_new
-            c_inst.num_requests += 1
-            c_inst.active_p_offloads += 1
-
-            breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
+            offload_mode = getattr(global_args, 'offload_mode', 'cached_prefill')
            breakdown["c_inst"] = c_inst.url
            breakdown["d_inst"] = d_inst.url
            breakdown["push_cache_hit"] = push_cache_hit

+            if offload_mode == "cached_prefill":
+                c_inst.ongoing_tokens += input_length
+                c_inst.pending_prefill_tokens += push_new
+                c_inst.num_requests += 1
+                c_inst.active_p_offloads += 1
+                breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
                return await _handle_cached_prefill_offload(
                    api, req_data, headers, token_ids, input_length,
                    c_inst, d_inst, push_cache_hit, push_new, breakdown)
+            else:
+                d_inst.ongoing_tokens += input_length
+                d_inst.pending_prefill_tokens += push_new
+                d_inst.num_requests += 1
+                c_inst.active_p_offloads += 1
+                breakdown["route_class"] = "PUSH_MIGRATE"
+                return await _handle_direct_read_offload(
+                    api, req_data, headers, token_ids, input_length,
+                    c_inst, d_inst, push_cache_hit, push_new, breakdown)

        # Real hit is 0 — downgrade to LOCAL
        breakdown["push_downgraded"] = True
@@ -775,6 +785,10 @@ def parse_args():
                   help="Break session affinity when instance load > factor * avg")
    p.add_argument("--max-offload-inflight", type=int, default=4,
                   help="Global cap on concurrent P-role offloads (M3)")
+    p.add_argument("--offload-mode", type=str, default="cached_prefill",
+                   choices=["direct_read", "cached_prefill"],
+                   help="direct_read: D pulls KV from C (PUSH). "
+                        "cached_prefill: C prefills then D decodes (PD-sep style).")
    p.add_argument("--cache-gate-ratio", type=float, default=0.3,
                   help="Min cache_hit/input ratio to allow offload "
                        "(0.0 disables gate, 1.0 disables offload entirely)")