Add --offload-mode switch for ablation (direct_read vs cached_prefill)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 11:24:15 +08:00
parent cdf83493ab
commit bf76273778
2 changed files with 29 additions and 10 deletions

View File

@@ -37,6 +37,7 @@ OVERLOAD_FACTOR_ARG=""
MAX_BATCHED_TOKENS=""
MAX_OFFLOAD_INFLIGHT=""
CACHE_GATE_RATIO=""
OFFLOAD_MODE=""
# Parse args
while [[ $# -gt 0 ]]; do
@@ -53,6 +54,7 @@ while [[ $# -gt 0 ]]; do
--max-batched-tokens) MAX_BATCHED_TOKENS="$2"; shift 2 ;;
--max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
--cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
--offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
*) echo "Unknown: $1"; exit 1 ;;
esac
done
@@ -218,6 +220,9 @@ launch_proxy() {
if [ -n "$CACHE_GATE_RATIO" ]; then
extra_args="$extra_args --cache-gate-ratio $CACHE_GATE_RATIO"
fi
if [ -n "$OFFLOAD_MODE" ]; then
extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
fi
if [ "$MODE" = "elastic" ]; then
local bp_list=""
for i in $(seq 0 $((N_INSTANCES - 1))); do

View File

@@ -448,23 +448,33 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
else:
push_cache_hit = best_cache_hit # fallback to shadow estimate
# If real hit > 0, proceed with cached prefill on C → decode on D
# If real hit > 0, proceed with offload
if push_cache_hit > 0:
push_new = max(0, input_length - push_cache_hit)
c_inst.ongoing_tokens += input_length
c_inst.pending_prefill_tokens += push_new
c_inst.num_requests += 1
c_inst.active_p_offloads += 1
breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
offload_mode = getattr(global_args, 'offload_mode', 'cached_prefill')
breakdown["c_inst"] = c_inst.url
breakdown["d_inst"] = d_inst.url
breakdown["push_cache_hit"] = push_cache_hit
if offload_mode == "cached_prefill":
c_inst.ongoing_tokens += input_length
c_inst.pending_prefill_tokens += push_new
c_inst.num_requests += 1
c_inst.active_p_offloads += 1
breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
return await _handle_cached_prefill_offload(
api, req_data, headers, token_ids, input_length,
c_inst, d_inst, push_cache_hit, push_new, breakdown)
else:
d_inst.ongoing_tokens += input_length
d_inst.pending_prefill_tokens += push_new
d_inst.num_requests += 1
c_inst.active_p_offloads += 1
breakdown["route_class"] = "PUSH_MIGRATE"
return await _handle_direct_read_offload(
api, req_data, headers, token_ids, input_length,
c_inst, d_inst, push_cache_hit, push_new, breakdown)
# Real hit is 0 — downgrade to LOCAL
breakdown["push_downgraded"] = True
@@ -775,6 +785,10 @@ def parse_args():
help="Break session affinity when instance load > factor * avg")
p.add_argument("--max-offload-inflight", type=int, default=4,
help="Global cap on concurrent P-role offloads (M3)")
p.add_argument("--offload-mode", type=str, default="cached_prefill",
choices=["direct_read", "cached_prefill"],
help="direct_read: D pulls KV from C (PUSH). "
"cached_prefill: C prefills then D decodes (PD-sep style).")
p.add_argument("--cache-gate-ratio", type=float, default=0.3,
help="Min cache_hit/input ratio to allow offload "
"(0.0 disables gate, 1.0 disables offload entirely)")