Add --offload-mode switch for ablation (direct_read vs cached_prefill)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,7 @@ OVERLOAD_FACTOR_ARG=""
|
||||
MAX_BATCHED_TOKENS=""
|
||||
MAX_OFFLOAD_INFLIGHT=""
|
||||
CACHE_GATE_RATIO=""
|
||||
OFFLOAD_MODE=""
|
||||
|
||||
# Parse args
|
||||
while [[ $# -gt 0 ]]; do
|
||||
@@ -53,6 +54,7 @@ while [[ $# -gt 0 ]]; do
|
||||
--max-batched-tokens) MAX_BATCHED_TOKENS="$2"; shift 2 ;;
|
||||
--max-offload-inflight) MAX_OFFLOAD_INFLIGHT="$2"; shift 2 ;;
|
||||
--cache-gate-ratio) CACHE_GATE_RATIO="$2"; shift 2 ;;
|
||||
--offload-mode) OFFLOAD_MODE="$2"; shift 2 ;;
|
||||
*) echo "Unknown: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
@@ -218,6 +220,9 @@ launch_proxy() {
|
||||
if [ -n "$CACHE_GATE_RATIO" ]; then
|
||||
extra_args="$extra_args --cache-gate-ratio $CACHE_GATE_RATIO"
|
||||
fi
|
||||
if [ -n "$OFFLOAD_MODE" ]; then
|
||||
extra_args="$extra_args --offload-mode $OFFLOAD_MODE"
|
||||
fi
|
||||
if [ "$MODE" = "elastic" ]; then
|
||||
local bp_list=""
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
|
||||
@@ -448,23 +448,33 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
||||
else:
|
||||
push_cache_hit = best_cache_hit # fallback to shadow estimate
|
||||
|
||||
# If real hit > 0, proceed with cached prefill on C → decode on D
|
||||
# If real hit > 0, proceed with offload: cached prefill on C then decode on D, or direct read handled on D, depending on --offload-mode
|
||||
if push_cache_hit > 0:
|
||||
push_new = max(0, input_length - push_cache_hit)
|
||||
|
||||
c_inst.ongoing_tokens += input_length
|
||||
c_inst.pending_prefill_tokens += push_new
|
||||
c_inst.num_requests += 1
|
||||
c_inst.active_p_offloads += 1
|
||||
|
||||
breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
|
||||
offload_mode = getattr(global_args, 'offload_mode', 'cached_prefill')
|
||||
breakdown["c_inst"] = c_inst.url
|
||||
breakdown["d_inst"] = d_inst.url
|
||||
breakdown["push_cache_hit"] = push_cache_hit
|
||||
|
||||
if offload_mode == "cached_prefill":
|
||||
c_inst.ongoing_tokens += input_length
|
||||
c_inst.pending_prefill_tokens += push_new
|
||||
c_inst.num_requests += 1
|
||||
c_inst.active_p_offloads += 1
|
||||
breakdown["route_class"] = "CACHED_PREFILL_OFFLOAD"
|
||||
return await _handle_cached_prefill_offload(
|
||||
api, req_data, headers, token_ids, input_length,
|
||||
c_inst, d_inst, push_cache_hit, push_new, breakdown)
|
||||
else:
|
||||
d_inst.ongoing_tokens += input_length
|
||||
d_inst.pending_prefill_tokens += push_new
|
||||
d_inst.num_requests += 1
|
||||
c_inst.active_p_offloads += 1
|
||||
breakdown["route_class"] = "PUSH_MIGRATE"
|
||||
return await _handle_direct_read_offload(
|
||||
api, req_data, headers, token_ids, input_length,
|
||||
c_inst, d_inst, push_cache_hit, push_new, breakdown)
|
||||
|
||||
# Real hit is 0 — downgrade to LOCAL
|
||||
breakdown["push_downgraded"] = True
|
||||
@@ -775,6 +785,10 @@ def parse_args():
|
||||
help="Break session affinity when instance load > factor * avg")
|
||||
p.add_argument("--max-offload-inflight", type=int, default=4,
|
||||
help="Global cap on concurrent P-role offloads (M3)")
|
||||
p.add_argument("--offload-mode", type=str, default="cached_prefill",
|
||||
choices=["direct_read", "cached_prefill"],
|
||||
help="direct_read: D pulls KV from C (PUSH). "
|
||||
"cached_prefill: C prefills then D decodes (PD-sep style).")
|
||||
p.add_argument("--cache-gate-ratio", type=float, default=0.3,
|
||||
help="Min cache_hit/input ratio to allow offload "
|
||||
"(0.0 disables gate, 1.0 disables offload entirely)")
|
||||
|
||||
Reference in New Issue
Block a user