A5 fix: worker-id resolution and vLLM cmpl- rid stripping
Smoke validation on dash0 surfaced three real bugs that broke interference and failure-attribution labels end-to-end: 1. endpoint_url in metrics is the proxy URL (e.g. http://h:9200); the vLLM worker URL lives in breakdown's routed_to. The interference index and label path were taking endpoint_url first, so every request looked routed to a non-existent worker and the overlap counter stayed at zero. 2. _normalize_worker hard-coded base port 8000, so a smoke run on port 9100 resolved to engine_1100 instead of engine_0. Added a --worker-map URL=engine_id CLI flag and _resolve_worker() that prefers the explicit map and falls back to the heuristic. 3. vLLM rewrites the per-step rid as cmpl-<proxy_id>-<i>-<hash>, so the str equality check between per_req rid and our proxy request_id never matched -> every prefill step looked like "other request prefill", which would have flipped overlap to 100%. Added _vllm_rid_matches() that strips the cmpl-/chatcmpl- prefix. After the fix, the same smoke run reports interference_index = 22.9 across 24 overlap / 6 clean requests on a single instance, which is the expected shape for serial dispatch into a cold engine. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -204,6 +204,7 @@ def _fractions(*parts: int) -> JsonDict:
|
||||
def interference_index(
|
||||
joined: list[JsonDict],
|
||||
engine_state_by_worker: dict[str, list[JsonDict]],
|
||||
worker_map: dict[str, str] | None = None,
|
||||
) -> JsonDict:
|
||||
"""Label each completed request's decode period as overlap / no-overlap.
|
||||
|
||||
@@ -219,7 +220,12 @@ def interference_index(
|
||||
tpot_clean: list[float] = []
|
||||
for r in joined:
|
||||
rid = r.get("request_id")
|
||||
worker = _normalize_worker(r.get("endpoint_url") or r.get("routed_to"))
|
||||
# routed_to is the vLLM worker URL; endpoint_url is the proxy URL.
|
||||
# For worker-id matching we want routed_to.
|
||||
worker = _resolve_worker(
|
||||
r.get("routed_to") or r.get("endpoint_url"),
|
||||
worker_map,
|
||||
)
|
||||
steps = engine_state_by_worker.get(worker)
|
||||
if not steps:
|
||||
continue
|
||||
@@ -239,9 +245,15 @@ def interference_index(
|
||||
# this request's own prefill warming up, not interference.
|
||||
other_prefill = False
|
||||
for pr in s.get("per_req", []) or []:
|
||||
if pr.get("phase") == "prefill" and pr.get("rid") != rid:
|
||||
other_prefill = True
|
||||
break
|
||||
if pr.get("phase") != "prefill":
|
||||
continue
|
||||
# vLLM rewrites rid as `cmpl-<our_id>-<idx>-<hash>`; strip
|
||||
# the prefix and the trailing suffix so equality to our
|
||||
# proxy request_id works.
|
||||
if _vllm_rid_matches(pr.get("rid"), rid):
|
||||
continue
|
||||
other_prefill = True
|
||||
break
|
||||
if other_prefill:
|
||||
overlap = True
|
||||
break
|
||||
@@ -264,12 +276,15 @@ def interference_index(
|
||||
|
||||
|
||||
def _normalize_worker(url_or_id: str | None) -> str | None:
|
||||
"""Map endpoint URLs to AGENTIC_WORKER_ID convention engine_{i}."""
|
||||
"""Best-effort: map URLs like http://h:8000 to engine_0 by base-port 8000.
|
||||
|
||||
Use `_resolve_worker(url, worker_map)` instead when you have an
|
||||
explicit URL→worker_id map (e.g. from bench.sh).
|
||||
"""
|
||||
if not url_or_id:
|
||||
return None
|
||||
if url_or_id.startswith("engine_"):
|
||||
return url_or_id
|
||||
# Endpoint URLs look like http://host:8000; map by port offset against 8000.
|
||||
try:
|
||||
port = int(url_or_id.rsplit(":", 1)[1].split("/")[0])
|
||||
return f"engine_{port - 8000}"
|
||||
@@ -277,10 +292,36 @@ def _normalize_worker(url_or_id: str | None) -> str | None:
|
||||
return url_or_id
|
||||
|
||||
|
||||
def _resolve_worker(
|
||||
url_or_id: str | None,
|
||||
worker_map: dict[str, str] | None,
|
||||
) -> str | None:
|
||||
if not url_or_id:
|
||||
return None
|
||||
if worker_map and url_or_id in worker_map:
|
||||
return worker_map[url_or_id]
|
||||
return _normalize_worker(url_or_id)
|
||||
|
||||
|
||||
def _vllm_rid_matches(vllm_rid: Any, proxy_rid: Any) -> bool:
|
||||
"""vLLM internally rewrites rid as `cmpl-<proxy_id>-<i>-<hash>`."""
|
||||
if vllm_rid is None or proxy_rid is None:
|
||||
return False
|
||||
if vllm_rid == proxy_rid:
|
||||
return True
|
||||
s = str(vllm_rid)
|
||||
p = str(proxy_rid)
|
||||
return s.startswith(f"cmpl-{p}-") or s.startswith(f"chatcmpl-{p}-")
|
||||
|
||||
|
||||
# ---------- Hotspot index (B3) ------------------------------------------
|
||||
|
||||
def hotspot_index(joined: list[JsonDict]) -> JsonDict:
|
||||
"""max/median per-worker queue-delay p90 across completed requests."""
|
||||
"""max/median per-worker queue-delay p90 across completed requests.
|
||||
|
||||
Worker key is the raw `routed_to` URL (or proxy `endpoint_url`
|
||||
fallback), so per-worker rows match the user's mental model.
|
||||
"""
|
||||
if not joined:
|
||||
return {"status": "unavailable"}
|
||||
|
||||
@@ -289,6 +330,7 @@ def hotspot_index(joined: list[JsonDict]) -> JsonDict:
|
||||
for r in joined:
|
||||
if r.get("error"):
|
||||
continue
|
||||
# routed_to is the vLLM worker URL; endpoint_url is the proxy URL.
|
||||
worker = r.get("routed_to") or r.get("endpoint_url")
|
||||
if not worker:
|
||||
continue
|
||||
@@ -335,6 +377,7 @@ def label_slow_requests(
|
||||
engine_state_by_worker: dict[str, list[JsonDict]],
|
||||
slo: JsonDict | None = None,
|
||||
slow_ttft_factor: float = 2.0,
|
||||
worker_map: dict[str, str] | None = None,
|
||||
) -> list[JsonDict]:
|
||||
slo = slo or DEFAULT_SLO
|
||||
ttft_threshold = float(slo["ttft_p90_s"]) * slow_ttft_factor
|
||||
@@ -358,7 +401,8 @@ def label_slow_requests(
|
||||
continue
|
||||
if ttft <= ttft_threshold:
|
||||
continue
|
||||
label = _assign_label(r, hot_workers, engine_state_by_worker)
|
||||
label = _assign_label(r, hot_workers, engine_state_by_worker,
|
||||
worker_map)
|
||||
labels.append({
|
||||
"request_id": r.get("request_id"),
|
||||
"session_id": r.get("session_id"),
|
||||
@@ -377,8 +421,12 @@ def _assign_label(
|
||||
r: JsonDict,
|
||||
hot_workers: set[str],
|
||||
engine_state_by_worker: dict[str, list[JsonDict]],
|
||||
worker_map: dict[str, str] | None = None,
|
||||
) -> str:
|
||||
worker = _normalize_worker(r.get("endpoint_url") or r.get("routed_to"))
|
||||
worker = _resolve_worker(
|
||||
r.get("routed_to") or r.get("endpoint_url"),
|
||||
worker_map,
|
||||
)
|
||||
rid = r.get("request_id")
|
||||
steps = engine_state_by_worker.get(worker, [])
|
||||
t0 = r.get("t_first_token_unix")
|
||||
@@ -389,8 +437,11 @@ def _assign_label(
|
||||
if t is None or t < t0 or t > t1:
|
||||
continue
|
||||
for pr in s.get("per_req", []) or []:
|
||||
if pr.get("phase") == "prefill" and pr.get("rid") != rid:
|
||||
return "same_worker_prefill_overlap"
|
||||
if pr.get("phase") != "prefill":
|
||||
continue
|
||||
if _vllm_rid_matches(pr.get("rid"), rid):
|
||||
continue
|
||||
return "same_worker_prefill_overlap"
|
||||
if (r.get("routed_to") or "") in hot_workers:
|
||||
return "hot_worker_queue"
|
||||
est = r.get("estimated_new_tokens") or 0
|
||||
@@ -489,7 +540,17 @@ def main(argv: list[str] | None = None) -> None:
|
||||
help="run_meta or window_summary.json from SRR loadgen")
|
||||
p.add_argument("--out-dir", type=Path, required=True)
|
||||
p.add_argument("--slow-ttft-factor", type=float, default=2.0)
|
||||
p.add_argument("--worker-map", type=str, default=None,
|
||||
help="Comma-separated URL=worker_id pairs, e.g. "
|
||||
"http://h:9100=engine_0,http://h:9101=engine_1")
|
||||
args = p.parse_args(argv)
|
||||
worker_map: dict[str, str] | None = None
|
||||
if args.worker_map:
|
||||
worker_map = {}
|
||||
for entry in args.worker_map.split(","):
|
||||
url, _, wid = entry.strip().partition("=")
|
||||
if url and wid:
|
||||
worker_map[url] = wid
|
||||
|
||||
metrics = load_jsonl(args.metrics)
|
||||
breakdown_raw = load_json(args.breakdown) if args.breakdown else []
|
||||
@@ -512,11 +573,12 @@ def main(argv: list[str] | None = None) -> None:
|
||||
write_json(args.out_dir / "reuse_decomposition.json",
|
||||
reuse_decomposition(joined))
|
||||
write_json(args.out_dir / "interference_index.json",
|
||||
interference_index(joined, engine_state))
|
||||
interference_index(joined, engine_state, worker_map))
|
||||
write_json(args.out_dir / "hotspot_index.json",
|
||||
hotspot_index(joined))
|
||||
labels = label_slow_requests(joined, engine_state,
|
||||
slow_ttft_factor=args.slow_ttft_factor)
|
||||
slow_ttft_factor=args.slow_ttft_factor,
|
||||
worker_map=worker_map)
|
||||
write_jsonl(args.out_dir / "failure_label.jsonl", labels)
|
||||
counts: dict[str, int] = defaultdict(int)
|
||||
for L in labels:
|
||||
|
||||
@@ -11,6 +11,8 @@ from analysis.characterization.joined_analysis import (
|
||||
window_summary,
|
||||
_normalize_worker,
|
||||
_percentile,
|
||||
_resolve_worker,
|
||||
_vllm_rid_matches,
|
||||
)
|
||||
|
||||
|
||||
@@ -70,18 +72,22 @@ def test_normalize_worker_maps_port_to_engine_id():
|
||||
|
||||
def test_interference_index_marks_overlap_when_other_request_prefilling():
|
||||
metrics = [
|
||||
_mk_metric("decode_target", endpoint_url="http://h:8000",
|
||||
_mk_metric("decode_target",
|
||||
t_first_token_unix=10.0, t_finish_unix=11.0,
|
||||
tpot_s=0.10),
|
||||
_mk_metric("decode_clean", endpoint_url="http://h:8001",
|
||||
_mk_metric("decode_clean",
|
||||
t_first_token_unix=20.0, t_finish_unix=21.0,
|
||||
tpot_s=0.04),
|
||||
]
|
||||
joined = build_joined_records(metrics, [], [])
|
||||
breakdown = [
|
||||
{"request_id": "decode_target", "routed_to": "http://h:8000"},
|
||||
{"request_id": "decode_clean", "routed_to": "http://h:8001"},
|
||||
]
|
||||
joined = build_joined_records(metrics, breakdown, [])
|
||||
engine_state = {
|
||||
"engine_0": [
|
||||
{"t_unix": 10.5, "prefill_tokens": 8000,
|
||||
"per_req": [{"rid": "other", "phase": "prefill"}]},
|
||||
"per_req": [{"rid": "cmpl-other-0-aaaa", "phase": "prefill"}]},
|
||||
],
|
||||
"engine_1": [
|
||||
{"t_unix": 20.5, "prefill_tokens": 0,
|
||||
@@ -93,10 +99,22 @@ def test_interference_index_marks_overlap_when_other_request_prefilling():
|
||||
assert out["n_overlap_requests"] == 1
|
||||
assert out["n_clean_requests"] == 1
|
||||
assert out["interference_index"] is not None
|
||||
# Overlap p90 = 0.10; clean p90 = 0.04; ratio > 2
|
||||
assert out["interference_index"] > 2.0
|
||||
|
||||
|
||||
def test_resolve_worker_prefers_explicit_map():
|
||||
assert _resolve_worker("http://h:9100", {"http://h:9100": "engine_0"}) == "engine_0"
|
||||
assert _resolve_worker("http://h:9100", None) == "engine_1100"
|
||||
|
||||
|
||||
def test_vllm_rid_matches_strips_cmpl_prefix():
|
||||
assert _vllm_rid_matches("cmpl-1237198:1:1237198:0-0-b07fed77",
|
||||
"1237198:1:1237198:0")
|
||||
assert _vllm_rid_matches("chatcmpl-abc-0-xx", "abc")
|
||||
assert not _vllm_rid_matches("cmpl-other-0-xx", "1237198:1:1237198:0")
|
||||
assert not _vllm_rid_matches(None, "x")
|
||||
|
||||
|
||||
def test_hotspot_index_max_over_median_p90():
|
||||
"""One hot worker with TTFT 10x the others should drive a >1 index."""
|
||||
rows = []
|
||||
@@ -118,14 +136,10 @@ def test_label_slow_requests_flags_overlap_and_hot_worker():
|
||||
_mk_metric("slow_overlap", ttft_s=10.0,
|
||||
t_first_token_unix=10.0, t_finish_unix=11.0),
|
||||
_mk_metric("slow_no_overlap", ttft_s=10.0,
|
||||
endpoint_url="http://h:8005",
|
||||
t_first_token_unix=20.0, t_finish_unix=21.0),
|
||||
_mk_metric("fast", ttft_s=0.5,
|
||||
t_first_token_unix=15.0, t_finish_unix=16.0),
|
||||
]
|
||||
metrics[0]["routed_to"] = "http://h:8000"
|
||||
metrics[1]["routed_to"] = "http://h:8005"
|
||||
metrics[2]["routed_to"] = "http://h:8000"
|
||||
bk = [
|
||||
{"request_id": "slow_overlap", "routed_to": "http://h:8000"},
|
||||
{"request_id": "slow_no_overlap", "routed_to": "http://h:8005"},
|
||||
@@ -134,7 +148,8 @@ def test_label_slow_requests_flags_overlap_and_hot_worker():
|
||||
joined = build_joined_records(metrics, bk, [])
|
||||
engine_state = {
|
||||
"engine_0": [{"t_unix": 10.5, "prefill_tokens": 5000,
|
||||
"per_req": [{"rid": "other", "phase": "prefill"}]}],
|
||||
"per_req": [{"rid": "cmpl-other-0-x",
|
||||
"phase": "prefill"}]}],
|
||||
}
|
||||
labels = label_slow_requests(joined, engine_state, slow_ttft_factor=2.0)
|
||||
by_id = {L["request_id"]: L["label"] for L in labels}
|
||||
|
||||
Reference in New Issue
Block a user