Validates the elastic_migration_v2 finding that kv_role=kv_both raises
TTFT p90 by 45% even when PD-sep never fires. Replicates it under a
single-instance, synthetic, open-loop workload to disambiguate the
mechanism cost from 8-instance feedback amplification.
Configurations (8):
plain, noop_connector, mooncake_{producer,consumer,both},
nixl_both, lmcache_only, multi_mooncake_lmcache.
Pre-flight verification gates risky configs (kv_consumer needs dummy
bootstrap, multi-connector composition, NoOp custom class loading).
Workload: two-phase sweep
Phase A: rate {0.5..32} req/s × shape (4096, 256), saturation criteria
Phase B: ref_safe rate × cartesian (input ∈ {512,4k,32k}, output ∈ {64,256,1024})
Step-timing patch enriches vLLM's existing AGENTIC_STEP_LOG_PATH emit
with step_duration_us and build_meta_us — directly measures per-step
substrate cost, not just user-visible TTFT/TPOT.
run_all.sh runs as a 5-stage barrier:
0 pre-flight + apply patch
1 Phase A all configs
2 pick ref_safe / ref_load
3 Phase B all configs
4 revert patch + analyze + plot
Outputs aggregate.{json,csv}, MANIFEST.tsv, and 5 figures.
Estimated runtime: 4-5.5 hours on idle dash0 H20.
111 lines
3.5 KiB
Python
111 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
"""1 Hz /metrics scraper for connector_tax microbench.
|
|
|
|
Usage:
|
|
metrics_sampler.py --url http://127.0.0.1:8000/metrics \
|
|
--output results/<run>/metrics.jsonl \
|
|
--interval 1.0
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
import urllib.request
|
|
|
|
|
|
def _split_label_pairs(labels_str: str) -> list[str]:
    """Split the text between ``{`` and ``}`` into ``key="value"`` pieces.

    Only commas that sit outside double quotes act as separators, so a label
    value such as ``"a,b"`` stays in one piece. A backslash inside quotes
    protects the following character (so ``\\"`` does not end the value).
    """
    pieces: list[str] = []
    buf: list[str] = []
    in_quotes = False
    escaped = False
    for ch in labels_str:
        if escaped:
            # Character after a backslash is taken literally.
            escaped = False
        elif ch == "\\" and in_quotes:
            escaped = True
        elif ch == '"':
            in_quotes = not in_quotes
        elif ch == "," and not in_quotes:
            pieces.append("".join(buf))
            buf = []
            continue
        buf.append(ch)
    pieces.append("".join(buf))
    return pieces


def _unescape_label_value(raw: str) -> str:
    """Remove the surrounding quotes of a label value and decode its escapes.

    Recognised escapes are ``\\\\``, ``\\"`` and ``\\n``; any other backslash
    sequence is kept verbatim. Values that are not wrapped in a matching pair
    of double quotes simply have stray leading/trailing quotes stripped.
    """
    if len(raw) >= 2 and raw[0] == '"' and raw[-1] == '"':
        body = raw[1:-1]
    else:
        body = raw.strip('"')
    if "\\" not in body:
        return body

    escapes = {"\\": "\\", '"': '"', "n": "\n"}
    decoded: list[str] = []
    chars = iter(body)
    for ch in chars:
        if ch != "\\":
            decoded.append(ch)
            continue
        nxt = next(chars, None)
        if nxt is None:
            decoded.append("\\")  # dangling backslash at end of value
        elif nxt in escapes:
            decoded.append(escapes[nxt])
        else:
            decoded.append("\\" + nxt)
    return "".join(decoded)


def parse_prom(text: str) -> dict:
    """Parse Prometheus text-format metrics.

    Args:
        text: Body of a ``/metrics`` response. Each sample line has the form
            ``name{labels} value [timestamp]`` or ``name value [timestamp]``.

    Returns:
        ``{name: [(labels, value), ...]}`` where ``labels`` is a dict of label
        name to label value (empty for unlabelled samples) and ``value`` is a
        float. Blank lines, ``#`` comment lines, and lines that cannot be
        parsed (missing ``}``, missing or non-numeric value) are skipped.
        A trailing timestamp, if present, is ignored.
    """
    out: dict[str, list[tuple[dict[str, str], float]]] = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        labels: dict[str, str] = {}
        if "{" in line:
            name, rest = line.split("{", 1)
            # The last "}" closes the label set; anything after it is the
            # value (and optional timestamp).
            labels_str, closing, val_str = rest.rpartition("}")
            if not closing:
                # Malformed sample: skip it instead of failing the scrape.
                continue
            for piece in _split_label_pairs(labels_str):
                if "=" in piece:
                    k, v = piece.split("=", 1)
                    labels[k.strip()] = _unescape_label_value(v.strip())
            fields = val_str.split()
        else:
            parts = line.split()
            if len(parts) < 2:
                continue
            name = parts[0]
            fields = parts[1:]

        try:
            val = float(fields[0])
        except (ValueError, IndexError):
            continue
        out.setdefault(name, []).append((labels, val))
    return out
|
|
|
|
|
|
# Metric-name prefixes that collapse() retains; every other metric is dropped.
KEEP_PREFIXES = (
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:time_to_first_token_seconds",
    "vllm:time_per_output_token_seconds",
    "vllm:request_prefill_time_seconds",
    "vllm:request_decode_time_seconds",
    "vllm:iteration_tokens_total",
    "vllm:e2e_request_latency_seconds",
)


def collapse(parsed: dict) -> dict:
    """Reduce parsed metrics to one number per retained metric name.

    A metric is retained when its name starts with one of ``KEEP_PREFIXES``
    and does not end in ``_bucket`` (per-bucket histogram series are dropped;
    other suffixed series under a kept prefix pass through). Label dimensions
    are discarded: the values of all label sets of a metric are summed.

    Args:
        parsed: Mapping of metric name to a list of ``(labels, value)`` pairs.

    Returns:
        ``{metric_name: summed_value}`` in the iteration order of ``parsed``.
    """
    wanted = (
        (metric, series)
        for metric, series in parsed.items()
        if metric.startswith(KEEP_PREFIXES) and not metric.endswith("_bucket")
    )
    return {
        metric: sum(value for _labels, value in series)
        for metric, series in wanted
    }
|
|
|
|
|
|
def main():
    """Poll a Prometheus ``/metrics`` endpoint and append samples as JSON lines.

    Command-line arguments:
        --url       Endpoint to scrape (required).
        --output    File to append to, one JSON object per line (required).
        --interval  Seconds to sleep between scrapes (default 1.0).
        --duration  Stop after this many seconds; 0 runs until killed.

    Each successful scrape writes the collapsed metrics plus a ``t_unix``
    wall-clock timestamp. Any failure while scraping, parsing, or writing is
    recorded as ``{"t_unix": ..., "error": ...}`` and the loop continues, so a
    temporarily unreachable server does not stop the sampler.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--url", required=True,
                    help="http://host:port/metrics")
    ap.add_argument("--output", required=True)
    ap.add_argument("--interval", type=float, default=1.0)
    ap.add_argument("--duration", type=float, default=0.0,
                    help="Stop after N seconds; 0 = run until killed")
    args = ap.parse_args()

    # Elapsed time uses the monotonic clock so a wall-clock adjustment cannot
    # shorten or extend the run; t_unix in the records stays wall-clock.
    t_start = time.monotonic()

    # Line-buffered append: each record reaches the file as soon as it is
    # written. The context manager closes the file on --duration expiry or
    # on any exception that escapes the loop (e.g. KeyboardInterrupt).
    with open(args.output, "a", buffering=1, encoding="utf-8") as out:
        while True:
            try:
                with urllib.request.urlopen(args.url, timeout=2.0) as r:
                    text = r.read().decode("utf-8")
                parsed = parse_prom(text)
                sample = collapse(parsed)
                sample["t_unix"] = time.time()
                out.write(json.dumps(sample) + "\n")
            except Exception as e:
                # Deliberate best-effort: log the failure and keep sampling.
                err = {"t_unix": time.time(), "error": str(e)}
                out.write(json.dumps(err) + "\n")

            if args.duration > 0 and time.monotonic() - t_start >= args.duration:
                break
            time.sleep(args.interval)


if __name__ == "__main__":
    main()
|