Files
agentic-kvc/microbench/connector_tax/metrics_sampler.py
Gahow Wang 297fed6e73 Microbench 3 (connector_tax): infrastructure for KV connector substrate tax
Validates the elastic_migration_v2 finding that kv_role=kv_both adds
TTFT p90 +45% even when PD-sep never fires. Replicates under
single-instance, synthetic, open-loop workload to disambiguate
mechanism cost from 8-instance feedback amplification.

Configurations (8):
  plain, noop_connector, mooncake_{producer,consumer,both},
  nixl_both, lmcache_only, multi_mooncake_lmcache.

Pre-flight verification gates risky configs (kv_consumer needs dummy
bootstrap, multi-connector composition, NoOp custom class loading).

Workload: two-phase sweep
  Phase A: rate {0.5..32} req/s × shape (4096, 256), saturation criteria
  Phase B: ref_safe rate × cartesian (input ∈ {512,4k,32k}, output ∈ {64,256,1024})

Step-timing patch enriches vLLM's existing AGENTIC_STEP_LOG_PATH emit
with step_duration_us and build_meta_us — directly measures per-step
substrate cost, not just user-visible TTFT/TPOT.

run_all.sh runs as a 5-stage barrier:
  0 pre-flight + apply patch
  1 Phase A all configs
  2 pick ref_safe / ref_load
  3 Phase B all configs
  4 revert patch + analyze + plot

Outputs aggregate.{json,csv}, MANIFEST.tsv, and 5 figures.
Estimated runtime: 4-5.5 hours on idle dash0 H20.
2026-05-26 17:27:41 +08:00

111 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""1 Hz /metrics scraper for connector_tax microbench.
Usage:
metrics_sampler.py --url http://127.0.0.1:8000/metrics \
--output results/<run>/metrics.jsonl \
--interval 1.0
"""
import argparse
import json
import time
import urllib.request
# Escape sequences defined for label values in the Prometheus text format.
_LABEL_ESCAPES = {"\\": "\\", '"': '"', "n": "\n"}


def _parse_labels(labels_str: str) -> dict:
    """Parse the text between '{' and '}' of a sample line into {key: value}.

    Values are scanned quote-aware, so commas and '=' inside a quoted value do
    not split it, and the escapes \\\\, \\" and \\n are decoded. An unquoted
    (non-standard) value is read up to the next comma. Pairs with an empty key
    are dropped.
    """
    labels = {}
    pos, end = 0, len(labels_str)
    while pos < end:
        eq = labels_str.find("=", pos)
        if eq < 0:
            break  # trailing comma or junk with no further key=value pair
        # The separator comma left over from the previous pair is stripped here.
        key = labels_str[pos:eq].strip(" \t,")
        pos = eq + 1
        while pos < end and labels_str[pos] in " \t":
            pos += 1
        if pos < end and labels_str[pos] == '"':
            pos += 1
            chars = []
            while pos < end and labels_str[pos] != '"':
                ch = labels_str[pos]
                if ch == "\\" and pos + 1 < end:
                    pos += 1
                    escaped = labels_str[pos]
                    # Unknown escapes are kept verbatim rather than dropped.
                    ch = _LABEL_ESCAPES.get(escaped, "\\" + escaped)
                chars.append(ch)
                pos += 1
            pos += 1  # step over the closing quote
            value = "".join(chars)
        else:
            comma = labels_str.find(",", pos)
            if comma < 0:
                comma = end
            value = labels_str[pos:comma].strip()
            pos = comma
        if key:
            labels[key] = value
    return labels


def parse_prom(text: str) -> dict:
    """Parse Prometheus text-format metrics.

    Args:
        text: Body of a /metrics response.

    Returns:
        {name: [(labels, value), ...]} where labels is a {str: str} dict (empty
        for samples without labels) and value is a float. Samples appear in the
        order they occur in the input.

    Blank lines, '#' comment lines, and lines that cannot be parsed (missing
    value, non-numeric value, unbalanced braces) are skipped rather than
    raising, so one bad line does not discard the whole scrape.
    """
    out: dict[str, list[tuple[dict[str, str], float]]] = {}
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        # Sample layout: name{labels} value [timestamp]
        if "{" in line:
            name, _, rest = line.partition("{")
            # Split on the LAST '}' so a '}' inside a label value is harmless.
            labels_str, brace, val_str = rest.rpartition("}")
            if not brace:
                continue  # '{' without a closing '}': malformed, skip
            labels = _parse_labels(labels_str)
        else:
            parts = line.split(None, 1)
            if len(parts) < 2:
                continue
            name, val_str = parts
            labels = {}
        tokens = val_str.split()
        if not tokens:
            continue
        try:
            # Only the first token is the value; an optional timestamp follows.
            val = float(tokens[0])
        except ValueError:
            continue
        out.setdefault(name.strip(), []).append((labels, val))
    return out
# Metric-name prefixes retained by collapse(); every other metric is dropped.
KEEP_PREFIXES = (
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:time_to_first_token_seconds",
    "vllm:time_per_output_token_seconds",
    "vllm:request_prefill_time_seconds",
    "vllm:request_decode_time_seconds",
    "vllm:iteration_tokens_total",
    "vllm:e2e_request_latency_seconds",
)


def collapse(parsed: dict) -> dict:
    """Reduce parsed metrics to one number per retained metric name.

    A metric is retained when its name starts with one of KEEP_PREFIXES and
    does not end in '_bucket' (per-bucket histogram series are dropped; other
    suffixed series such as '_count' and '_sum' pass through under their own
    names). Label dimensions are discarded: all samples of a retained metric
    are summed into a single value.

    Args:
        parsed: {name: [(labels, value), ...]} as produced by parse_prom.

    Returns:
        {name: summed_value} in the iteration order of ``parsed``.
    """
    return {
        metric: sum(value for _labels, value in series)
        for metric, series in parsed.items()
        # str.startswith accepts the whole tuple of prefixes in one call.
        if metric.startswith(KEEP_PREFIXES) and not metric.endswith("_bucket")
    }
def main():
    """Scrape a /metrics endpoint periodically and append samples as JSONL.

    Command-line arguments:
        --url       Metrics endpoint to fetch (required).
        --output    JSONL file to append to (required).
        --interval  Target seconds between the starts of scrapes (default 1.0).
        --duration  Stop after this many seconds; 0 means run until killed.

    Each iteration writes exactly one JSON line: either the collapsed metrics
    plus "t_unix", or {"t_unix", "error"} if the scrape or parse failed. A
    failed scrape never terminates the loop.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--url", required=True,
                    help="http://host:port/metrics")
    ap.add_argument("--output", required=True)
    ap.add_argument("--interval", type=float, default=1.0)
    ap.add_argument("--duration", type=float, default=0.0,
                    help="Stop after N seconds; 0 = run until killed")
    args = ap.parse_args()

    # Elapsed time uses the monotonic clock so wall-clock adjustments cannot
    # shorten or extend the run; "t_unix" in the samples stays wall-clock.
    t_start = time.monotonic()
    # Context manager closes the file on every exit path (incl. Ctrl-C);
    # buffering=1 keeps the file line-buffered so each sample lands promptly.
    with open(args.output, "a", buffering=1, encoding="utf-8") as out:
        while True:
            tick = time.monotonic()
            try:
                with urllib.request.urlopen(args.url, timeout=2.0) as r:
                    text = r.read().decode("utf-8")
                sample = collapse(parse_prom(text))
                sample["t_unix"] = time.time()
                out.write(json.dumps(sample) + "\n")
            except Exception as e:
                # Best-effort sampler: record the failure and keep going.
                err = {"t_unix": time.time(), "error": str(e)}
                out.write(json.dumps(err) + "\n")
            now = time.monotonic()
            if args.duration > 0 and now - t_start >= args.duration:
                break
            # Subtract the time the scrape took so the period stays close to
            # --interval instead of interval + scrape latency.
            time.sleep(max(0.0, args.interval - (now - tick)))


if __name__ == "__main__":
    main()