Fix replay methodology: trace-driven dispatch, no artificial limits
The replayer was artificially limiting concurrency with --max-inflight-sessions (a semaphore) and --time-scale (time compression), producing an unrealistically low load of 1 req/GPU that masked prefill-decode interference.

Replayer changes:
- Remove session_sem and time_scale entirely
- Each request is dispatched exactly at its trace timestamp
- Sessions are still sequential (turn N+1 waits for turn N to complete)
- If a turn completes late, the next turn fires immediately

Sampler changes:
- Add --sample-ratio for GPU-proportional session sampling
- Keep --target-requests for backwards compatibility
- No time compression (preserve the original arrival pattern)

bench.sh: remove the --time-scale and --max-inflight-sessions args

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,16 +1,15 @@
|
||||
"""Trace replayer — send requests to vLLM following trace timing.
|
||||
|
||||
Supports both vLLM's /v1/completions (OpenAI-compatible) and /generate
|
||||
(SGLang-style) endpoints. Uses hash_ids from the trace to construct
|
||||
synthetic prompts that reproduce realistic prefix-cache hit patterns.
|
||||
Uses hash_ids from the trace to construct synthetic prompts that
|
||||
reproduce realistic prefix-cache hit patterns.
|
||||
|
||||
Key behaviors:
|
||||
- Trace-driven dispatch: each request is sent at its trace timestamp.
|
||||
No artificial concurrency limits or time compression.
|
||||
- Per-session sequencing: turns within a session are sent in order,
|
||||
each waiting for the previous to complete before dispatching.
|
||||
- Inter-session arrival: sessions start at their trace timestamps,
|
||||
scaled by --time-scale.
|
||||
- Concurrency control: --max-inflight-sessions caps concurrent sessions;
|
||||
--concurrency-limit caps total in-flight requests.
|
||||
If a turn completes after its successor's timestamp, the successor
|
||||
fires immediately (no waiting for a past timestamp).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -56,9 +55,7 @@ class ReplayConfig:
|
||||
trace_path: Path
|
||||
output_path: Path
|
||||
endpoint_url: str # comma-separated for round-robin: "http://host:8000,http://host:8001"
|
||||
time_scale: float = 1.0
|
||||
max_inflight_sessions: int = 32
|
||||
concurrency_limit: int = 256
|
||||
concurrency_limit: int = 2000
|
||||
request_timeout_s: float = 600.0
|
||||
request_limit: int | None = None
|
||||
model_name: str = "default"
|
||||
@@ -214,34 +211,25 @@ async def _run_session(
|
||||
state: _SessionState,
|
||||
config: ReplayConfig,
|
||||
client: httpx.AsyncClient,
|
||||
session_sem: asyncio.Semaphore,
|
||||
request_sem: asyncio.Semaphore,
|
||||
earliest_ts: float,
|
||||
sweep_start: float,
|
||||
sink: IncrementalMetricSink,
|
||||
) -> list[RequestMetrics]:
|
||||
async with session_sem:
|
||||
# Wait until this session's start time
|
||||
offset = (state.turns[0].timestamp_s - earliest_ts) / config.time_scale
|
||||
wait = offset - (time.perf_counter() - sweep_start)
|
||||
if wait > 0:
|
||||
await asyncio.sleep(wait)
|
||||
for req in state.turns:
|
||||
# Wait until this request's trace timestamp
|
||||
target_wall = (req.timestamp_s - earliest_ts)
|
||||
elapsed = time.perf_counter() - sweep_start
|
||||
if elapsed < target_wall:
|
||||
await asyncio.sleep(target_wall - elapsed)
|
||||
|
||||
for req in state.turns:
|
||||
# Intra-session: wait for turn's relative offset
|
||||
if req != state.turns[0]:
|
||||
target = (req.timestamp_s - state.turns[0].timestamp_s) / config.time_scale
|
||||
elapsed = time.perf_counter() - sweep_start - offset
|
||||
if elapsed < target:
|
||||
await asyncio.sleep(target - elapsed)
|
||||
|
||||
token_ids = _build_prompt_token_ids(req)
|
||||
metric = await _dispatch_request(
|
||||
client=client, config=config, req=req,
|
||||
prompt_token_ids=token_ids, sem=request_sem,
|
||||
)
|
||||
state.metrics.append(metric)
|
||||
await sink.append(metric)
|
||||
token_ids = _build_prompt_token_ids(req)
|
||||
metric = await _dispatch_request(
|
||||
client=client, config=config, req=req,
|
||||
prompt_token_ids=token_ids, sem=request_sem,
|
||||
)
|
||||
state.metrics.append(metric)
|
||||
await sink.append(metric)
|
||||
|
||||
return state.metrics
|
||||
|
||||
@@ -283,16 +271,18 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
|
||||
|
||||
sessions = sorted(by_session.items(), key=lambda kv: kv[1][0].timestamp_s)
|
||||
earliest_ts = sessions[0][1][0].timestamp_s
|
||||
latest_ts = max(r.timestamp_s for r in requests)
|
||||
trace_span = latest_ts - earliest_ts
|
||||
|
||||
session_sem = asyncio.Semaphore(config.max_inflight_sessions)
|
||||
request_sem = asyncio.Semaphore(config.concurrency_limit)
|
||||
|
||||
sink = IncrementalMetricSink(config.output_path)
|
||||
|
||||
n_sessions = len(sessions)
|
||||
n_requests = len(requests)
|
||||
logger.info("Replaying %d sessions (%d requests), time_scale=%.1f",
|
||||
n_sessions, n_requests, config.time_scale)
|
||||
qps = n_requests / trace_span if trace_span > 0 else 0
|
||||
logger.info("Replaying %d sessions (%d requests) over %.0fs (%.2f req/s)",
|
||||
n_sessions, n_requests, trace_span, qps)
|
||||
|
||||
pre_metrics = await _snapshot_prefix_cache_metrics(config.endpoint_url)
|
||||
sweep_start = time.perf_counter()
|
||||
@@ -312,7 +302,7 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
|
||||
asyncio.create_task(_run_session(
|
||||
state=_SessionState(session_id=sid, turns=turns),
|
||||
config=config, client=client,
|
||||
session_sem=session_sem, request_sem=request_sem,
|
||||
request_sem=request_sem,
|
||||
earliest_ts=earliest_ts, sweep_start=sweep_start,
|
||||
sink=sink,
|
||||
))
|
||||
|
||||
Reference in New Issue
Block a user