Fix replay methodology: trace-driven dispatch, no artificial limits

The replayer was artificially limiting concurrency with --max-inflight-sessions
(a semaphore) and --time-scale (time compression), producing an unrealistically
low load of 1 req/GPU that masked prefill-decode interference.

Replayer changes:
- Remove session_sem and time_scale entirely (the request-level semaphore
  stays; the concurrency_limit default is raised from 256 to 2000)
- Each request dispatched at its trace timestamp exactly
- Sessions still sequential (turn N+1 waits for turn N completion)
- If a turn completes after the next turn's trace timestamp, the next turn
  fires immediately

Sampler changes:
- Add --sample-ratio for GPU-proportional session sampling
- Keep --target-requests for backwards compat
- No time compression (preserve original arrival pattern)

bench.sh: remove --time-scale and --max-inflight-sessions args

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 12:43:41 +08:00
parent c8ba666517
commit 4089ffd63f
4 changed files with 84 additions and 103 deletions

View File

@@ -1,16 +1,15 @@
"""Trace replayer — send requests to vLLM following trace timing.
Supports both vLLM's /v1/completions (OpenAI-compatible) and /generate
(SGLang-style) endpoints. Uses hash_ids from the trace to construct
synthetic prompts that reproduce realistic prefix-cache hit patterns.
Uses hash_ids from the trace to construct synthetic prompts that
reproduce realistic prefix-cache hit patterns.
Key behaviors:
- Trace-driven dispatch: each request is sent at its trace timestamp.
No artificial concurrency limits or time compression.
- Per-session sequencing: turns within a session are sent in order,
each waiting for the previous to complete before dispatching.
- Inter-session arrival: sessions start at their trace timestamps,
scaled by --time-scale.
- Concurrency control: --max-inflight-sessions caps concurrent sessions;
--concurrency-limit caps total in-flight requests.
If a turn completes after its successor's timestamp, the successor
fires immediately (no waiting for a past timestamp).
"""
from __future__ import annotations
@@ -56,9 +55,7 @@ class ReplayConfig:
trace_path: Path
output_path: Path
endpoint_url: str # comma-separated for round-robin: "http://host:8000,http://host:8001"
time_scale: float = 1.0
max_inflight_sessions: int = 32
concurrency_limit: int = 256
concurrency_limit: int = 2000
request_timeout_s: float = 600.0
request_limit: int | None = None
model_name: str = "default"
@@ -214,34 +211,25 @@ async def _run_session(
state: _SessionState,
config: ReplayConfig,
client: httpx.AsyncClient,
session_sem: asyncio.Semaphore,
request_sem: asyncio.Semaphore,
earliest_ts: float,
sweep_start: float,
sink: IncrementalMetricSink,
) -> list[RequestMetrics]:
async with session_sem:
# Wait until this session's start time
offset = (state.turns[0].timestamp_s - earliest_ts) / config.time_scale
wait = offset - (time.perf_counter() - sweep_start)
if wait > 0:
await asyncio.sleep(wait)
for req in state.turns:
# Wait until this request's trace timestamp
target_wall = (req.timestamp_s - earliest_ts)
elapsed = time.perf_counter() - sweep_start
if elapsed < target_wall:
await asyncio.sleep(target_wall - elapsed)
for req in state.turns:
# Intra-session: wait for turn's relative offset
if req != state.turns[0]:
target = (req.timestamp_s - state.turns[0].timestamp_s) / config.time_scale
elapsed = time.perf_counter() - sweep_start - offset
if elapsed < target:
await asyncio.sleep(target - elapsed)
token_ids = _build_prompt_token_ids(req)
metric = await _dispatch_request(
client=client, config=config, req=req,
prompt_token_ids=token_ids, sem=request_sem,
)
state.metrics.append(metric)
await sink.append(metric)
token_ids = _build_prompt_token_ids(req)
metric = await _dispatch_request(
client=client, config=config, req=req,
prompt_token_ids=token_ids, sem=request_sem,
)
state.metrics.append(metric)
await sink.append(metric)
return state.metrics
@@ -283,16 +271,18 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
sessions = sorted(by_session.items(), key=lambda kv: kv[1][0].timestamp_s)
earliest_ts = sessions[0][1][0].timestamp_s
latest_ts = max(r.timestamp_s for r in requests)
trace_span = latest_ts - earliest_ts
session_sem = asyncio.Semaphore(config.max_inflight_sessions)
request_sem = asyncio.Semaphore(config.concurrency_limit)
sink = IncrementalMetricSink(config.output_path)
n_sessions = len(sessions)
n_requests = len(requests)
logger.info("Replaying %d sessions (%d requests), time_scale=%.1f",
n_sessions, n_requests, config.time_scale)
qps = n_requests / trace_span if trace_span > 0 else 0
logger.info("Replaying %d sessions (%d requests) over %.0fs (%.2f req/s)",
n_sessions, n_requests, trace_span, qps)
pre_metrics = await _snapshot_prefix_cache_metrics(config.endpoint_url)
sweep_start = time.perf_counter()
@@ -312,7 +302,7 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
asyncio.create_task(_run_session(
state=_SessionState(session_id=sid, turns=turns),
config=config, client=client,
session_sem=session_sem, request_sem=request_sem,
request_sem=request_sem,
earliest_ts=earliest_ts, sweep_start=sweep_start,
sink=sink,
))