Fix replay methodology: trace-driven dispatch, no artificial limits

The replayer was artificially limiting concurrency with --max-inflight-sessions (a semaphore) and --time-scale (time compression), producing an unrealistically low load of 1 req/GPU that masked prefill-decode interference.

Replayer changes:
- Remove session_sem and time_scale entirely
- Each request is dispatched exactly at its trace timestamp
- Sessions are still sequential (turn N+1 waits for turn N to complete)
- If a turn completes late, the next turn fires immediately

Sampler changes:
- Add --sample-ratio for GPU-proportional session sampling
- Keep --target-requests for backward compatibility
- No time compression (preserve the original arrival pattern)

bench.sh: remove the --time-scale and --max-inflight-sessions args

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-23 12:43:41 +08:00
parent c8ba666517
commit 4089ffd63f
4 changed files with 84 additions and 103 deletions
--- a/replayer/replay.py
+++ b/replayer/replay.py
@@ -1,16 +1,15 @@
 """Trace replayer — send requests to vLLM following trace timing.

-Supports both vLLM's /v1/completions (OpenAI-compatible) and /generate
-(SGLang-style) endpoints. Uses hash_ids from the trace to construct
-synthetic prompts that reproduce realistic prefix-cache hit patterns.
+Uses hash_ids from the trace to construct synthetic prompts that
+reproduce realistic prefix-cache hit patterns.

 Key behaviors:
+  - Trace-driven dispatch: each request is sent at its trace timestamp.
+    No artificial concurrency limits or time compression.
  - Per-session sequencing: turns within a session are sent in order,
    each waiting for the previous to complete before dispatching.
-  - Inter-session arrival: sessions start at their trace timestamps,
-    scaled by --time-scale.
-  - Concurrency control: --max-inflight-sessions caps concurrent sessions;
-    --concurrency-limit caps total in-flight requests.
+    If a turn completes after its successor's timestamp, the successor
+    fires immediately (no waiting for a past timestamp).
 """

 from __future__ import annotations
@@ -56,9 +55,7 @@ class ReplayConfig:
    trace_path: Path
    output_path: Path
    endpoint_url: str  # comma-separated for round-robin: "http://host:8000,http://host:8001"
-    time_scale: float = 1.0
-    max_inflight_sessions: int = 32
-    concurrency_limit: int = 256
+    concurrency_limit: int = 2000
    request_timeout_s: float = 600.0
    request_limit: int | None = None
    model_name: str = "default"
@@ -214,34 +211,25 @@ async def _run_session(
    state: _SessionState,
    config: ReplayConfig,
    client: httpx.AsyncClient,
-    session_sem: asyncio.Semaphore,
    request_sem: asyncio.Semaphore,
    earliest_ts: float,
    sweep_start: float,
    sink: IncrementalMetricSink,
 ) -> list[RequestMetrics]:
-    async with session_sem:
-        # Wait until this session's start time
-        offset = (state.turns[0].timestamp_s - earliest_ts) / config.time_scale
-        wait = offset - (time.perf_counter() - sweep_start)
-        if wait > 0:
-            await asyncio.sleep(wait)
+    for req in state.turns:
+        # Wait until this request's trace timestamp
+        target_wall = (req.timestamp_s - earliest_ts)
+        elapsed = time.perf_counter() - sweep_start
+        if elapsed < target_wall:
+            await asyncio.sleep(target_wall - elapsed)

-        for req in state.turns:
-            # Intra-session: wait for turn's relative offset
-            if req != state.turns[0]:
-                target = (req.timestamp_s - state.turns[0].timestamp_s) / config.time_scale
-                elapsed = time.perf_counter() - sweep_start - offset
-                if elapsed < target:
-                    await asyncio.sleep(target - elapsed)
-
-            token_ids = _build_prompt_token_ids(req)
-            metric = await _dispatch_request(
-                client=client, config=config, req=req,
-                prompt_token_ids=token_ids, sem=request_sem,
-            )
-            state.metrics.append(metric)
-            await sink.append(metric)
+        token_ids = _build_prompt_token_ids(req)
+        metric = await _dispatch_request(
+            client=client, config=config, req=req,
+            prompt_token_ids=token_ids, sem=request_sem,
+        )
+        state.metrics.append(metric)
+        await sink.append(metric)

    return state.metrics

@@ -283,16 +271,18 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:

    sessions = sorted(by_session.items(), key=lambda kv: kv[1][0].timestamp_s)
    earliest_ts = sessions[0][1][0].timestamp_s
+    latest_ts = max(r.timestamp_s for r in requests)
+    trace_span = latest_ts - earliest_ts

-    session_sem = asyncio.Semaphore(config.max_inflight_sessions)
    request_sem = asyncio.Semaphore(config.concurrency_limit)

    sink = IncrementalMetricSink(config.output_path)

    n_sessions = len(sessions)
    n_requests = len(requests)
-    logger.info("Replaying %d sessions (%d requests), time_scale=%.1f",
-                n_sessions, n_requests, config.time_scale)
+    qps = n_requests / trace_span if trace_span > 0 else 0
+    logger.info("Replaying %d sessions (%d requests) over %.0fs (%.2f req/s)",
+                n_sessions, n_requests, trace_span, qps)

    pre_metrics = await _snapshot_prefix_cache_metrics(config.endpoint_url)
    sweep_start = time.perf_counter()
@@ -312,7 +302,7 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
                asyncio.create_task(_run_session(
                    state=_SessionState(session_id=sid, turns=turns),
                    config=config, client=client,
-                    session_sem=session_sem, request_sem=request_sem,
+                    request_sem=request_sem,
                    earliest_ts=earliest_ts, sweep_start=sweep_start,
                    sink=sink,
                ))