scripts/sample_trace_subset.py — file-order head-cut that takes the
first N sessions of a converted trace. No RNG and no hash-based
sampling — the same input yields byte-identical output (the script
reports the output's md5 so that two runs can be compared).
scripts/sweep_e1_naive_1p3d.sh — E1 of ONBOARDING_NEXT_AGENT_ZH §3.1:
mechanism=pd-disaggregation, policy=kv-aware, 1P3D, RDMA on
(mlx5_60). Defaults to outputs/inferact_50sess.jsonl so E1 and E2
can share the exact same subset; override via TRACE= env var to run
on the full 20,230-request trace.
Reproducing the subset:
uv run --no-sync python scripts/sample_trace_subset.py \
    --input outputs/inferact_codex_swebenchpro.jsonl \
    --output outputs/inferact_50sess.jsonl \
    --sessions 50
# expected output_md5: 7bb263a32600ef5a6ef5099ba340a487
# 1285 requests, mean input_length 67631 tokens
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
"""Deterministically slice the first N sessions of an agentic-pd-hybrid trace.
|
|
|
|
Method: scan in file order, count records whose `parent_chat_id == -1` (= a
|
|
session's turn 0), and write every record until the (N+1)-th such record is
|
|
seen. No RNG, no hashing — re-running on the same input produces a byte-
|
|
identical output. Used to derive matched subsets for paired sweeps (E1 vs E2)
|
|
without spending GPU hours on the full trace.
|
|
|
|
Usage:
|
|
uv run --no-sync python scripts/sample_trace_subset.py \
|
|
--input outputs/inferact_codex_swebenchpro.jsonl \
|
|
--output outputs/inferact_50sess.jsonl \
|
|
--sessions 50
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def slice_first_n_sessions(input_path: Path, output_path: Path, n_sessions: int) -> dict:
    """Copy the leading ``n_sessions`` sessions of a JSONL trace to a new file.

    Records are read in file order. A record whose ``parent_chat_id`` equals
    ``-1`` is counted as the start of a session; copying stops just before the
    record that would start session ``n_sessions + 1``. Selected lines are
    written through verbatim (no re-serialisation and no newline translation).
    Whitespace-only lines are skipped rather than parsed.

    Args:
        input_path: JSONL trace to read. Every record must provide the keys
            ``parent_chat_id``, ``input_length`` and ``output_length``.
        output_path: Destination file; it is created or truncated.
        n_sessions: Number of sessions to keep.

    Returns:
        A dict with the keys ``sessions``, ``requests``,
        ``input_length_mean``, ``input_length_min``, ``input_length_max``,
        ``output_length_mean`` and ``output_md5`` (hex md5 of the bytes of
        the written file). Means and the minimum are 0 when nothing was
        written.

    Raises:
        ValueError: If ``input_path`` and ``output_path`` resolve to the
            same file.
    """
    # Opening the output in "w" mode truncates it immediately; if it is also
    # the input, the trace would be wiped before a single line is read.
    if input_path.resolve() == output_path.resolve():
        raise ValueError(
            f"input and output must be different files: {input_path}"
        )

    sessions_seen = 0
    requests_written = 0
    input_length_sum = 0
    output_length_sum = 0
    min_in = None
    max_in = 0

    # newline="" switches off newline translation in both directions, so each
    # copied line keeps its original terminator on every platform.
    with input_path.open("r", encoding="utf-8", newline="") as f_in, output_path.open(
        "w", encoding="utf-8", newline=""
    ) as f_out:
        for line in f_in:
            # A blank line (commonly a trailing one) is not a record.
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec["parent_chat_id"] == -1:
                sessions_seen += 1
                if sessions_seen > n_sessions:
                    # This record opens the first session we do not want.
                    break
            f_out.write(line)
            requests_written += 1
            il = int(rec["input_length"])
            input_length_sum += il
            output_length_sum += int(rec["output_length"])
            min_in = il if min_in is None else min(min_in, il)
            max_in = max(max_in, il)

    # Hash the file as it exists on disk, in bounded chunks, so a large
    # subset is never held in memory all at once.
    digest = hashlib.md5()
    with output_path.open("rb") as f_hash:
        for chunk in iter(lambda: f_hash.read(1 << 20), b""):
            digest.update(chunk)

    return {
        "sessions": min(sessions_seen, n_sessions),
        "requests": requests_written,
        "input_length_mean": input_length_sum / max(1, requests_written),
        "input_length_min": 0 if min_in is None else min_in,
        "input_length_max": max_in,
        "output_length_mean": output_length_sum / max(1, requests_written),
        "output_md5": digest.hexdigest(),
    }
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse flags, slice the trace, report stats.

    Ensures the parent directory of ``--output`` exists, runs
    ``slice_first_n_sessions`` with the parsed arguments, and prints the
    returned statistics to stderr as indented JSON.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # Flag specifications, registered in the same order as they are listed.
    flag_specs = (
        (
            "--input",
            {
                "type": Path,
                "default": Path("outputs/inferact_codex_swebenchpro.jsonl"),
            },
        ),
        ("--output", {"type": Path, "required": True}),
        ("--sessions", {"type": int, "default": 50}),
    )
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    # The destination directory may not exist yet on a fresh checkout.
    destination_dir = opts.output.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    summary = slice_first_n_sessions(opts.input, opts.output, opts.sessions)
    rendered = json.dumps(summary, indent=2)
    print(rendered, file=sys.stderr)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger argument parsing.
if __name__ == "__main__":
    main()
|