Use time-based trace window ids

This commit is contained in:
2026-04-04 22:09:43 +08:00
parent 4e1401f50c
commit 7b7eaafd78
4 changed files with 32 additions and 39 deletions

View File

@@ -47,7 +47,7 @@
}, },
"trace": { "trace": {
"windows_path": "trace_windows/windows.json", "windows_path": "trace_windows/windows.json",
"window_id": "chat_w_example_peak_0001", "window_id": "chat_w_example_0001",
"u_field": "sampling_u", "u_field": "sampling_u",
"timestamp_field": "timestamp", "timestamp_field": "timestamp",
"max_concurrency": 64 "max_concurrency": 64

View File

@@ -4,9 +4,9 @@
"window_duration_seconds": 10.0, "window_duration_seconds": 10.0,
"windows": [ "windows": [
{ {
"window_id": "chat_w_example_peak_0001", "window_id": "chat_w_example_0001",
"trace_type": "chat", "trace_type": "chat",
"trace_file": "traces/chat_w_example_peak_0001.jsonl", "trace_file": "traces/chat_w_example_0001.jsonl",
"window_start": 0.0, "window_start": 0.0,
"window_end": 10.0, "window_end": 10.0,
"num_requests": 3 "num_requests": 3

View File

@@ -18,30 +18,23 @@ DEFAULT_THINKING_SOURCE = Path(
DEFAULT_OUTPUT_ROOT = REPO_ROOT / "trace_windows" DEFAULT_OUTPUT_ROOT = REPO_ROOT / "trace_windows"
LEGACY_TARGET_DATES = ["2026-03-11", "2026-03-12", "2026-03-13", "2026-03-16", "2026-03-17"] LEGACY_TARGET_DATES = ["2026-03-11", "2026-03-12", "2026-03-13", "2026-03-16", "2026-03-17"]
THINKING_WINDOWS = [ THINKING_WINDOWS = [
("2026-03-21", "peak_weekend"), ("2026-03-21", "1000"),
("2026-03-22", "peak_weekend"), ("2026-03-22", "1000"),
("2026-03-23", "peak"), ("2026-03-23", "1000"),
("2026-03-24", "peak"), ("2026-03-24", "1000"),
("2026-03-25", "peak"), ("2026-03-25", "1000"),
("2026-03-26", "peak"), ("2026-03-26", "1000"),
("2026-03-27", "peak"), ("2026-03-27", "1000"),
] ]
WINDOW_SPECS = { WINDOW_SPECS = {
"peak": { "1000": {
"start_hour": 9, "start_hour": 9,
"window_start": 3600.0, "window_start": 3600.0,
"window_end": 4200.0, "window_end": 4200.0,
"slot_label": "10:00-10:10", "slot_label": "10:00-10:10",
"slot_token": "1000", "slot_token": "1000",
}, },
"peak_weekend": { "2200": {
"start_hour": 9,
"window_start": 3600.0,
"window_end": 4200.0,
"slot_label": "10:00-10:10",
"slot_token": "1000",
},
"valley": {
"start_hour": 21, "start_hour": 21,
"window_start": 3600.0, "window_start": 3600.0,
"window_end": 4200.0, "window_end": 4200.0,
@@ -54,18 +47,18 @@ TRACE_SPECS = {
"block_size": 64, "block_size": 64,
"source_key": "legacy", "source_key": "legacy",
"target_dates": LEGACY_TARGET_DATES, "target_dates": LEGACY_TARGET_DATES,
"day_parts": ("peak", "valley"), "slot_tokens": ("1000", "2200"),
}, },
"coder": { "coder": {
"block_size": 512, "block_size": 512,
"source_key": "legacy", "source_key": "legacy",
"target_dates": LEGACY_TARGET_DATES, "target_dates": LEGACY_TARGET_DATES,
"day_parts": ("peak", "valley"), "slot_tokens": ("1000", "2200"),
}, },
"thinking": { "thinking": {
"block_size": 64, "block_size": 64,
"source_key": "thinking", "source_key": "thinking",
"date_day_parts": THINKING_WINDOWS, "date_slot_tokens": THINKING_WINDOWS,
}, },
} }
@@ -109,9 +102,9 @@ def stable_uniform(*, seed: int, window_id: str, index: int, row: dict[str, Any]
return int.from_bytes(digest, byteorder="big", signed=False) / float(1 << 64) return int.from_bytes(digest, byteorder="big", signed=False) / float(1 << 64)
def build_source_filename(*, trace_type: str, date_text: str, day_part: str) -> str: def build_source_filename(*, trace_type: str, date_text: str, slot_token: str) -> str:
month, day = date_text[5:7], date_text[8:10] month, day = date_text[5:7], date_text[8:10]
start_hour = int(WINDOW_SPECS[day_part]["start_hour"]) start_hour = int(WINDOW_SPECS[slot_token]["start_hour"])
return ( return (
f"qwen_{trace_type}_blksz_{TRACE_SPECS[trace_type]['block_size']}_" f"qwen_{trace_type}_blksz_{TRACE_SPECS[trace_type]['block_size']}_"
f"{month}{day}{start_hour:02d}-{month}{day}{start_hour + 2:02d}.jsonl" f"{month}{day}{start_hour:02d}-{month}{day}{start_hour + 2:02d}.jsonl"
@@ -135,23 +128,23 @@ def build_windows(
for trace_type in workloads: for trace_type in workloads:
spec = TRACE_SPECS[trace_type] spec = TRACE_SPECS[trace_type]
source_dir = source_dirs[str(spec["source_key"])] source_dir = source_dirs[str(spec["source_key"])]
if "date_day_parts" in spec: if "date_slot_tokens" in spec:
date_day_parts = list(spec["date_day_parts"]) date_slot_tokens = list(spec["date_slot_tokens"])
else: else:
date_day_parts = [ date_slot_tokens = [
(date_text, day_part) (date_text, slot_token)
for day_part in spec["day_parts"] for slot_token in spec["slot_tokens"]
for date_text in spec["target_dates"] for date_text in spec["target_dates"]
] ]
sample_order_by_group: dict[str, int] = {} sample_order_by_group: dict[str, int] = {}
for date_text, day_part in date_day_parts: for date_text, slot_token in date_slot_tokens:
window_spec = WINDOW_SPECS[day_part] window_spec = WINDOW_SPECS[slot_token]
date_token = date_text.replace("-", "") date_token = date_text.replace("-", "")
sample_order = sample_order_by_group.setdefault(day_part, 0) sample_order = sample_order_by_group.setdefault(slot_token, 0)
sample_order_by_group[day_part] += 1 sample_order_by_group[slot_token] += 1
window_id = f"{trace_type}_w{date_token}_{day_part}_{window_spec['slot_token']}" window_id = f"{trace_type}_w{date_token}_{window_spec['slot_token']}"
source_trace_path = source_dir / build_source_filename( source_trace_path = source_dir / build_source_filename(
trace_type=trace_type, date_text=date_text, day_part=day_part trace_type=trace_type, date_text=date_text, slot_token=slot_token
) )
source_prompt_path = Path(str(source_trace_path).replace(".jsonl", "_prompt.jsonl")) source_prompt_path = Path(str(source_trace_path).replace(".jsonl", "_prompt.jsonl"))
windows.append( windows.append(
@@ -164,7 +157,7 @@ def build_windows(
"window_index": 6, "window_index": 6,
"sample_order": sample_order, "sample_order": sample_order,
"date": date_text, "date": date_text,
"day_part": day_part, "slot_token": window_spec["slot_token"],
"slot_label": window_spec["slot_label"], "slot_label": window_spec["slot_label"],
"source_trace_path": str(source_trace_path), "source_trace_path": str(source_trace_path),
"source_prompt_path": str(source_prompt_path), "source_prompt_path": str(source_prompt_path),

View File

@@ -344,10 +344,10 @@ class CoreFlowTests(unittest.TestCase):
windows_payload = json.loads((output_root / "windows.json").read_text(encoding="utf-8")) windows_payload = json.loads((output_root / "windows.json").read_text(encoding="utf-8"))
windows = {item["window_id"]: item for item in windows_payload["windows"]} windows = {item["window_id"]: item for item in windows_payload["windows"]}
self.assertIn("chat_w20260311_peak_1000", windows) self.assertIn("chat_w20260311_1000", windows)
self.assertEqual(windows["chat_w20260311_peak_1000"]["num_requests"], 1) self.assertEqual(windows["chat_w20260311_1000"]["num_requests"], 1)
trace_path = output_root / windows["chat_w20260311_peak_1000"]["trace_file"] trace_path = output_root / windows["chat_w20260311_1000"]["trace_file"]
rows = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()] rows = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()]
self.assertEqual(len(rows), 1) self.assertEqual(len(rows), 1)
self.assertEqual(rows[0]["prompt"], "real prompt") self.assertEqual(rows[0]["prompt"], "real prompt")