Use time-based trace window ids

This commit is contained in:
2026-04-04 22:09:43 +08:00
parent 4e1401f50c
commit 7b7eaafd78
4 changed files with 32 additions and 39 deletions

View File

@@ -18,30 +18,23 @@ DEFAULT_THINKING_SOURCE = Path(
DEFAULT_OUTPUT_ROOT = REPO_ROOT / "trace_windows"
LEGACY_TARGET_DATES = ["2026-03-11", "2026-03-12", "2026-03-13", "2026-03-16", "2026-03-17"]
THINKING_WINDOWS = [
("2026-03-21", "peak_weekend"),
("2026-03-22", "peak_weekend"),
("2026-03-23", "peak"),
("2026-03-24", "peak"),
("2026-03-25", "peak"),
("2026-03-26", "peak"),
("2026-03-27", "peak"),
("2026-03-21", "1000"),
("2026-03-22", "1000"),
("2026-03-23", "1000"),
("2026-03-24", "1000"),
("2026-03-25", "1000"),
("2026-03-26", "1000"),
("2026-03-27", "1000"),
]
WINDOW_SPECS = {
"peak": {
"1000": {
"start_hour": 9,
"window_start": 3600.0,
"window_end": 4200.0,
"slot_label": "10:00-10:10",
"slot_token": "1000",
},
"peak_weekend": {
"start_hour": 9,
"window_start": 3600.0,
"window_end": 4200.0,
"slot_label": "10:00-10:10",
"slot_token": "1000",
},
"valley": {
"2200": {
"start_hour": 21,
"window_start": 3600.0,
"window_end": 4200.0,
@@ -54,18 +47,18 @@ TRACE_SPECS = {
"block_size": 64,
"source_key": "legacy",
"target_dates": LEGACY_TARGET_DATES,
"day_parts": ("peak", "valley"),
"slot_tokens": ("1000", "2200"),
},
"coder": {
"block_size": 512,
"source_key": "legacy",
"target_dates": LEGACY_TARGET_DATES,
"day_parts": ("peak", "valley"),
"slot_tokens": ("1000", "2200"),
},
"thinking": {
"block_size": 64,
"source_key": "thinking",
"date_day_parts": THINKING_WINDOWS,
"date_slot_tokens": THINKING_WINDOWS,
},
}
@@ -109,9 +102,9 @@ def stable_uniform(*, seed: int, window_id: str, index: int, row: dict[str, Any]
return int.from_bytes(digest, byteorder="big", signed=False) / float(1 << 64)
def build_source_filename(*, trace_type: str, date_text: str, day_part: str) -> str:
def build_source_filename(*, trace_type: str, date_text: str, slot_token: str) -> str:
month, day = date_text[5:7], date_text[8:10]
start_hour = int(WINDOW_SPECS[day_part]["start_hour"])
start_hour = int(WINDOW_SPECS[slot_token]["start_hour"])
return (
f"qwen_{trace_type}_blksz_{TRACE_SPECS[trace_type]['block_size']}_"
f"{month}{day}{start_hour:02d}-{month}{day}{start_hour + 2:02d}.jsonl"
@@ -135,23 +128,23 @@ def build_windows(
for trace_type in workloads:
spec = TRACE_SPECS[trace_type]
source_dir = source_dirs[str(spec["source_key"])]
if "date_day_parts" in spec:
date_day_parts = list(spec["date_day_parts"])
if "date_slot_tokens" in spec:
date_slot_tokens = list(spec["date_slot_tokens"])
else:
date_day_parts = [
(date_text, day_part)
for day_part in spec["day_parts"]
date_slot_tokens = [
(date_text, slot_token)
for slot_token in spec["slot_tokens"]
for date_text in spec["target_dates"]
]
sample_order_by_group: dict[str, int] = {}
for date_text, day_part in date_day_parts:
window_spec = WINDOW_SPECS[day_part]
for date_text, slot_token in date_slot_tokens:
window_spec = WINDOW_SPECS[slot_token]
date_token = date_text.replace("-", "")
sample_order = sample_order_by_group.setdefault(day_part, 0)
sample_order_by_group[day_part] += 1
window_id = f"{trace_type}_w{date_token}_{day_part}_{window_spec['slot_token']}"
sample_order = sample_order_by_group.setdefault(slot_token, 0)
sample_order_by_group[slot_token] += 1
window_id = f"{trace_type}_w{date_token}_{window_spec['slot_token']}"
source_trace_path = source_dir / build_source_filename(
trace_type=trace_type, date_text=date_text, day_part=day_part
trace_type=trace_type, date_text=date_text, slot_token=slot_token
)
source_prompt_path = Path(str(source_trace_path).replace(".jsonl", "_prompt.jsonl"))
windows.append(
@@ -164,7 +157,7 @@ def build_windows(
"window_index": 6,
"sample_order": sample_order,
"date": date_text,
"day_part": day_part,
"slot_token": window_spec["slot_token"],
"slot_label": window_spec["slot_label"],
"source_trace_path": str(source_trace_path),
"source_prompt_path": str(source_prompt_path),