# agentic-pd-hybrid/scripts/convert_audit_to_trace.py

#!/usr/bin/env python3
"""Convert sibench audit.jsonl to agentic-pd-hybrid trace format.

Source format (sibench audit.jsonl):
  {"instance_id": "...", "ts": float, "messages": [...],
   "audit": {"prompt_tokens": int, "completion_tokens": int, ...}}

Target format (agentic-pd-hybrid trace JSONL):
  {"chat_id": int, "parent_chat_id": int, "timestamp": float,
   "turn": int, "input_length": int, "output_length": int,
   "type": str, "hash_ids": [int, ...]}
"""

import json
import sys
from collections import defaultdict
from pathlib import Path

BLOCK_TOKEN_BUDGET = 24  # tokens per block, matching trace.py default


def convert(src: Path, dst: Path) -> None:
    """Convert a sibench ``audit.jsonl`` file into a trace JSONL file.

    Records are grouped by ``instance_id`` (in order of first appearance) and
    sorted by ``ts`` within each instance. Every record becomes one trace
    line; turn ``N`` of an instance has turn ``N - 1`` as its parent and the
    first turn has ``parent_chat_id`` of ``-1``.

    ``hash_ids`` is a growing list of synthetic block ids: each turn keeps the
    ids of the previous turn and appends fresh ones until the list holds
    ``input_length // BLOCK_TOKEN_BUDGET`` entries. The list never shrinks,
    even if a later prompt is shorter than an earlier one.

    Args:
        src: Path of the audit JSONL file to read. Blank lines are skipped.
        dst: Path of the trace JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``instance_id`` or ``ts``.
    """
    # Group records by instance_id, preserving file order within each instance.
    instances: dict[str, list[dict]] = defaultdict(list)
    with src.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            instances[rec["instance_id"]].append(rec)

    # Order each instance's turns chronologically (stable for equal timestamps).
    for turns in instances.values():
        turns.sort(key=lambda r: r["ts"])

    # Each instance gets a contiguous range of chat ids; the longest instance
    # determines the spacing. ``default=0`` keeps an empty input from raising.
    max_turns = max((len(turns) for turns in instances.values()), default=0)
    spacing = max_turns + 10  # extra headroom

    total_written = 0
    next_free_block = 0  # first block id not yet handed out to any instance
    with dst.open("w", encoding="utf-8") as out:
        for inst_idx, turns in enumerate(instances.values()):
            base_chat_id = (inst_idx + 1) * spacing  # start from spacing to avoid 0
            # Nominally each instance owns a 100_000-wide block-id namespace;
            # if an earlier instance overflowed its range, start after it so
            # ids from different instances can never collide.
            block_counter = max(inst_idx * 100_000, next_free_block)
            cumulative_hash_ids: list[int] = []

            for turn_idx, rec in enumerate(turns):
                # ``or`` guards against explicit JSON nulls as well as missing keys.
                audit = rec.get("audit") or {}
                input_length = audit.get("prompt_tokens") or 0
                output_length = audit.get("completion_tokens") or 0

                if input_length <= 0:
                    # Fallback: estimate ~4 characters per token from the messages.
                    total_chars = sum(
                        len(m.get("content") or "")
                        for m in rec.get("messages") or []
                    )
                    input_length = max(1, total_chars // 4)
                if output_length <= 0:
                    output_length = 128  # reasonable default

                chat_id = base_chat_id + turn_idx
                parent_chat_id = -1 if turn_idx == 0 else chat_id - 1

                # Grow the prefix to cover the whole (resolved) prompt. Working
                # from the total block count rather than per-turn deltas avoids
                # losing the sub-block remainder on every turn.
                target_blocks = input_length // BLOCK_TOKEN_BUDGET
                new_blocks = max(0, target_blocks - len(cumulative_hash_ids))
                cumulative_hash_ids.extend(
                    range(block_counter, block_counter + new_blocks)
                )
                block_counter += new_blocks

                trace_line = {
                    "chat_id": chat_id,
                    "parent_chat_id": parent_chat_id,
                    "timestamp": rec["ts"],
                    "turn": turn_idx,
                    "input_length": input_length,
                    "output_length": output_length,
                    "type": "chat",
                    "hash_ids": cumulative_hash_ids,
                }
                out.write(json.dumps(trace_line, separators=(",", ":")) + "\n")
                total_written += 1

            next_free_block = block_counter

    print(f"Converted {total_written} lines from {len(instances)} instances -> {dst}")


if __name__ == "__main__":
    # Script entry point: exactly two positional arguments are required,
    # the audit JSONL to read and the trace JSONL to write.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print(f"Usage: {sys.argv[0]} <input_audit.jsonl> <output_trace.jsonl>")
        sys.exit(1)
    audit_path, trace_path = (Path(arg) for arg in cli_args)
    convert(audit_path, trace_path)