# agentic-pd-hybrid/scripts/convert_inferact_to_trace.py

"""Convert Inferact codex_swebenchpro_traces (ShareGPT) to agentic-pd-hybrid trace JSONL.

Output schema (one JSON object per line, matching src/agentic_pd_hybrid/trace.py):
  chat_id, parent_chat_id, timestamp, input_length, output_length, type, turn, hash_ids

Each trial in the input becomes one session. Each (human, gpt) pair within a trial
becomes one turn. The prefix at turn N is the concatenation of all (human, gpt) pairs
from turns 0..N-1 plus the current human message — this mirrors how agentic coding
agents grow context across calls.

hash_ids are derived per 24-token block via sha256 of the block's comma-joined token ids
plus the previous block's hash, which gives stable, deterministic, prefix-shared hashes
across turns of the same session.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import sys
import time
from pathlib import Path

# Number of token ids grouped into one hashed block; each block yields one
# entry of a record's hash_ids (the final block of an input may be shorter).
BLOCK_TOKEN_BUDGET = 24


def _block_hash(text: str, prev_hash: int) -> int:
    """Chain ``text`` onto ``prev_hash`` and return a non-negative 63-bit hash.

    The UTF-8 bytes of ``text`` followed by ``prev_hash`` (encoded as an
    8-byte big-endian unsigned integer) are fed to SHA-256. The first 8 bytes
    of the digest are read back as a big-endian integer and the top bit is
    cleared so the result fits in a signed 64-bit value.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    hasher.update(prev_hash.to_bytes(8, "big"))
    leading = hasher.digest()[:8]
    # (1 << 63) - 1 keeps the low 63 bits, i.e. drops the sign bit.
    return int.from_bytes(leading, "big") & ((1 << 63) - 1)


def _build_hash_ids(token_ids: list[int]) -> list[int]:
    """Return one chained hash per ``BLOCK_TOKEN_BUDGET``-sized block of tokens.

    Each block is rendered as its token ids joined by commas and hashed
    together with the previous block's hash (0 for the first block), so two
    token sequences sharing a leading run of whole blocks share the
    corresponding leading hashes. A trailing partial block is hashed as well;
    an empty input yields an empty list.
    """
    hashes: list[int] = []
    running = 0
    offset = 0
    total = len(token_ids)
    while offset < total:
        chunk = token_ids[offset : offset + BLOCK_TOKEN_BUDGET]
        running = _block_hash(",".join(map(str, chunk)), running)
        hashes.append(running)
        offset += BLOCK_TOKEN_BUDGET
    return hashes


def _pair_turns(conv: list[dict]) -> list[tuple[str, str]]:
    """Pair consecutive (human, gpt) messages, skipping malformed entries.

    The conversation is scanned left to right. Whenever a dict with
    ``from == "human"`` is immediately followed by a dict with
    ``from == "gpt"``, their ``value`` fields form one pair and the scan jumps
    past both; otherwise the scan advances by a single entry, so stray or
    non-dict entries are dropped without discarding a later valid pair.

    Args:
        conv: Sequence of message entries (expected to be dicts with ``from``
            and ``value`` keys).

    Returns:
        A list of ``(human_text, gpt_text)`` tuples in conversation order.
        A missing or ``None`` value becomes the empty string; any other
        non-string value is converted with ``str()``.
    """

    def _text(message: dict) -> str:
        # An explicit JSON null must not be stringified to the literal "None",
        # which would otherwise be tokenized as if it were message content.
        value = message.get("value")
        return "" if value is None else str(value)

    pairs: list[tuple[str, str]] = []
    i = 0
    while i + 1 < len(conv):
        a, b = conv[i], conv[i + 1]
        if (
            isinstance(a, dict)
            and isinstance(b, dict)
            and a.get("from") == "human"
            and b.get("from") == "gpt"
        ):
            pairs.append((_text(a), _text(b)))
            i += 2
        else:
            i += 1
    return pairs


def convert(
    input_path: Path,
    output_path: Path,
    *,
    tokenizer_path: str,
    max_trials: int | None,
    inter_turn_gap_s: float,
    session_stagger_s: float,
    request_type: str,
) -> None:
    """Convert a ShareGPT-style trials file into trace JSONL.

    Every trial with at least one (human, gpt) pair becomes one session and
    every pair becomes one request record. The input of turn N is the text of
    all earlier turns plus the current human message; it is re-tokenized in
    full on every turn so token counts and block hashes reflect the whole
    prompt.

    Args:
        input_path: JSON file holding a list of trials, each with a
            ``conversations`` list.
        output_path: Destination JSONL file (overwritten).
        tokenizer_path: Path or hub id passed to
            ``AutoTokenizer.from_pretrained``.
        max_trials: If not ``None``, only the first ``max_trials`` trials are
            processed.
        inter_turn_gap_s: Timestamp increment between consecutive turns of a
            session.
        session_stagger_s: Timestamp offset between the starts of consecutive
            trials (multiplied by the trial index).
        request_type: Value written to each record's ``type`` field.

    Progress and a final summary are printed to stderr.
    """
    # Imported lazily so the module can be imported (e.g. for --help) without
    # paying for the transformers import.
    from transformers import AutoTokenizer

    print(f"loading tokenizer from {tokenizer_path}", file=sys.stderr)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

    print(f"loading {input_path}", file=sys.stderr)
    # Decode explicitly as UTF-8: JSON interchange files are UTF-8, and the
    # platform default encoding would mis-decode non-ASCII text on some
    # systems. The output file below is written as UTF-8 as well.
    data = json.loads(input_path.read_text(encoding="utf-8"))
    if max_trials is not None:
        data = data[:max_trials]
    print(f"{len(data)} trials to process", file=sys.stderr)

    # chat_ids are allocated sequentially across all sessions.
    next_chat_id = 1_000_000
    written = 0
    skipped_trials = 0
    t0 = time.time()

    with output_path.open("w", encoding="utf-8") as out_f:
        for trial_idx, trial in enumerate(data):
            conv = trial.get("conversations") or []
            turns = _pair_turns(conv)
            if not turns:
                skipped_trials += 1
                continue

            # Sessions start staggered by trial index; turns within a session
            # are spaced by a fixed gap.
            base_ts = trial_idx * session_stagger_s
            ts = base_ts
            # -1 marks the first request of a session (no parent).
            parent_chat_id = -1
            prefix_text = ""

            for turn_idx, (human, assistant) in enumerate(turns):
                # Input at this turn = full prior context + current human message.
                current_text = (
                    prefix_text + ("\n\n[USER]\n" if prefix_text else "[USER]\n") + human
                )
                input_ids = tokenizer.encode(current_text, add_special_tokens=False)
                input_length = len(input_ids)

                output_ids = tokenizer.encode(assistant, add_special_tokens=False)
                # Report at least one output token even for an empty reply.
                output_length = max(1, len(output_ids))

                hash_ids = _build_hash_ids(input_ids)

                chat_id = next_chat_id
                next_chat_id += 1
                record = {
                    "chat_id": chat_id,
                    "parent_chat_id": parent_chat_id,
                    "timestamp": round(ts, 6),
                    "input_length": input_length,
                    "output_length": output_length,
                    "type": request_type,
                    "turn": turn_idx,
                    "hash_ids": hash_ids,
                }
                out_f.write(json.dumps(record) + "\n")
                written += 1

                # The next turn chains to this one and sees this turn's reply
                # as part of its context.
                parent_chat_id = chat_id
                ts += inter_turn_gap_s
                prefix_text = current_text + "\n\n[ASSISTANT]\n" + assistant

            # Periodic progress line with a throughput-based ETA.
            if (trial_idx + 1) % 20 == 0:
                elapsed = time.time() - t0
                rate = (trial_idx + 1) / elapsed if elapsed > 0 else 0
                eta = (len(data) - trial_idx - 1) / rate if rate > 0 else 0
                print(
                    f"  trial {trial_idx + 1}/{len(data)} reqs={written} "
                    f"rate={rate:.1f} trial/s eta={eta:.0f}s",
                    file=sys.stderr,
                )

    elapsed = time.time() - t0
    print(
        f"done: wrote {written} requests across {len(data) - skipped_trials} sessions "
        f"({skipped_trials} trials skipped, empty conversations) in {elapsed:.1f}s "
        f"to {output_path}",
        file=sys.stderr,
    )


def main() -> None:
    """Parse command-line options and run the conversion.

    Creates the output file's parent directory if needed, then forwards the
    parsed options to ``convert``.
    """
    default_input = Path("third_party/codex_swebenchpro_traces/codex_swebenchpro.json")

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", type=Path, default=default_input)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument(
        "--tokenizer",
        default="/mnt/models/Qwen/Qwen3-30B-A3B-Instruct-2507",
        help="Path or HF id for the tokenizer. Default matches v2 sweep model.",
    )
    parser.add_argument(
        "--max-trials",
        type=int,
        default=None,
        help="Cap number of trials processed (useful for smoke / quick tests).",
    )
    parser.add_argument("--inter-turn-gap-s", type=float, default=2.5)
    parser.add_argument("--session-stagger-s", type=float, default=1.0)
    parser.add_argument("--request-type", default="chat")
    opts = parser.parse_args()

    destination = opts.output
    # Make sure the target directory exists before convert() opens the file.
    destination.parent.mkdir(parents=True, exist_ok=True)

    convert(
        opts.input,
        destination,
        tokenizer_path=opts.tokenizer,
        max_trials=opts.max_trials,
        inter_turn_gap_s=opts.inter_turn_gap_s,
        session_stagger_s=opts.session_stagger_s,
        request_type=opts.request_type,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()